These are my notes for using the Abseil libraries, particularly the hash maps. Actual usage are in tdriver.cc, below.
The shared libs are easier, only requiring few libs. Static, otherwise, require having the libraries in the correct order. But let that not deter you from using the static libs. Build with the shared libs first and run ldd on the binary. That will provide you a list of the Abseil libraries and the order to go by. I also look at ~/abseil/lib64/pkgconfig for clues.
// tdriver.cc update
// https://perlmonks.org/?node_id=11150974
// - based on https://www.perlmonks.org/?node_id=11150945
//
// Enhancements made on 2023-03-14:
// - Consistent floating precision to stderr by showing trailing zer
+os.
// - Added comments for building with abseil shared/static libraries
+.
// - Added phmap::flat_hash_map to the list, faster than absl::flat_
+hash_map.
// - Added ankerl::unordered_dense::map to the mix, faster than phma
+p::flat_hash_map.
// - Added inclusion of absl::Hash with phmap, also used by ankerl b
+ehind the scene.
// Note: phmap::parallel_flat_hash_map is the fastest of the bunch
+ running parallel.
// - Added duration time.
// Enhancements made on 2023-03-21:
// - Added MemoryMapped option (enable by changing #if 0 to #if 1; l
+ine 275)
//
// Building:
// - std::map, std::unordered_map, boost::unordered_map
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc
//
// - phmap::flat_hash_map, phmap::parallel_flat_hash_map
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel
+-hashmap
//
// - ankerl::unordered_dense::map
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordere
+d_dense/include
//
// Abseil shared libs: (uncomment MT_USE_ABSL_HASH_L below)
// - phmap with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel
+-hashmap -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -Wl,-rpath=$
+{HOME}/abseil/lib64 -labsl_hash
//
// - ankerl::unordered_dense::map with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordere
+d_dense/include -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -Wl,-
+rpath=${HOME}/abseil/lib64 -labsl_hash
//
// - absl::flat_hash_map or absl::node_hash_map with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I${HOME}/ab
+seil/include -L${HOME}/abseil/lib64 -Wl,-rpath=${HOME}/abseil/lib64 -
+labsl_hash -labsl_raw_hash_set -labsl_raw_logging_internal
//
// Abseil static libs: (uncomment MT_USE_ABSL_HASH_L below)
// - phmap with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel
+-hashmap -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -l:libabsl_h
+ash.a -l:libabsl_low_level_hash.a -l:libabsl_city.a
//
// - ankerl::unordered_dense::map with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordere
+d_dense/include -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -l:li
+babsl_hash.a -l:libabsl_low_level_hash.a -l:libabsl_city.a
//
// - absl::flat_hash_map or absl::node_hash_map with absl::Hash
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I${HOME}/ab
+seil/include -L${HOME}/abseil/lib64 -l:libabsl_hash.a -l:libabsl_low_
+level_hash.a -l:libabsl_raw_hash_set.a -l:libabsl_raw_logging_interna
+l.a -l:libabsl_city.a
#include <cstdio>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <string>
#include <array>
#include <vector>
#include <utility>
#include <iterator>
#include <execution>
#include <algorithm>
// #include <boost/container_hash/hash.hpp>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
// Test with std::map, std::unordered_map or phmap::parallel_flat_hash
+_map
// or boost::unorderedmap or absl::flat_hash_map
#define MT_STD_MAP_L 0
#define MT_STD_UNORDERED_MAP_L 1
#define MT_FLAT_HASH_MAP_L 2
#define MT_PARALLEL_FLAT_HASH_MAP_L 3
#define MT_BOOST_UNORDERED_MAP_L 4
#define MT_GOOGLE_HASH_MAP_L 5
#define MT_GOOGLE_NODE_MAP_L 6
#define MT_ANKERL_UNORDERED_DENSE_MAP_L 7
// Uncomment to use absl::Hash function object
// #define MT_USE_ABSL_HASH_L 1
#ifdef MT_USE_ABSL_HASH_L
#include <absl/hash/hash.h>
#endif
// Uncomment one of the map types below
// #define MAP_TYPE_L MT_STD_MAP_L
// #define MAP_TYPE_L MT_STD_UNORDERED_MAP_L
// #define MAP_TYPE_L MT_FLAT_HASH_MAP_L
#define MAP_TYPE_L MT_PARALLEL_FLAT_HASH_MAP_L
// #define MAP_TYPE_L MT_BOOST_UNORDERED_MAP_L
// #define MAP_TYPE_L MT_GOOGLE_HASH_MAP_L
// #define MAP_TYPE_L MT_GOOGLE_NODE_MAP_L
// #define MAP_TYPE_L MT_ANKERL_UNORDERED_DENSE_MAP_L
// -------------------------------------------------------------------
+---------
typedef int_fast64_t llil_int_type;
// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is o
+nly
// worth trying for MAX_STR_LEN_L up to about 22.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next lin
+e.
#define MAX_STR_LEN_L 6
#ifdef MAX_STR_LEN_L
struct str_type : std::array<char, MAX_STR_LEN_L> {
bool operator==( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0;
}
bool operator<( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0;
}
};
#else
struct str_type : std::basic_string<char> {
bool operator==( const str_type& o ) const {
return ::strcmp(this->data(), o.data()) == 0;
}
bool operator<( const str_type& o ) const {
return ::strcmp(this->data(), o.data()) < 0;
}
};
#endif
using str_int_type = std::pair<str_type, llil_int_type>;
using vec_str_int_type = std::vector<str_int_type>;
// inject specialization of std::hash or absl::Hash for str_type into
+namespace std
namespace std {
template<> struct hash<str_type> {
std::size_t operator()( str_type const& v ) const noexcept {
#if 0
return boost::hash_range( v.cbegin(), v.cend() );
#else
std::basic_string_view<char> bv {
reinterpret_cast<const char*>(v.data()), v.size() * sizeof
+(char) };
#ifdef MT_USE_ABSL_HASH_L
return absl::Hash<std::basic_string_view<char>>()(bv);
#else
return std::hash<std::basic_string_view<char>>()(bv);
#endif
#endif
}
};
}
#if MAP_TYPE_L == MT_STD_MAP_L
// https://en.cppreference.com/w/cpp/container/map
#include <map>
using map_str_int_type = std::map<str_type, llil_int_type>;
#elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L
// https://en.cppreference.com/w/cpp/container/unordered_map
#include <unordered_map>
using map_str_int_type = std::unordered_map<str_type, llil_int_type
+>;
#elif MAP_TYPE_L == MT_FLAT_HASH_MAP_L
// https://github.com/greg7mdp/parallel-hashmap
// If you prefer the default hash to be the Abseil hash framework,
// include the Abseil headers before `phmap.h` and define the
// preprocessor macro `PHMAP_USE_ABSL_HASH`.
// - refer to the README and parallel_hashmap/phmap_fwd_decl.h
#ifdef MT_USE_ABSL_HASH_L
#define PHMAP_USE_ABSL_HASH 1
#endif
#include <parallel_hashmap/phmap.h>
using map_str_int_type = phmap::flat_hash_map<str_type, llil_int_ty
+pe>;
#elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L
// https://github.com/greg7mdp/parallel-hashmap
// If you prefer the default hash to be the Abseil hash framework,
// include the Abseil headers before `phmap.h` and define the
// preprocessor macro `PHMAP_USE_ABSL_HASH`.
// - refer to the README and parallel_hashmap/phmap_fwd_decl.h
#ifdef MT_USE_ABSL_HASH_L
#define PHMAP_USE_ABSL_HASH 1
#endif
#include <parallel_hashmap/phmap.h>
// create the parallel_flat_hash_map without internal mutexes
using map_str_int_type = phmap::parallel_flat_hash_map<
str_type, llil_int_type,
phmap::priv::hash_default_hash<str_type>,
phmap::priv::hash_default_eq<str_type>,
phmap::priv::Allocator<phmap::priv::Pair<const str_type, llil_in
+t_type>>,
8, phmap::NullMutex
>;
#elif MAP_TYPE_L == MT_BOOST_UNORDERED_MAP_L
// https://www.boost.org/doc/libs/1_80_0/libs/unordered/doc/html/un
+ordered.html
#include <boost/unordered_map.hpp>
using map_str_int_type = boost::unordered_map<str_type, llil_int_ty
+pe>;
#elif MAP_TYPE_L == MT_GOOGLE_HASH_MAP_L
// https://github.com/abseil/abseil-cpp
// By default, `flat_hash_map` uses the `absl::Hash` hashing framew
+ork.
// - refer to absl/container/internal/hash_function_defaults.h
#ifndef MT_USE_ABSL_HASH_L
#define MT_USE_ABSL_HASH_L 1
#endif
#include <absl/container/flat_hash_map.h>
using map_str_int_type = absl::flat_hash_map<str_type, llil_int_typ
+e>;
#elif MAP_TYPE_L == MT_GOOGLE_NODE_MAP_L
// https://github.com/abseil/abseil-cpp
// By default, `node_hash_map` uses the `absl::Hash` hashing framew
+ork.
// - refer to absl/container/internal/hash_function_defaults.h
#ifndef MT_USE_ABSL_HASH_L
#define MT_USE_ABSL_HASH_L 1
#endif
#include <absl/container/node_hash_map.h>
using map_str_int_type = absl::node_hash_map<str_type, llil_int_typ
+e>;
#elif MAP_TYPE_L == MT_ANKERL_UNORDERED_DENSE_MAP_L
// https://github.com/martinus/unordered_dense
#include <ankerl/unordered_dense.h>
using map_str_int_type = ankerl::unordered_dense::map<str_type, lli
+l_int_type>;
#else
#error "Unsupported map_str_int_type"
#endif
// Simple RAII timer -------------------------------------------------
+----------
// Create a MyTimer object in a scope:
// {
// MyTimer tt;
// ...
// }
// to automatically print the time taken in the block to stderr
#include <chrono>
inline double elaspe_time(
std::chrono::high_resolution_clock::time_point cend,
std::chrono::high_resolution_clock::time_point cstart)
{
return double( std::chrono::duration_cast<std::chrono::milliseconds
+>(cend - cstart).count() ) * 1e-3;
}
class MyTimer {
public:
MyTimer() { stnow_m = std::chrono::high_resolution_clock::now(); }
~MyTimer() {
auto endnow = std::chrono::high_resolution_clock::now();
std::cerr << " (" << elaspe_time(endnow, stnow_m) << " seconds)\
+n";
}
private:
std::chrono::time_point<std::chrono::high_resolution_clock> stnow_m
+;
};
// -------------------------------------------------------------------
+--
// #include "get_properties.inl"
// get_properties.inl
// Note: str_type, llil_int_type and map_str_int_type are not defined
+here.
// llil_int_type is normally defined as: typedef int_fast64_t llil_int
+_type;
// Note: int_fast64_t is defined in <cstdint>
// map_str_int_type is keyed by str_type with value of llil_int_type.
// These three types must be defined by the code that includes this .i
+nl file.
// map_str_int_type can be many different types, std::map, std::unorde
+red_map, ...
// so long as all operations on map_upd below are supported.
#include <cstdint>
#include <cstdio>
#include <array>
#include <algorithm>
#if 0
// Obtain the Portable Memory Mapping C++ Class.
// git clone --depth=1 https://github.com/stbrumme/portable-memory-
+mapping
//
// Copy two files to you work dir: *.cpp and *.h
// cp https://github.com/stbrumme/portable-memory-mapping/MemoryMap
+ped.* .
//
// Compile on Linux
// clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc MemoryMapped.
+cpp ...
inline llil_int_type fast_atoll64( const char* str, uint64_t* pos )
{
llil_int_type val = 0;
// int sign = 0;
// if ( *str == '-' ) {
// sign = 1, ++str, ++*pos;
// }
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9)
val = val * 10 + digit, ++*pos;
// return sign ? -val : val;
return val;
}
#include "MemoryMapped.h"
// Update map_upd with the properties found in file fname
// Return the number of properties in file fname (or -1 if fname could
+ not be opened)
static llil_int_type get_properties(
const char* fname, // in : the input file name
map_str_int_type& map_upd // inout: the map to be updated
)
{
// Map file to memory.
MemoryMapped data(fname, MemoryMapped::WholeFile, MemoryMapped::Seq
+uentialScan);
if (!data.isValid()) return -1;
// Get raw pointer to mapped memory.
char* buffer = (char*) data.getData();
uint64_t filesize = data.size();
// Process mapped file.
uint64_t length, strpos = 0;
llil_int_type count;
llil_int_type nprop = 0;
for (uint64_t pos = 0; pos < filesize; pos++) {
if (buffer[pos] == '\t') {
++nprop;
length = pos - strpos;
count = fast_atoll64( &buffer[pos + 1], &pos );
#ifdef MAX_STR_LEN_L
str_type fixword {}; // {} initializes all elements of fixwo
+rd to '\0'
( length <= MAX_STR_LEN_L )
? ::memcpy( fixword.data(), &buffer[strpos], length )
: ::memcpy( fixword.data(), &buffer[strpos], MAX_STR_LEN_L
+ );
const auto [it, success] = map_upd.emplace(fixword, count);
#else
const auto [it, success] = map_upd.emplace(
str_type(&buffer[strpos], length),
count
);
#endif
if ( !success ) it->second += count;
strpos = pos = ( buffer[pos + 1] == '\r' ) ? pos + 3 : pos +
+2;
}
}
data.close();
return nprop;
}
#else
inline llil_int_type fast_atoll64( const char* str )
{
llil_int_type val = 0;
// int sign = 0;
// if ( *str == '-' ) {
// sign = 1, ++str;
// }
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit
+;
// return sign ? -val : val;
return val;
}
// Limit line length and use ANSI C functions to try to boost performa
+nce
#define MAX_LINE_LEN_L 255
// Update map_upd with the properties found in file fname
// Return the number of properties in file fname (or -1 if fname could
+ not be opened)
static llil_int_type get_properties(
const char* fname, // in : the input file name
map_str_int_type& map_upd // inout: the map to be updated
)
{
std::array<char, MAX_LINE_LEN_L + 1> line;
char* found;
llil_int_type count;
llil_int_type nprop = 0;
FILE* fh = ::fopen(fname, "r");
if ( fh == NULL ) return -1;
while ( ::fgets( line.data(), static_cast<int>(MAX_LINE_LEN_L), fh
+) != NULL ) {
++nprop;
found = std::find( line.begin(), line.end(), '\t' );
count = fast_atoll64( found + 1 );
// Note: using emplace() is faster than map_upd[fixword] += coun
+t;
#ifdef MAX_STR_LEN_L
str_type fixword {}; // {} initializes all elements of fixword
+to '\0'
std::copy( line.begin(), found, fixword.begin() );
const auto [it, success] = map_upd.emplace( fixword, count );
#else
*found = '\0';
const auto [it, success] = map_upd.emplace(str_type{ line.data()
+ }, count);
#endif
if (!success) it->second += count;
}
::fclose(fh);
return nprop;
}
#endif
// -------------------------------------------------------------------
+--
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: tdriver file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fix
+ed);
#ifdef MAX_STR_LEN_L
std::cerr << "tdriver (fixed string length=" << MAX_STR_LEN_L << ")
+ start\n";
#else
std::cerr << "tdriver start\n";
#endif
#if MAP_TYPE_L == MT_STD_MAP_L
std::cerr << "use std::map\n";
#elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L
std::cerr << "use std::unordered_map\n";
#elif MAP_TYPE_L == MT_FLAT_HASH_MAP_L
std::cerr << "use phmap::flat_hash_map\n";
#elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L
std::cerr << "use phmap::parallel_flat_hash_map\n";
#elif MAP_TYPE_L == MT_BOOST_UNORDERED_MAP_L
std::cerr << "use boost::unordered_map\n";
#elif MAP_TYPE_L == MT_GOOGLE_HASH_MAP_L
std::cerr << "use absl::flat_hash_map\n";
#elif MAP_TYPE_L == MT_GOOGLE_NODE_MAP_L
std::cerr << "use absl::node_hash_map\n";
#elif MAP_TYPE_L == MT_ANKERL_UNORDERED_DENSE_MAP_L
std::cerr << "use ankerl::unordered_dense::map\n";
#else
#error "Unsupported map_str_int_type"
#endif
#ifdef MT_USE_ABSL_HASH_L
std::cerr << "use absl::Hash function object in ctor or string view
+\n";
#endif
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
MyTimer tt_main;
map_str_int_type mymap;
for ( int i = 0; i < nfiles; ++i ) {
MyTimer tt;
llil_int_type nlines = get_properties(fname[i], mymap);
std::cerr << fname[i] << ": nlines=" << nlines;
}
// Store the properties into a vector
vec_str_int_type propvec( mymap.begin(), mymap.end() );
mymap.clear();
// Sort the vector by (count) in reverse order, (name) in lexical o
+rder
std::sort(
// Standard sort
propvec.begin(), propvec.end(),
[](const str_int_type& left, const str_int_type& right) {
return left.second != right.second
? left.second > right.second
: left.first < right.first;
}
);
// Output the sorted vector
#ifdef MAX_STR_LEN_L
for ( auto const& n : propvec )
::printf("%.*s\t%ld\n", MAX_STR_LEN_L, n.first.data(), n.second)
+;
#else
for ( auto const& n : propvec )
std::cout << n.first << "\t" << n.second << "\n";
#endif
std::cerr << "duration:";
return 0;
}
One can use taskset on Linux for repeatable timings; i.e. taskset -c 1-2 ./tdriver ...
Notice how phmp parallel-flat-hash-map with absl::Hash is nearly as fast as Abseil flat-hash-map. Even faster, is phmap flat-hash-map. Ankerl unordered-dense-map is the fastest of the bunch running serially. Noteworthy, is that nothing comes close to phmp parallel-flat-hash-map if the application involves many threads.
The Abseil hash maps default to using absl::Hash, behind the scene. Look at Ankerl go with absl::Hash. It benefits from injecting specialization of absl::Hash for str_type into namespace std.