in reply to Re^3: Rosetta Code: Long List is Long (faster - vec - openmp) in thread Rosetta Code: Long List is Long
I tried the following for the interim version of llil3m.cpp.
The overhead for running parallel is greater than I hoped for using std::map. Then, I found parallel hashmap. From the link, "By using parallel SSE2 instructions, these hashmaps are able to look up items by checking 16 slots in parallel."
Running:
$ time ./llil2grt big1.txt big2.txt big3.txt >out.txt
llil2grt start
get_properties CPU time : 3.55473 secs
emplace set sort CPU time : 0.998423 secs
write stdout CPU time : 0.854453 secs
total CPU time : 5.40764 secs
total wall clock time : 5 secs
real 0m5.973s
user 0m5.697s
sys 0m0.292s
$ NUM_THREADS=1 ./llil4map-no6-uno big1.txt big2.txt big3.txt >out.txt
llil4map start
use std::unordered_map
don't use OpenMP
use boost sort
get properties 3.368 secs
finish merging 1.202 secs
vector stable sort 0.898 secs
write stdout 0.261 secs
total time 5.730 secs
$ NUM_THREADS=1 ./llil4map-no6-std big1.txt big2.txt big3.txt >out.txt
llil4map start
use std::map
don't use OpenMP
use boost sort
get properties 3.328 secs
finish merging 0.657 secs
vector stable sort 0.816 secs
write stdout 0.268 secs
total time 5.069 secs
$ NUM_THREADS=1 ./llil4map-no6-flat big1.txt big2.txt big3.txt >out.tx
+t
llil4map start
use phmap::parallel_flat_hash_map
don't use OpenMP
use boost sort
get properties 1.565 secs
map to vector 0.200 secs
vector stable sort 0.665 secs
write stdout 0.220 secs
total time 2.652 secs
llil4map.cc
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
// llil4map.cc (new chunking variant)
// https://www.perlmonks.com/?node_id=11149643
// A phmap::parallel_flat_hash_map demonstration.
// By Mario Roy, April 7, 2024
// Based on llil3m.cpp https://perlmonks.com/?node_id=11149482
// Original challenge https://perlmonks.com/?node_id=11147822
// and summary https://perlmonks.com/?node_id=11150293
// Other demonstrations https://perlmonks.com/?node_id=11149907
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
// OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book
//
// Obtain the parallel hashmap library (required dependency):
// git clone --depth=1 https://github.com/greg7mdp/parallel-hashmap
//
// Compile on Linux (clang++ or g++):
// clang++ -o llil4map -std=c++20 -fopenmp -Wall -O3 llil4map.cc -I
+./parallel-hashmap
//
// On macOS, use g++-12 from https://brew.sh (installation: brew insta
+ll gcc@12).
// The g++ command also works with mingw C++ compiler (https://sourcef
+orge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex
+e).
//
// Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/
+?node_id=11148681
// perl gen-llil.pl big1.txt 200 3 1
// perl gen-llil.pl big2.txt 200 3 1
// perl gen-llil.pl big3.txt 200 3 1
// perl gen-long-llil.pl long1.txt 600
// perl gen-long-llil.pl long2.txt 600
// perl gen-long-llil.pl long3.txt 600
//
// To make random input, obtain shuffle.pl from https://perlmonks.com/
+?node_id=11149800
// perl shuffle.pl big1.txt >tmp && mv tmp big1.txt
// perl shuffle.pl big2.txt >tmp && mv tmp big2.txt
// perl shuffle.pl big3.txt >tmp && mv tmp big3.txt
//
// Example run: llil4map big1.txt big2.txt big3.txt >out.txt
// NUM_THREADS=3 llil4map ...
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
#include <cassert>
#include <cstdio>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <compare>
#include <chrono>
#include <string>
#include <string_view>
#include <array>
#include <vector>
#include <thread>
#include <execution>
#include <atomic>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <parallel_hashmap/phmap.h>
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne
+ed a 64-bit compile");
// Specify 0/1 to use boost's parallel sorting algorithm; faster than
+__gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/paral
+lel.html
// This requires the boost header files: e.g. devpkg-boost bundle on C
+lear Linux.
// Note: Another option is downloading and unpacking Boost locally.
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1
#if USE_BOOST_PARALLEL_SORT
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wshadow"
#include <boost/sort/sort.hpp>
#pragma clang diagnostic pop
#else
#include <boost/sort/sort.hpp>
#endif
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
class spinlock_mutex {
// https://rigtorp.se/spinlock/
// https://vorbrodt.blog/2019/02/12/fast-mutex/
public:
// Assignment is disabled.
spinlock_mutex& operator=(const spinlock_mutex& rhs) = delete;
void lock() noexcept {
for (;;) {
if (!lock_.exchange(true, std::memory_order_acquire))
break;
while (lock_.load(std::memory_order_relaxed))
__builtin_ia32_pause();
}
}
void unlock() noexcept {
lock_.store(false, std::memory_order_release);
}
private:
alignas(4 * sizeof(std::max_align_t)) std::atomic_bool lock_ = fals
+e;
};
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
typedef uint32_t int_type;
// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is o
+nly
// worth trying for MAX_STR_LEN_L up to about 30.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next lin
+e.
#define MAX_STR_LEN_L (size_t) 12
#ifdef MAX_STR_LEN_L
struct str_type : std::array<char, MAX_STR_LEN_L> {
bool operator==( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0;
}
bool operator<( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0;
}
};
// inject specialization of std::hash for str_type into namespace std
namespace std {
template<> struct hash<str_type> {
std::size_t operator()( str_type const& v ) const noexcept {
std::basic_string_view<char> bv {
reinterpret_cast<const char*>(v.data()), v.size() * sizeof
+(char) };
return std::hash<std::basic_string_view<char>>()(bv);
}
};
}
#else
using str_type = std::basic_string<char>;
#endif
using str_int_type = std::pair<str_type, int_type>;
using vec_str_int_type = std::vector<str_int_type>;
// declare the type of the parallel_flat_hash_map with spinlock mutexe
+s
using map_str_int_type = phmap::parallel_flat_hash_map<
str_type, int_type,
phmap::priv::hash_default_hash<str_type>,
phmap::priv::hash_default_eq<str_type>,
phmap::priv::Allocator<phmap::priv::Pair<const str_type, int_type>>
+,
12,
spinlock_mutex
>;
// Mimic the Perl get_properties subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
// fast_atoll64
// https://stackoverflow.com/questions/16826422/
// c-most-efficient-way-to-convert-string-to-int-faster-than-atoi
// convert positive number from string to uint32_t
inline uint32_t fast_atoll64(const char* str)
{
uint32_t val = 0;
// int sign = 0;
// if (*str == '-') {
// sign = 1, ++str;
// }
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9)
val = val * 10 + digit;
// return sign ? -val : val;
return val;
}
// Helper function to find a character.
inline char* find_char(char* first, char* last, char c)
{
while (first != last) {
if (*first == c) break;
++first;
}
return first;
}
// Limit chunk size and line length.
inline constexpr size_t CHUNK_SIZE = 32768;
inline constexpr size_t MAX_LINE_LEN = 255;
static int64_t get_properties(
const char* fname, // in : the input file name
const int nthds, // in : the number of threads
map_str_int_type& map_ret) // inout : the map to be updated
{
int64_t num_lines = 0;
std::ifstream fin(fname, std::ifstream::binary);
if (!fin.is_open()) {
std::cerr << "Error opening '" << fname << "' : " << strerror(er
+rno) << '\n';
return num_lines;
}
#pragma omp parallel reduction(+:num_lines)
{
std::string buf; buf.resize(CHUNK_SIZE + MAX_LINE_LEN + 1, '\0')
+;
char *first, *last, *found;
size_t len, klen;
int_type count;
while (fin.good()) {
len = 0;
// Read the next chunk serially.
#pragma omp critical
{
fin.read(&buf[0], CHUNK_SIZE);
if ((len = fin.gcount()) > 0) {
if (buf[len - 1] != '\n' && fin.getline(&buf[len], MAX_
+LINE_LEN)) {
// Getline discards the newline char and appends nul
+l char.
// Therefore, change '\0' to '\n'.
len += fin.gcount();
buf[len - 1] = '\n';
}
}
}
if (!len) break;
buf[len] = '\0';
first = &buf[0];
last = &buf[len];
// Process max Nthreads chunks concurrently.
while (first < last) {
char* beg_ptr{first}; first = find_char(first, last, '\n
+');
char* end_ptr{first}; ++first, ++num_lines;
if ((found = find_char(beg_ptr, end_ptr, '\t')) == end_ptr
+) continue;
count = fast_atoll64(found + 1);
#ifdef MAX_STR_LEN_L
str_type s {}; // {} initializes all elements of s to '\0
+'
klen = std::min(MAX_STR_LEN_L, (size_t)(found - beg_ptr));
::memcpy(s.data(), beg_ptr, klen);
#else
klen = found - beg_ptr;
str_type s(beg_ptr, klen);
#endif
// Use lazy_emplace to modify the map while the mutex is l
+ocked.
map_ret.lazy_emplace_l(
s,
[&](map_str_int_type::value_type& p) {
// called only when key was already present
p.second += count;
},
[&](const map_str_int_type::constructor& ctor) {
// construct value_type in place when key not presen
+t
ctor(std::move(s), count);
}
);
}
}
}
fin.close();
return num_lines;
}
// Output subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
size_t divide_up(size_t dividend, size_t divisor)
{
if (dividend % divisor)
return (size_t)(dividend / divisor) + 1;
else
return (size_t)(dividend / divisor);
}
static void out_properties(
const int nthds, // in : the number of threads
vec_str_int_type& vec) // in : the vector to output
{
size_t num_chunks = divide_up(vec.size(), CHUNK_SIZE);
int nthds_out = 1;
#ifdef _OPENMP
nthds_out = std::min(nthds, 32);
#endif
#pragma omp parallel for ordered schedule(static, 1) num_threads(nt
+hds_out)
for (size_t chunk_id = 1; chunk_id <= num_chunks; ++chunk_id) {
std::string str(""); str.reserve(2048 * 1024);
auto it = vec.begin() + (chunk_id - 1) * CHUNK_SIZE;
auto it2 = vec.begin() + std::min(vec.size(), chunk_id * CHUNK_S
+IZE);
for (; it != it2; ++it) {
#ifdef MAX_STR_LEN_L
str.append(it->first.data());
#else
str.append(it->first.data(), it->first.size());
#endif
str.append("\t", 1);
str.append(std::to_string(it->second));
str.append("\n", 1);
}
#pragma omp ordered
std::cout << str << std::flush;
}
}
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;
double elaspe_time(
time_point cend,
time_point cstart)
{
return double (
std::chrono::duration_cast<milliseconds>(cend - cstart).count()
) * 1e-3;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~
int main(int argc, char* argv[])
{
if (argc < 2) {
if (argc > 0)
std::cerr << "usage: llil4map file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fix
+ed);
#ifdef MAX_STR_LEN_L
std::cerr << "llil4map (fixed string length=" << MAX_STR_LEN_L << "
+) start\n";
#else
std::cerr << "llil4map start\n";
#endif
#ifdef _OPENMP
std::cerr << "use OpenMP\n";
#else
std::cerr << "don't use OpenMP\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
std::cerr << "don't use boost sort\n";
#else
std::cerr << "use boost sort\n";
#endif
time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3;
cstart1 = high_resolution_clock::now();
#ifdef _OPENMP
// Determine the number of threads.
const char* env_nthds = std::getenv("NUM_THREADS");
int nthds = ( env_nthds && strlen(env_nthds) )
? ::atoi(env_nthds)
: std::thread::hardware_concurrency();
omp_set_dynamic(false);
omp_set_num_threads(nthds);
omp_set_max_active_levels(1);
int nthds_move = std::min(nthds, 12);
#else
int nthds = 1;
int nthds_move = 1;
#endif
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
// Store the properties into a vector
vec_str_int_type propvec;
int64_t num_lines = 0;
{
// Enclose the map inside a block, so GC releases the object
// immediately after exiting the scope.
map_str_int_type map;
for (int i = 0; i < nfiles; ++i)
num_lines += get_properties(fname[i], nthds, map);
cend1 = high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "get properties " << std::setw(8) << ctaken1 <
+< " secs\n";
if (map.size() == 0) {
std::cerr << "No work, exiting...\n";
return 1;
}
cstart2 = high_resolution_clock::now();
if (nthds == 1) {
propvec.reserve(map.size());
for (auto const& x : map)
propvec.emplace_back(std::move(const_cast<str_type&>(x.fir
+st)), x.second);
map.clear();
}
else {
propvec.resize(map.size());
std::vector<vec_str_int_type::iterator> I(map.subcnt());
auto cur = propvec.begin();
for (size_t i = 0; i < map.subcnt(); ++i) {
map.with_submap(i, [&](const map_str_int_type::EmbeddedSet
+& set) {
I[i] = cur;
cur += set.size();
});
}
#pragma omp parallel for schedule(static, 1) num_threads(nthd
+s_move)
for (size_t i = 0; i < map.subcnt(); ++i) {
map.with_submap_m(i, [&](map_str_int_type::EmbeddedSet& se
+t) {
auto it = I[i];
for (auto& x : set)
*it++ = std::make_pair(std::move(const_cast<str_type
+&>(x.first)), x.second);
// reset the set (no longer needed) to reclaim memory e
+arly
// https://github.com/greg7mdp/parallel-hashmap/issues/
+238
set = map_str_int_type::EmbeddedSet();
});
}
}
cend2 = high_resolution_clock::now();
double ctaken2 = elaspe_time(cend2, cstart2);
std::cerr << "map to vector " << std::setw(8) << ctaken2 <
+< " secs\n";
}
cstart3 = high_resolution_clock::now();
// Sort the vector by (count) in reverse order, (name) in lexical o
+rder
auto reverse_order = [](const str_int_type& left, const str_int_typ
+e& right) {
return left.second != right.second
? left.second > right.second
: left.first < right.first;
};
#if USE_BOOST_PARALLEL_SORT == 0
// Standard sort
std::sort(propvec.begin(), propvec.end(), reverse_order);
#else
// Parallel sort
boost::sort::block_indirect_sort(
propvec.begin(), propvec.end(), reverse_order,
#ifdef __NVCOMPILER_LLVM__
std::min(nthds, 32)
#else
nthds
#endif
);
#endif
cend3s = high_resolution_clock::now();
// Output the sorted vector
out_properties(nthds, propvec);
cend3 = high_resolution_clock::now();
double ctaken = elaspe_time(cend3, cstart1);
double ctaken3s = elaspe_time(cend3s, cstart3);
double ctaken3o = elaspe_time(cend3, cend3s);
std::cerr << "vector stable sort " << std::setw(8) << ctaken3s <<
+" secs\n";
std::cerr << "write stdout " << std::setw(8) << ctaken3o <<
+" secs\n";
std::cerr << "total time " << std::setw(8) << ctaken <<
+" secs\n";
std::cerr << " count lines " << num_lines << "\n";
std::cerr << " count unique " << propvec.size() << "\n";
// Hack to see Private Bytes in Windows Task Manager
// (uncomment next line so process doesn't exit too quickly)
// std::this_thread::sleep_for(milliseconds(9000));
return 0;
}
Re^5: Rosetta Code: Long List is Long (faster - llil4map - OpenMP code)
by eyepopslikeamosquito (Archbishop) on Feb 22, 2023 at 10:10 UTC
|
Nice work Mario!
As you might expect, I couldn't restrain myself from playing around with a few minor changes:
- I gave boost::hash_range a try because there's less syntactic claptrap around the specialization of std::hash -- I've #if 0'ed it out though because it seems to be a touch slower than good old std::hash.
- Generally, I pull a face whenever I see a function updating a non-const global variable, so I changed the get_properties interface to pass map_upd as a function parameter.
- My usual simplification of parsing in get_properties by assuming the input file/s match the llil spec ^[a-z]+\t\d+$ (and without any long names when using MAX_STR_LEN :).
// llil4map.cpp
// https://perlmonks.com/?node_id=11149643
// OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book
//
// C++ version inspired by Mario's famous dualvar two-sort Perl soluti
+on.
// based on llil3m.cpp https://perlmonks.com/?node_id=11149482
// 1. Run get_properties in parallel.
// 2. Capture time and diff via chrono.
// 3. Threads: Flush local vector periodically; speed-up for std::m
+ap.
// 4. Key words null-terminated for MAX_STR_LEN_L.
// 5. Concat strings using fast_io::concatln during output.
// 6. Support NUM_THREADS environment variable.
// 7. Added define for using boost's parallel sort.
// 8. Removed parallel get_properties, not helpful due to overhead.
// 9. Removed MAX_STR_LEN_L for simply std::map.
// A. Replaced atoll with fast_atoll64.
// B. Fast vector sorting - from llil5vec-tbb.
// C. Re-enabled parallel get_properties.
// D. Exit early if no work; fast_io tweak writing to output;
// limit to 8 threads max for sorting.
// E. Switch to parallel hashmap; using SSE2 instructions.
// F. Support std::map, std::unordered_map, or parallel-hashmap via
+ define.
// G. Refactored OpenMP code to merge immediately.
// H. Improved get_properties; set precision for timings.
// I. Support fixed-length key words using basic_static_string.
// J. Remove support for std::map and std::unordered_map.
// K. Opt-in to use (limited length) fixed length strings.
// Uncomment line #define MAX_STR_LEN_L and update value.
// L. Improved the fixed length path with custom default_hash/eq fu
+nctions.
// M. Define str_type struct as std::array, overriding == and < ope
+rators.
//
// Obtain the fast_io library (required dependency):
// git clone --depth=1 https://github.com/cppfastio/fast_io
//
// Obtain the parallel hashmap library (required dependency):
// git clone --depth=1 https://github.com/greg7mdp/parallel-hashmap
//
// g++ compile on Linux: (boost header may need the -Wno-stringop-over
+flow gcc option)
// g++ -o llil4map -std=c++20 -Wall -O3 llil4map.cpp -I ./fast_io/i
+nclude -I ./parallel-hashmap
// g++ -o llil4map-omp -std=c++20 -fopenmp -Wall -O3 llil4map.cpp -
+I ./fast_io/include -I ./parallel-hashmap
//
// This g++ command also works with mingw C++ compiler (https://source
+forge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex
+e).
//
// clang++ compile: same args, without the -Wno-stringop-overflow opti
+on
// Seems to run slightly faster when compiled with clang++ instead of
+g++
//
// Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/
+?node_id=11148681
// perl gen-llil.pl big1.txt 200 3 1
// perl gen-llil.pl big2.txt 200 3 1
// perl gen-llil.pl big3.txt 200 3 1
// perl gen-long-llil.pl long1.txt 600
// perl gen-long-llil.pl long2.txt 600
// perl gen-long-llil.pl long3.txt 600
//
// To make random input, obtain shuffle.pl from https://perlmonks.com/
+?node_id=11149800
// perl shuffle.pl big1.txt >tmp && mv tmp big1.txt
// perl shuffle.pl big2.txt >tmp && mv tmp big2.txt
// perl shuffle.pl big3.txt >tmp && mv tmp big3.txt
//
// Example run: llil4map big1.txt big2.txt big3.txt >out.txt
// NUM_THREADS=3 llil4map-omp ...
// Note: Lesser NUM_THREADS is preferred, else merge time incr
+eases
// Start with MIN( CPU cores, 1 + log2( number of input
+files ) )
// -------------------------------------------------------------------
+---------
// Specify 0/1 to use boost's parallel sorting algorithm; faster than
+__gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/paral
+lel.html
// This requires the boost header files: e.g. devpkg-boost bundle on C
+lear Linux.
// Note: Another option is downloading and unpacking Boost locally.
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1
#include <chrono>
#include <thread>
// The fast_io header must come after chrono, else build error:
// "no member named 'concatln' in namespace 'fast_io'"
#include <fast_io.h>
#include <parallel_hashmap/phmap.h>
#include <cstdio>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <string>
#include <array>
#include <vector>
#include <utility>
#include <iterator>
#include <execution>
#include <algorithm>
#include <boost/container_hash/hash.hpp>
#if USE_BOOST_PARALLEL_SORT > 0
#include <boost/sort/sort.hpp>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne
+ed a 64-bit compile");
// -------------------------------------------------------------------
+---------
typedef long long llil_int_type;
// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is o
+nly
// worth trying for MAX_STR_LEN_L up to about 22.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next lin
+e.
#define MAX_STR_LEN_L 6
#ifdef MAX_STR_LEN_L
struct str_type : std::array<char, MAX_STR_LEN_L> {
bool operator==( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0;
}
bool operator<( const str_type& o ) const {
return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0;
}
};
#else
struct str_type : std::basic_string<char> {
bool operator==( const str_type& o ) const {
return ::strcmp(this->data(), o.data()) == 0;
}
bool operator<( const str_type& o ) const {
return ::strcmp(this->data(), o.data()) < 0;
}
};
#endif
using str_int_type = std::pair<str_type, llil_int_type>;
using vec_str_int_type = std::vector<str_int_type>;
// inject specialization of std::hash for str_type into namespace std
namespace std {
template<> struct hash<str_type> {
std::size_t operator()( str_type const& v ) const noexcept {
#if 0
return boost::hash_range( v.cbegin(), v.cend() );
#else
std::basic_string_view<char> bv {
reinterpret_cast<const char*>(v.data()), v.size() * sizeof
+(char) };
return std::hash<std::basic_string_view<char>>()(bv);
#endif
}
};
}
// create the parallel_flat_hash_map without internal mutexes
using map_str_int_type = phmap::parallel_flat_hash_map<
str_type, llil_int_type,
phmap::priv::hash_default_hash<str_type>,
phmap::priv::hash_default_eq<str_type>,
phmap::priv::Allocator<phmap::priv::Pair<const str_type, llil_int_t
+ype>>,
8, phmap::NullMutex
>;
// fast_atoll64
// https://stackoverflow.com/questions/16826422/
// c-most-efficient-way-to-convert-string-to-int-faster-than-atoi
inline int64_t fast_atoll64( const char* str )
{
int64_t val = 0;
// int sign = 0;
// if ( *str == '-' ) {
// sign = 1, ++str;
// }
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit
+;
// return sign ? -val : val;
return val;
}
// Mimic the Perl get_properties subroutine --------------------------
+--
// Limit line length and use ANSI C functions to try to boost performa
+nce
#define MAX_LINE_LEN_L 255
#define MAX_THREADS 32
static void get_properties(
const char* fname, // in : the input file name
map_str_int_type& map_upd // inout: the map to be updated
)
{
FILE* fh;
std::array<char, MAX_LINE_LEN_L + 1> line;
char* found;
llil_int_type count;
fh = ::fopen(fname, "r");
if ( fh == NULL ) {
std::cerr << "Error opening '" << fname << "' : " << strerror(er
+rno) << "\n";
return;
}
while ( ::fgets( line.data(), static_cast<int>(MAX_LINE_LEN_L), fh
+) != NULL ) {
found = std::find( line.begin(), line.end(), '\t' );
count = fast_atoll64( found + 1 );
// Note: using emplace() is faster than map_upd[fixword] += coun
+t;
#ifdef MAX_STR_LEN_L
str_type fixword {}; // {} initializes all elements of fixword
+to '\0'
std::copy( line.begin(), found, fixword.begin() );
const auto [it, success] = map_upd.emplace(fixword, count);
#else
*found = '\0';
const auto [it, success] = map_upd.emplace(str_type{ line.data()
+ }, count);
#endif
if ( !success ) it->second += count;
}
::fclose(fh);
}
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;
double elaspe_time(
time_point cend,
time_point cstart)
{
return double (
std::chrono::duration_cast<milliseconds>(cend - cstart).count()
) * 1e-3;
}
// -------------------------------------------------------------------
+--
static map_str_int_type Map[MAX_THREADS];
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: llil4map file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fix
+ed);
#ifdef MAX_STR_LEN_L
std::cerr << "llil4map (fixed string length=" << MAX_STR_LEN_L << "
+) start\n";
#else
std::cerr << "llil4map start\n";
#endif
#ifdef _OPENMP
std::cerr << "use OpenMP\n";
#else
std::cerr << "don't use OpenMP\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
std::cerr << "don't use boost sort\n";
#else
std::cerr << "use boost sort\n";
#endif
time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3;
cstart1 = high_resolution_clock::now();
#ifdef _OPENMP
// Determine the number of threads.
const char* env_nthrs = std::getenv("NUM_THREADS");
int nthrs = ( env_nthrs && strlen(env_nthrs) )
? ::atoi(env_nthrs)
: std::thread::hardware_concurrency();
if ( nthrs > MAX_THREADS ) nthrs = MAX_THREADS;
omp_set_dynamic(false);
omp_set_num_threads(nthrs);
#else
int nthrs = 1;
#endif
int nthrs_sort = ( std::thread::hardware_concurrency() < 12 )
? std::thread::hardware_concurrency()
: 12;
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
int merge_id = 0;
// Run parallel, depending on the number of threads
if ( nthrs == 1 || nfiles == 1 ) {
for ( int i = 0; i < nfiles; ++i ) {
get_properties(fname[i], Map[0]);
}
cend1 = high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "get properties " << std::setw(8) << ctaken1 <
+< " secs\n";
cstart2 = high_resolution_clock::now();
}
#ifdef _OPENMP
else {
merge_id = -1;
int nfiles_processed = 0;
int gettime_rendered = 0;
#pragma omp parallel
{
int tid = omp_get_thread_num();
#pragma omp for nowait schedule(static, 1)
for ( int i = 0; i < nfiles; ++i ) {
get_properties(fname[i], Map[tid]);
#pragma omp atomic
nfiles_processed += 1;
}
#pragma omp critical
{
// report time for get properties
if ( nfiles_processed == nfiles && !gettime_rendered ) {
cend1 = high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "get properties " << std::setw(8) <<
+ctaken1 << " secs\n";
cstart2 = high_resolution_clock::now();
gettime_rendered = 1;
}
// merge array
if ( Map[tid].size() > 0 ) {
if ( merge_id < 0 ) {
// container id other threads merge to
merge_id = tid;
}
else {
for ( auto const& x : Map[tid] ) {
// Map[merge_id][x.first] += x.second;
const auto [it, success] = Map[merge_id].emplace(
+x.first, x.second);
if ( !success ) it->second += x.second;
}
Map[tid].clear();
}
}
}
}
}
#endif
if ( merge_id < 0 || Map[merge_id].size() == 0 ) {
std::cerr << "No work, exiting...\n";
return 1;
}
// Store the properties into a vector
vec_str_int_type propvec( Map[merge_id].begin(), Map[merge_id].end(
+) );
Map[merge_id].clear();
cend2 = high_resolution_clock::now();
double ctaken2 = elaspe_time(cend2, cstart2);
std::cerr << "finish merging " << std::setw(8) << ctaken2 << "
+ secs\n";
cstart3 = high_resolution_clock::now();
// Sort the vector by (count) in reverse order, (name) in lexical o
+rder
#if USE_BOOST_PARALLEL_SORT == 0
std::sort(
// Standard sort
propvec.begin(), propvec.end(),
[](const str_int_type& left, const str_int_type& right) {
return left.second != right.second
? left.second > right.second
: left.first < right.first;
}
);
#else
boost::sort::block_indirect_sort(
// Parallel sort
propvec.begin(), propvec.end(),
[](const str_int_type& left, const str_int_type& right) {
return left.second != right.second
? left.second > right.second
: left.first < right.first;
},
nthrs_sort
);
#endif
cend3s = high_resolution_clock::now();
// Output the sorted vector
#ifdef MAX_STR_LEN_L
for ( auto const& n : propvec )
::print(fast_io::concatln(fast_io::mnp::os_c_str(n.first.data(),
+ MAX_STR_LEN_L), "\t", n.second));
#else
for ( auto const& n : propvec )
::print(fast_io::concatln(n.first, "\t", n.second));
#endif
cend3 = high_resolution_clock::now();
double ctaken = elaspe_time(cend3, cstart1);
double ctaken3s = elaspe_time(cend3s, cstart3);
double ctaken3o = elaspe_time(cend3, cend3s);
std::cerr << "vector stable sort " << std::setw(8) << ctaken3s <<
+" secs\n";
std::cerr << "write stdout " << std::setw(8) << ctaken3o <<
+" secs\n";
std::cerr << "total time " << std::setw(8) << ctaken <<
+" secs\n";
// Hack to see Private Bytes in Windows Task Manager
// (uncomment next line so process doesn't exit too quickly)
// std::this_thread::sleep_for(milliseconds(9000));
return 0;
}
Parallel HashMap References Added Later
- 2024 update: marioroy noted that std::mutex is slower in gcc-14 than gcc-13; he found another way to use spin locks via a small C++ class on the web.
| [reply] [d/l] [select] |
|
I took a break since you had posted. Recently, I revised the map implementation to handle hundreds of files with lesser memory consumption. The llil4map variant may handle Chuma's use-case; see my reply to Chuma.
There is also the mcesort implementation, a GNU parallel (parsort variant). It includes a mini-MCE (less than 1,150 lines) integrated into the script. The application itself is around 500 lines. It runs on Unix only.
| [reply] |
|
|