// llil2m.cpp. C++ 11 version of Perl llil.pl. // C++ version inspired by Mario's famous dualvar two-sort Perl solution. // g++ compile on Linux: // g++ -o llil2m -std=c++11 -Wall -O3 llil2m.cpp // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil2m tt1.txt tt2.txt tt3.txt >out.txt // Uncomment next line to mimic marioroy two sort trick (needs stable sort) #define MARIOROY_TWO_SORT_TRICK_L 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // For some performance hacks to speed up C++ I/O see: // https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/ // The only one we use here is to prefer "\n" to std::endl to reduce stdout flushing // ---------------------------------------------------------------------------- typedef long long llil_int_type; using str_int_type = std::pair; using map_str_int_type = std::map; using vec_str_int_type = std::vector; // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use lower level ANSI C functions to try to boost I/O performance // TODO (maybe) Try: // - reading: ::setvbuf(fh, NULL, _IOFBF, 65536) on input files // - writing: ::setvbuf(stdout, stdout_buf, _IOFBF, sizeof(stdout_buf)) // ... or instead of writing to stdout, take output file as program argument #define MAX_LINE_LEN_L 255 static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names map_str_int_type& hash_ret) // out: a hash of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* word; llil_int_type count; for (int i = 0; i < nfiles; ++i) { fh = ::fopen(fname[i], "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname[i] << "' : errno=" << errno << "\n"; continue; } #if 0 // This is no faster if ( ::setvbuf(fh, NULL, _IOFBF, 65536 * 4 ) != 0 ) { std::cerr << "Error setvbuf '" << fname[i] << "'\n"; return; } #endif while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { word = ::strtok(line, "\t"); count = ::atoll( ::strtok(NULL, "\n") ); hash_ret[word] -= count; } ::fclose(fh); } } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil2c file1 file2 ... >out.txt\n"; return 1; } std::cerr << "llil2m start\n"; time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Create the hash of properties map_str_int_type hash_ret; get_properties(argc - 1, &argv[1], hash_ret); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC; std::cerr << "get_properties CPU time : " << ctaken1 << " secs\n"; clock_t cstart2 = ::clock(); // Create a vector to sort (can't sort a std::map directly) vec_str_int_type v( hash_ret.begin(), hash_ret.end() ); clock_t cend2v = ::clock(); #ifdef MARIOROY_TWO_SORT_TRICK_L // std::map is already sorted by string key so just stable_sort by value std::stable_sort( v.begin(), v.end(), [](const str_int_type& left, const str_int_type& right) { return left.second < right.second; } ); #else // Sort descending by value, i.e. mimic this Perl code in C++: // sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} // No surprise that string key comparison is way slower than numeric key comparison in std::sort() std::sort( v.begin(), v.end(), [](const str_int_type& left, const str_int_type& right) { return left.second != right.second ? left.second < right.second : left.first < right.first; } ); #endif clock_t cend2s = ::clock(); // Output the merged properties // (std::for_each() no faster, ditto replacing std::cout with ::printf) for ( auto const& n : v ) { std::cout << n.first << '\t' << -n.second << '\n'; } clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast(::difftime(tend2, tstart1) + 0.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC; double ctaken2v = (double) (cend2v - cstart2) / (double)CLOCKS_PER_SEC; double ctaken2s = (double) (cend2s - cend2v) / (double)CLOCKS_PER_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER_SEC; std::cerr << "vector copy CPU time : " << ctaken2v << " secs\n"; #ifdef MARIOROY_TWO_SORT_TRICK_L std::cerr << "vector stable sort CPU time : " << ctaken2s << " secs\n"; #else std::cerr << "vector sort CPU time : " << ctaken2s << " secs\n"; #endif std::cerr << "write stdout CPU time : " << ctaken2o << " secs\n"; std::cerr << "total CPU time : " << ctaken << " secs\n"; std::cerr << "total wall clock : " << ttaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; } #### > llil2m big1.txt big2.txt big3.txt >llil2m.tmp llil2m start get_properties CPU time : 4.187 secs vector copy CPU time : 0.612 secs vector sort CPU time : 2.89 secs write stdout CPU time : 1.383 secs total CPU time : 9.074 secs total wall clock : 9 secs #### > llil2m big1.txt big2.txt big3.txt >llil2m.tmp llil2m start get_properties CPU time : 4.286 secs vector copy CPU time : 0.613 secs vector stable sort CPU time : 2.977 secs write stdout CPU time : 1.363 secs total CPU time : 9.244 secs total wall clock : 9 secs Memory use (Windows Private Bytes): 1,218,716K #### // llil2grt.cpp. // Inspired by llilgrt.pl: improve sort performance via a negative count. // g++ compile on Linux: // g++ -o llil2grt -std=c++11 -Wall -O3 llil2grt.cpp // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil2grt tt1.txt tt2.txt tt3.txt >out.txt #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // ---------------------------------------------------------------------------- typedef long long llil_int_type; using str_int_type = std::pair; using int_str_type = std::pair; using map_str_int_type = std::map; using vec_str_int_type = std::vector; using set_int_str_type = std::set; // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names map_str_int_type& hash_ret) // out: a hash of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* word; llil_int_type count; for (int i = 0; i < nfiles; ++i) { fh = ::fopen(fname[i], "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname[i] << "' : errno=" << errno << "\n"; continue; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { word = ::strtok(line, "\t"); count = ::atoll( ::strtok(NULL, "\n") ); hash_ret[word] -= count; } ::fclose(fh); } } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil2grt file1 file2 ... >out.txt\n"; return 1; } std::cerr << "llil2grt start\n"; time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Create the hash of properties map_str_int_type hash_ret; get_properties(argc - 1, &argv[1], hash_ret); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC; std::cerr << "get_properties CPU time : " << ctaken1 << " secs\n"; clock_t cstart2 = ::clock(); // To avoid calling sort(), try creating an inverted std::set container // Note: negative count gives desired ordering (just like Perl GRT sort :) set_int_str_type s; for ( auto const& kv : hash_ret ) { // Note: emplace_hint() was a lot faster than emplace() s.emplace_hint( s.end(), kv.second, kv.first ); } clock_t cend2s = ::clock(); // Output the (already sorted) std::set - no sort() function required! // Note: fix up negative count via -n.first for ( auto const& n : s ) { std::cout << n.second << '\t' << -n.first << '\n'; } clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast(::difftime(tend2, tstart1) + 0.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC; double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER_SEC; std::cerr << "emplace set sort CPU time : " << ctaken2s << " secs\n"; std::cerr << "write stdout CPU time : " << ctaken2o << " secs\n"; std::cerr << "total CPU time : " << ctaken << " secs\n"; std::cerr << "total wall clock time : " << ttaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; } #### > llil2grt big1.txt big2.txt big3.txt >llil2grt.tmp llil2grt start get_properties CPU time : 4.252 secs emplace set sort CPU time : 1.282 secs write stdout CPU time : 1.716 secs total CPU time : 7.254 secs total wall clock time : 7 secs Memory use (Windows Private Bytes): 1,626,728K