// llil2.cpp. C++ 11 version of Perl llil.pl.
// llil2.cpp is faster than llil.cpp while also clarifying limits:
// - all keys should be less than 200 or so characters in length
// - numbers are 64 bit integers (max: 9,223,372,036,854,775,807)
// g++ compile on Linux:
//   g++ -o llil2 -std=c++11 -Wall -O3 llil2.cpp
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run:  llil2 tt1.txt tt2.txt tt3.txt >out.txt

// NOTE(review): the header names on the #include lines were missing; the list
// below is the minimal set the code in this file requires. Confirm against the
// original file in case it pulled in further headers.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// ----------------------------------------------------------------------------
// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
// #include <chrono>
// #include <thread>

// For some performance hacks to speed up C++ I/O see:
//   https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/
// The only one we use here is to prefer "\n" to std::endl to reduce stdout flushing
// ----------------------------------------------------------------------------

using llil_int_type     = long long;
using str_int_type      = std::pair<std::string, llil_int_type>;
using map_str_int_type  = std::map<std::string, llil_int_type>;
using vec_str_int_type  = std::vector<str_int_type>;

// Mimic the Perl get_properties subroutine ----------------------------
// Limit line length and use lower level ANSI C functions to try to boost I/O performance
// TODO (maybe):
//  - reading: Try ::setvbuf(fh, NULL, _IOFBF, 65536) or some such on input files
//  - writing: Try ::setvbuf(stdout, stdout_buf, _IOFBF, sizeof(stdout_buf)) on stdout
//             ... or instead of writing to stdout, take an output file as a program argument
#define MAX_LINE_LEN_L 255

// Read every input file and accumulate "key<TAB>count" lines into hash_ret,
// summing the counts of keys that occur more than once.
//
// Behaviour worth knowing:
//  - If a file cannot be opened, an error is written to stderr and the function
//    returns immediately; files after the failing one are not read.
//  - A line that has no tab-separated key or no count is skipped rather than
//    dereferencing the NULL that strtok returns for the missing field.
//  - A line that fills the read buffer without reaching a newline is treated as
//    overlong: the remainder of that physical line is discarded so its tail is
//    not mistaken for a separate record. (A final unterminated line of exactly
//    MAX_LINE_LEN_L - 1 characters is therefore also skipped.)
//  - The number of skipped lines, if any, is reported once per file on stderr.
static void get_properties(
   int               nfiles,    // in:  the number of input files
   char*             fname[],   // in:  the input file names
   map_str_int_type& hash_ret)  // out: a hash of properties
{
   char line[MAX_LINE_LEN_L + 1];

   for (int i = 0; i < nfiles; ++i) {
      FILE* fh = ::fopen(fname[i], "r");
      if (fh == nullptr) {
         std::cerr << "Error opening '" << fname[i] << "'\n";
         return;
      }

      unsigned long long nskipped = 0;
      while (::fgets(line, MAX_LINE_LEN_L, fh) != nullptr) {
         // fgets stores at most MAX_LINE_LEN_L - 1 characters; a full buffer
         // that does not end in '\n' means the line was truncated.
         const size_t len = ::strlen(line);
         if (len == MAX_LINE_LEN_L - 1 && line[len - 1] != '\n') {
            int ch;
            while ((ch = ::fgetc(fh)) != EOF && ch != '\n') {
               // discard the rest of the overlong line
            }
            ++nskipped;
            continue;
         }

         // strtok returns NULL when a field is absent (no tab, or nothing
         // after the tab); such lines cannot be accumulated.
         const char* word  = ::strtok(line, "\t");
         const char* count = ::strtok(nullptr, "\n");
         if (word == nullptr || count == nullptr) {
            ++nskipped;
            continue;
         }

         hash_ret[word] += ::atoll(count);
      }
      ::fclose(fh);

      if (nskipped != 0) {
         std::cerr << "Skipped " << nskipped << " malformed or overlong line(s) in '"
                   << fname[i] << "'\n";
      }
   }
}

// ---------------------------------------------------------------------

// Program entry point: merge the "key<TAB>count" files named on the command
// line, then print the merged totals to stdout sorted by descending count and,
// for equal counts, by ascending key. Timing information goes to stderr.
// Returns 1 if no input files were given, otherwise 0.
int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil2 file1 file2 ... >out.txt\n";
      return 1;
   }

   std::cerr << "llil2 start\n";
   time_t tstart1 = ::time(nullptr);

   // Create the hash of properties
   map_str_int_type hash_ret;
   get_properties(argc - 1, &argv[1], hash_ret);

   time_t tend1 = ::time(nullptr);
   // difftime yields seconds as a double; + 0.5 rounds to the nearest second
   long taken1 = static_cast<long>(::difftime(tend1, tstart1) + 0.5);
   std::cerr << "get_properties : " << taken1 << " secs\n";

   // Sort descending by value, i.e. mimic this Perl code in C++:
   //   sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}
   time_t tstart2 = ::time(nullptr);
   vec_str_int_type v( hash_ret.begin(), hash_ret.end() );
   std::sort(
      v.begin(), v.end(),
      [](const str_int_type& left, const str_int_type& right) {
         return right.second != left.second
            ? right.second < left.second
            : left.first < right.first;
      }
   );

   // Output the merged properties
   for ( auto const& n : v ) {
      std::cout << n.first << '\t' << n.second << '\n';
   }

   time_t tend2 = ::time(nullptr);
   long taken2 = static_cast<long>(::difftime(tend2, tstart2) + 0.5);
   long taken  = static_cast<long>(::difftime(tend2, tstart1) + 0.5);
   std::cerr << "sort + output : " << taken2 << " secs\n";
   std::cerr << "total : " << taken << " secs\n";

   // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
   // std::this_thread::sleep_for(std::chrono::milliseconds(90000000));

   return 0;
}