llil2vec start
get_properties CPU time : 3.06313 secs
emplace set sort CPU time : 0.923435 secs
write stdout CPU time : 1.392 secs
total CPU time : 5.37868 secs
total wall clock time : 6 secs

llil2vec start
get_properties CPU time : 3.1567 secs
emplace set sort CPU time : 0.970294 secs
write stdout CPU time : 1.22305 secs
total CPU time : 5.35015 secs
total wall clock time : 5 secs

llil2vec start
get_properties CPU time : 3.32019 secs
emplace set sort CPU time : 1.08277 secs
write stdout CPU time : 1.22461 secs
total CPU time : 5.62766 secs
total wall clock time : 5 secs

#### Ave CPU time: 5.5 secs (Memory use (Windows Private Bytes): 1,225,580K) ####

llil2vec (fixed string length=6) start
get_properties CPU time : 2.09353 secs
emplace set sort CPU time : 0.795144 secs
write stdout CPU time : 1.20994 secs
total CPU time : 4.09871 secs
total wall clock time : 4 secs

llil2vec (fixed string length=6) start
get_properties CPU time : 2.2078 secs
emplace set sort CPU time : 0.707252 secs
write stdout CPU time : 1.14867 secs
total CPU time : 4.06383 secs
total wall clock time : 4 secs

llil2vec (fixed string length=6) start
get_properties CPU time : 2.39225 secs
emplace set sort CPU time : 1.0033 secs
write stdout CPU time : 1.22765 secs
total CPU time : 4.62331 secs
total wall clock time : 4 secs

#### Ave CPU time: 4.3 secs (Memory use (Windows Private Bytes): 814,940K) ####

// llil2vec.cpp.
// Vector version of llil2grt.pl.
// g++ compile on Linux:
//   g++ -o llil2vec -std=c++11 -Wall -O3 llil2vec.cpp
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run:  llil2vec big1.txt big2.txt big3.txt >vec.tmp

// NOTE(review): the header names of the #include directives (and the template
// arguments further down) were missing from the text this file was recovered
// from. The list below is reconstructed from the names the code actually uses;
// confirm against the original if it is available.
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include <algorithm>
#include <array>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// ----------------------------------------------------------------------------
// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
// #include <chrono>
// #include <thread>
// ----------------------------------------------------------------------------

typedef long long llil_int_type;

// Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length
// To use (limited length) fixed length strings uncomment the next line
// #define MAX_STR_LEN_L 6

#ifdef MAX_STR_LEN_L
using str_arr_type = std::array<char, MAX_STR_LEN_L + 1>;  // +1 for trailing '\0'
using str_int_type = std::pair<str_arr_type, llil_int_type>;
using int_str_type = std::pair<llil_int_type, str_arr_type>;
#else
using str_int_type = std::pair<std::string, llil_int_type>;
using int_str_type = std::pair<llil_int_type, std::string>;
#endif
using vec_str_int_type = std::vector<str_int_type>;
using vec_int_str_type = std::vector<int_str_type>;
using set_int_str_type = std::set<int_str_type>;

// Mimic the Perl get_properties subroutine ----------------------------
// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255

// Reads every input file line by line. Each line is expected to look like
// "word<TAB>count". For each such line a (-count, word) pair is appended to
// vec_ret; the count is negated so that an ascending ordering on the pair
// yields descending counts. On return vec_ret is sorted by word, so that
// entries for the same word are adjacent.
//
// A file that cannot be opened is reported on stderr and skipped.
// A line without both a word and a count field is skipped.
static void get_properties(
   int               nfiles,    // in:  the number of input files
   char*             fname[],   // in:  the input file names
   vec_int_str_type& vec_ret)   // out: a vector of properties
{
   char line[MAX_LINE_LEN_L + 1];

   for (int i = 0; i < nfiles; ++i) {
      FILE* fh = ::fopen(fname[i], "r");
      if (fh == NULL) {
         std::cerr << "Error opening '" << fname[i] << "' : errno=" << errno << "\n";
         continue;
      }
      while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
         // strtok returns NULL when a field is missing (blank line, no tab,
         // nothing after the tab); atoll(NULL) would be undefined behaviour,
         // so such lines are skipped.
         char* word = ::strtok(line, "\t");
         if (word == NULL) continue;
         const char* count_str = ::strtok(NULL, "\n");
         if (count_str == NULL) continue;
         const llil_int_type count = ::atoll(count_str);
#ifdef MAX_STR_LEN_L
         // The fixed-size array only has room for MAX_STR_LEN_L characters
         // plus the terminator; a longer word must not be copied into it.
         const size_t wlen = ::strlen(word);
         if (wlen > MAX_STR_LEN_L) {
            std::cerr << "Skipping word longer than " << MAX_STR_LEN_L
                      << " chars: '" << word << "'\n";
            continue;
         }
         str_arr_type fixword{};   // value-initialised: every char is '\0'
         ::memcpy( fixword.data(), word, wlen );
         vec_ret.emplace_back( -count, fixword );
#else
         vec_ret.emplace_back( -count, word );
#endif
      }
      ::fclose(fh);
   }

   // Needs to be sorted by word for later sum of adjacent count fields to work
   std::sort( vec_ret.begin(), vec_ret.end(),
      [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }
   );
}

// ---------------------------------------------------------------------

// Reads the files named on the command line, sums the counts per word and
// writes "word<TAB>count" lines to stdout in the order given by the std::set
// of (-count, word) pairs. Timing information is written to stderr.
// Returns 1 if no file name was given, 0 otherwise.
int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil2vec file1 file2 ... >out.txt\n";
      return 1;
   }
#ifdef MAX_STR_LEN_L
   std::cerr << "llil2vec (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
   std::cerr << "llil2vec start\n";
#endif
   time_t tstart1 = ::time(NULL);
   clock_t cstart1 = ::clock();

   // Create the vector of properties
   vec_int_str_type vec_ret;
   get_properties(argc - 1, &argv[1], vec_ret);

   clock_t cend1 = ::clock();
   double ctaken1 = static_cast<double>(cend1 - cstart1) / static_cast<double>(CLOCKS_PER_SEC);
   std::cerr << "get_properties CPU time : " << ctaken1 << " secs\n";
   clock_t cstart2 = ::clock();

   // To avoid calling sort(), create an inverted std::set container
   // Note: negative count gives desired ordering
   set_int_str_type myset;
   // Guard: with no input data (all files unreadable or empty) there is no
   // first element to start a run from.
   if ( !vec_ret.empty() ) {
      // vec_ret is sorted by word, so equal words form contiguous runs.
      // run_start marks the first entry of the current run and count
      // accumulates the (negated) counts of that run.
      auto run_start = vec_ret.cbegin();
      llil_int_type count = run_start->first;
      auto it = run_start;
      for (++it; it != vec_ret.cend(); ++it) {
         if ( it->second == run_start->second ) {
            count += it->first;
         }
         else {
            myset.emplace_hint( myset.end(), count, run_start->second );
            run_start = it;
            count = it->first;
         }
      }
      // Flush the final run
      myset.emplace_hint( myset.end(), count, run_start->second );
   }
   clock_t cend2s = ::clock();

   // Output the (already sorted) std::set - no sort() function required
   // Note: fix up negative count via -n.first
#ifdef MAX_STR_LEN_L
   for ( auto const& n : myset ) std::cout << n.second.data() << '\t' << -n.first << '\n';
#else
   for ( auto const& n : myset ) std::cout << n.second << '\t' << -n.first << '\n';
#endif

   clock_t cend2 = ::clock();
   time_t tend2 = ::time(NULL);
   // Wall clock time rounded to the nearest second
   long ttaken = static_cast<long>(::difftime(tend2, tstart1) + 0.5);
   double ctaken = static_cast<double>(cend2 - cstart1) / static_cast<double>(CLOCKS_PER_SEC);
   double ctaken2s = static_cast<double>(cend2s - cstart2) / static_cast<double>(CLOCKS_PER_SEC);
   double ctaken2o = static_cast<double>(cend2 - cend2s) / static_cast<double>(CLOCKS_PER_SEC);
   std::cerr << "emplace set sort CPU time : " << ctaken2s << " secs\n";
   std::cerr << "write stdout CPU time : " << ctaken2o << " secs\n";
   std::cerr << "total CPU time : " << ctaken << " secs\n";
   std::cerr << "total wall clock time : " << ttaken << " secs\n";

   // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
   // std::this_thread::sleep_for(std::chrono::milliseconds(90000000));

   return 0;
}