After being surprised in the extreme by
marioroy's astonishing two-sort dualvar Perl solution
and my accidental jwkrahn-inspired llil2grt.pl Perl GRT solution,
I couldn't restrain myself from trying to steal ideas from these two nodes
to improve the (9 second) speed of my fastest llil2a C++ solution.
llil2m.cpp
Here is my C++ solution inspired by marioroy's famous two-sort Perl concoction.
// llil2m.cpp. C++ 11 version of Perl llil.pl.
// C++ version inspired by Mario's famous dualvar two-sort Perl soluti
+on.
// g++ compile on Linux:
// g++ -o llil2m -std=c++11 -Wall -O3 llil2m.cpp
// This g++ command also works with mingw C++ compiler (https://source
+forge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex
+e).
// Example run: llil2m tt1.txt tt2.txt tt3.txt >out.txt
// Uncomment next line to mimic marioroy two sort trick (needs stable
+sort)
#define MARIOROY_TWO_SORT_TRICK_L 1
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <utility>
#include <iostream>
#include <fstream>
#include <sstream>
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne
+ed a 64-bit compile");
// -------------------------------------------------------------------
+---------
// Crude hack to see Windows Private Bytes in Task Manager by sleeping
+ at
// program end (see also sleep hack at end of main)
// #include <chrono>
// #include <thread>
// For some performance hacks to speed up C++ I/O see:
// https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast
+_stdinstdout_io_suitable_for/
// The only one we use here is to prefer "\n" to std::endl to reduce s
+tdout flushing
// -------------------------------------------------------------------
+---------
typedef long long llil_int_type;
using str_int_type = std::pair<std::string, llil_int_type>;
using map_str_int_type = std::map<std::string, llil_int_type>;
using vec_str_int_type = std::vector<str_int_type>;
// Mimic the Perl get_properties subroutine --------------------------
+--
// Limit line length and use lower level ANSI C functions to try to bo
+ost I/O performance
// TODO (maybe) Try:
// - reading: ::setvbuf(fh, NULL, _IOFBF, 65536) on input files
// - writing: ::setvbuf(stdout, stdout_buf, _IOFBF, sizeof(stdout_bu
+f))
// ... or instead of writing to stdout, take output file
+as program argument
#define MAX_LINE_LEN_L 255
static void get_properties(
int nfiles, // in: the number of input files
char* fname[], // in: the input file names
map_str_int_type& hash_ret) // out: a hash of properties
{
FILE* fh;
char line[MAX_LINE_LEN_L+1];
char* word;
llil_int_type count;
for (int i = 0; i < nfiles; ++i) {
fh = ::fopen(fname[i], "r");
if (fh == NULL) {
std::cerr << "Error opening '" << fname[i] << "' : errno=" <<
+ errno << "\n";
continue;
}
#if 0
// This is no faster
if ( ::setvbuf(fh, NULL, _IOFBF, 65536 * 4 ) != 0 ) {
std::cerr << "Error setvbuf '" << fname[i] << "'\n";
return;
}
#endif
while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
word = ::strtok(line, "\t");
count = ::atoll( ::strtok(NULL, "\n") );
hash_ret[word] -= count;
}
::fclose(fh);
}
}
// -------------------------------------------------------------------
+--
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: llil2c file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << "llil2m start\n";
time_t tstart1 = ::time(NULL);
clock_t cstart1 = ::clock();
// Create the hash of properties
map_str_int_type hash_ret;
get_properties(argc - 1, &argv[1], hash_ret);
clock_t cend1 = ::clock();
double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SE
+C;
std::cerr << "get_properties CPU time : " << ctaken1 << " secs
+\n";
clock_t cstart2 = ::clock();
// Create a vector to sort (can't sort a std::map directly)
vec_str_int_type v( hash_ret.begin(), hash_ret.end() );
clock_t cend2v = ::clock();
#ifdef MARIOROY_TWO_SORT_TRICK_L
// std::map is already sorted by string key so just stable_sort by
+value
std::stable_sort( v.begin(), v.end(),
[](const str_int_type& left, const str_int_type& right) { return
+ left.second < right.second; }
);
#else
// Sort descending by value, i.e. mimic this Perl code in C++:
// sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href
+}
// No surprise that string key comparison is way slower than numeri
+c key comparison in std::sort()
std::sort( v.begin(), v.end(),
[](const str_int_type& left, const str_int_type& right) { return
+ left.second != right.second ? left.second < right.second : left.firs
+t < right.first; }
);
#endif
clock_t cend2s = ::clock();
// Output the merged properties
// (std::for_each() no faster, ditto replacing std::cout with ::pri
+ntf)
for ( auto const& n : v ) { std::cout << n.first << '\t' << -n.seco
+nd << '\n'; }
clock_t cend2 = ::clock();
time_t tend2 = ::time(NULL);
long ttaken = static_cast<long>(::difftime(tend2, tstart1) + 0
+.5);
double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_
+SEC;
double ctaken2v = (double) (cend2v - cstart2) / (double)CLOCKS_PER
+_SEC;
double ctaken2s = (double) (cend2s - cend2v) / (double)CLOCKS_PER
+_SEC;
double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER
+_SEC;
std::cerr << "vector copy CPU time : " << ctaken2v << " sec
+s\n";
#ifdef MARIOROY_TWO_SORT_TRICK_L
std::cerr << "vector stable sort CPU time : " << ctaken2s << " sec
+s\n";
#else
std::cerr << "vector sort CPU time : " << ctaken2s << " sec
+s\n";
#endif
std::cerr << "write stdout CPU time : " << ctaken2o << " sec
+s\n";
std::cerr << "total CPU time : " << ctaken << " secs
+\n";
std::cerr << "total wall clock : " << ttaken << " secs
+\n";
// Hack to see Private Bytes in Windows Task Manager (uncomment nex
+t line so process doesn't exit too quickly)
// std::this_thread::sleep_for(std::chrono::milliseconds(90000000
+));
return 0;
}
The above program can be compiled with or without MARIOROY_TWO_SORT_TRICK_L defined.
Without MARIOROY_TWO_SORT_TRICK_L defined I saw:
> llil2m big1.txt big2.txt big3.txt >llil2m.tmp
llil2m start
get_properties CPU time : 4.187 secs
vector copy CPU time : 0.612 secs
vector sort CPU time : 2.89 secs
write stdout CPU time : 1.383 secs
total CPU time : 9.074 secs
total wall clock : 9 secs
So far so good. This is just the fastest C++ solution found previously.
Beside myself with excitement, I defined MARIOROY_TWO_SORT_TRICK_L ... but sobbed when I saw:
> llil2m big1.txt big2.txt big3.txt >llil2m.tmp
llil2m start
get_properties CPU time : 4.286 secs
vector copy CPU time : 0.613 secs
vector stable sort CPU time : 2.977 secs
write stdout CPU time : 1.363 secs
total CPU time : 9.244 secs
total wall clock : 9 secs
Memory use (Windows Private Bytes): 1,218,716K
No improvement. At all. Aaaargh!
When I changed stable_sort to sort the sort speed improved dramatically from
2.977 secs to just 0.750 secs ... but of course the output was wrong.
It seems there is a high performance price to pay for a stable sort in this case.
Oh well, at least we now have an alternative nine second solution in C++ using a stable sort.
Sorting Algorithms
BTW, from sort - perl pragma to control sort() behaviour
The sort pragma is now a no-op, and its use is discouraged.
...
The default sort has been stable since v5.8.0, and given this consistent behaviour for almost two decades, everyone has come to assume stability.
Given the above performance difference between stable and non-stable sorts, this looks like an unfortunate decision to me.
After all, there may be times when you really want the performance boost of a non-stable sort in Perl.
Update: Stroustrup, in his C++ Programming Language book,
notes that stable_sort improves towards N*log(N) when the system has sufficient memory, but degrades
to N*log(N)*log(N) otherwise ... so I guess the use case for needing a non-stable sort in Perl
is when you are sorting huge arrays.
See also Sorting algorithm (wikipedia).
llil2grt.cpp
Here is my C++ solution inspired by this llil2grt.pl Perl GRT solution.
Desperate to avoid the sort function after being burnt during my llil2m.cpp adventure,
and remembering jwkrahn's cute GRT trick of achieving a descending sort via
an ascending sort of negative numbers, I had to try a similar stunt in C++.
In llil2grt below, I avoided sort simply by
creating an inverted set_int_str_type container from the existing map_str_int_type container.
// llil2grt.cpp.
// Inspired by llilgrt.pl: improve sort performance via a negative cou
+nt.
// g++ compile on Linux:
// g++ -o llil2grt -std=c++11 -Wall -O3 llil2grt.cpp
// This g++ command also works with mingw C++ compiler (https://source
+forge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex
+e).
// Example run: llil2grt tt1.txt tt2.txt tt3.txt >out.txt
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>
#include <string>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <utility>
#include <iostream>
#include <fstream>
#include <sstream>
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne
+ed a 64-bit compile");
// -------------------------------------------------------------------
+---------
// Crude hack to see Windows Private Bytes in Task Manager by sleeping
+ at
// program end (see also sleep hack at end of main)
// #include <chrono>
// #include <thread>
// -------------------------------------------------------------------
+---------
typedef long long llil_int_type;
using str_int_type = std::pair<std::string, llil_int_type>;
using int_str_type = std::pair<llil_int_type, std::string>;
using map_str_int_type = std::map<std::string, llil_int_type>;
using vec_str_int_type = std::vector<str_int_type>;
using set_int_str_type = std::set<int_str_type>;
// Mimic the Perl get_properties subroutine --------------------------
+--
// Limit line length and use ANSI C functions to try to boost performa
+nce
#define MAX_LINE_LEN_L 255
static void get_properties(
int nfiles, // in: the number of input files
char* fname[], // in: the input file names
map_str_int_type& hash_ret) // out: a hash of properties
{
FILE* fh;
char line[MAX_LINE_LEN_L+1];
char* word;
llil_int_type count;
for (int i = 0; i < nfiles; ++i) {
fh = ::fopen(fname[i], "r");
if (fh == NULL) {
std::cerr << "Error opening '" << fname[i] << "' : errno=" <<
+ errno << "\n";
continue;
}
while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
word = ::strtok(line, "\t");
count = ::atoll( ::strtok(NULL, "\n") );
hash_ret[word] -= count;
}
::fclose(fh);
}
}
// -------------------------------------------------------------------
+--
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: llil2grt file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << "llil2grt start\n";
time_t tstart1 = ::time(NULL);
clock_t cstart1 = ::clock();
// Create the hash of properties
map_str_int_type hash_ret;
get_properties(argc - 1, &argv[1], hash_ret);
clock_t cend1 = ::clock();
double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SE
+C;
std::cerr << "get_properties CPU time : " << ctaken1 << " secs
+\n";
clock_t cstart2 = ::clock();
// To avoid calling sort(), try creating an inverted std::set conta
+iner
// Note: negative count gives desired ordering (just like Perl GRT
+sort :)
set_int_str_type s;
for ( auto const& kv : hash_ret ) {
// Note: emplace_hint() was a lot faster than emplace()
s.emplace_hint( s.end(), kv.second, kv.first );
}
clock_t cend2s = ::clock();
// Output the (already sorted) std::set - no sort() function requir
+ed!
// Note: fix up negative count via -n.first
for ( auto const& n : s ) { std::cout << n.second << '\t' << -n.fir
+st << '\n'; }
clock_t cend2 = ::clock();
time_t tend2 = ::time(NULL);
long ttaken = static_cast<long>(::difftime(tend2, tstart1) + 0
+.5);
double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_
+SEC;
double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER
+_SEC;
double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER
+_SEC;
std::cerr << "emplace set sort CPU time : " << ctaken2s << " sec
+s\n";
std::cerr << "write stdout CPU time : " << ctaken2o << " sec
+s\n";
std::cerr << "total CPU time : " << ctaken << " sec
+s\n";
std::cerr << "total wall clock time : " << ttaken << " sec
+s\n";
// Hack to see Private Bytes in Windows Task Manager (uncomment nex
+t line so process doesn't exit too quickly)
// std::this_thread::sleep_for(std::chrono::milliseconds(90000000
+));
return 0;
}
I was pleasantly surprised to see this improved my fastest C++ solution from 9 seconds down to 7:
> llil2grt big1.txt big2.txt big3.txt >llil2grt.tmp
llil2grt start
get_properties CPU time : 4.252 secs
emplace set sort CPU time : 1.282 secs
write stdout CPU time : 1.716 secs
total CPU time : 7.254 secs
total wall clock time : 7 secs
Memory use (Windows Private Bytes): 1,626,728K
Though faster, it's no surprise memory use increased because instead of sorting in place,
you're creating a new set_int_str_type container.
Updated: Minor cosmetic changes were made to llil2grt.cpp after posting.
Added "Sorting Algorithms" section with a bit more information on stable sorts.