LC_ALL=C sort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C sort -k2nr >out.txt
real time: 12.671 seconds
LC_ALL=C parsort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C parsort -k2nr >out.txt
real time: 6.561 seconds
LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count | LC_ALL=C parsort -k2nr >out.txt
real time: 3.615 seconds
####
LC_ALL=C parsort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C parsort -k2nr >out.txt
real time: 6.520 seconds
LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count | LC_ALL=C parsort -k2nr >out.txt
real time: 2.398 seconds
##
##
cat big{1,2,3}.txt | ./linux-q -tT "select c1, sum(c2) from - group by c1 order by sum(c2) desc, c1" >out.txt
real time: 84.027 seconds
./linux-q -tT "select c1, sum(c2) from big[1-3].txt group by c1 order by sum(c2) desc, c1" >out.txt
real time: 89.539 seconds
##
##
# Tally adjacent count fields of duplicate key names.
#
# Input : tab-separated "key<TAB>count" records, ordered so that equal keys
#         are adjacent (the pipeline feeds this script from sort/parsort -k1).
# Output: one "key<TAB>sum" record per run of identical adjacent keys.
BEGIN {
    OFS = FS = "\t"
    key = ""
    sum = 0
    flg = 1    # stays 1 until the first record has been read
}
{
    # The very first record seeds the current key.
    if ( flg ) {
        flg = 0
        key = $1
    }
    if ( $1 == key ) {
        sum += $2
    } else {
        # Key changed: emit the finished run and start a new one.
        print key, sum
        key = $1
        sum = $2
    }
}
END {
    # Flush the last run. With empty input no record was ever read, so
    # printing here would emit a bogus "<TAB>0" line.
    if ( ! flg )
        print key, sum
}
##
##
// ----------------------------------------------------------------------------
// tally-count.cpp
// Tally adjacent count fields of duplicate key names.
//
// Obtain the fast_io library (required dependency):
// git clone --depth=1 https://github.com/cppfastio/fast_io
//
// Obtain GNU Parallel, which provides parsort (check first; your OS vendor may offer a package):
// https://www.gnu.org/software/parallel/
// https://www.gnu.org/software/parallel/sphinx.html
//
// g++ compile on Linux:
// g++ -o tally-count -std=c++20 -Wall -O3 tally-count.cpp -I ./fast_io/include
//
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
//
// clang++ compile:
// Seems to run slightly faster when compiled with clang++ instead of g++
//
// Example run:
// LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count | LC_ALL=C parsort -k2nr > out.txt
// ----------------------------------------------------------------------------
#include <chrono>
// The fast_io header must come after chrono, else build error:
// "no member named 'concatln' in namespace 'fast_io'"
#include <fast_io.h>
// NOTE(review): the header names below were reconstructed from the identifiers
// this file uses (std::array, fgets, memcpy/strcmp, int64_t, ...) — confirm
// against the original source.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
// fast_atoll64
// https://stackoverflow.com/questions/16826422/
// c-most-efficient-way-to-convert-string-to-int-faster-than-atoi
//
// Converts the leading decimal number in `str` to a 64-bit signed integer.
// Accepts one optional leading '-' and then consumes digits until the first
// non-digit character (including the terminating NUL). No whitespace or '+'
// is skipped; a string without leading digits yields 0.
inline int64_t fast_atoll64( const char* str )
{
    const bool negative = ( *str == '-' );
    if ( negative )
        ++str;

    // Accumulate in an unsigned type: unsigned wraparound is well defined,
    // whereas overflowing int64_t is undefined behaviour. This also lets the
    // magnitude of INT64_MIN (which does not fit in int64_t) be represented.
    uint64_t magnitude = 0;
    uint8_t digit;
    // Any byte outside '0'..'9' maps to a value above 9 after the cast, which ends the loop.
    while ( ( digit = static_cast<uint8_t>( *str++ - '0' ) ) <= 9 )
        magnitude = magnitude * 10 + digit;

    return static_cast<int64_t>( negative ? 0 - magnitude : magnitude );
}
#define MAX_LINE_LEN_L 255

// Reads "key<TAB>count" records from standard input and, for every run of
// adjacent records sharing the same key, prints one "key<TAB>sum" line.
//
// - Records that contain no tab are skipped.
// - Each fgets call reads at most MAX_LINE_LEN_L - 1 characters, so a longer
//   input line is consumed in several pieces and each piece is handled as if
//   it were a record of its own.
// - Nothing is printed when no valid record was read.
//
// Always returns 0; argc and argv are not used.
int main(int argc, char* argv[])
{
    std::array<char, MAX_LINE_LEN_L + 1> line, name;
    long long sum = 0;
    bool have_key = false;   // becomes true once `name`/`sum` hold a pending run

    while ( ::fgets( line.data(), static_cast<int>(MAX_LINE_LEN_L), ::stdin ) != nullptr ) {
        // Search only the string fgets just produced. Scanning the whole
        // buffer could hit a stale tab left behind by an earlier, longer line.
        char* const tab = ::strchr( line.data(), '\t' );
        if ( tab == nullptr )
            continue;

        const long long count = fast_atoll64( tab + 1 );
        *tab = '\0';   // terminate the key name in place

        if ( have_key && ::strcmp( line.data(), name.data() ) == 0 ) {
            sum += count;
            continue;
        }

        // Key changed (or this is the first record): flush the finished run.
        if ( have_key )
            ::print( fast_io::concatln( fast_io::mnp::os_c_str(name.data()), "\t", sum ) );

        // Copy the key together with its terminating NUL.
        ::memcpy( name.data(), line.data(), tab - line.data() + 1 );
        sum = count;
        have_key = true;
    }

    if ( have_key )
        ::print( fast_io::concatln( fast_io::mnp::os_c_str(name.data()), "\t", sum ) );
    return 0;
}