In short, could you do another Python/Pyston* run with the sort function replaced with ...
def sort_native( cat_count :dict[ str, int ] ) ->list[ str ]:
    """
    Returns:
      A `list` of keys sorted by decreasing order of values & increasing
      order of keys.
    Args:
      cat_count: A `dict` with string key & integer value.
    """
    # Secondary criterion first: keys in increasing order.
    once = sorted( cat_count.keys() )
    # Primary criterion second: values in decreasing order.  The sort is
    # stable, also with `reverse = True`, so keys with equal values keep the
    # increasing key order established above.
    return sorted( once, key = lambda k: cat_count[ k ], reverse = True )
...?
In long, before eyepopslikeamosquito posted ...
sort { $href->{$b} <=> $href->{$a} } sort keys %{$href}
... I had failed to notice the sorting order. I decided to do the same in the Python (native) version instead of using the functools.cmp_to_key function. I also realized that I had been sorting the other way around (sorting keys by value, followed by sorting by key), so I was not getting the expected output (which is what made me use cmp_to_key instead).
Replacing sort_via_cmp_to_key with ...
def sort_native( cat_count :dict[ str, int ] ) ->list[ str ]:
    """
    Returns:
      A `list` of keys sorted by decreasing order of values & increasing
      order of keys.
    Args:
      cat_count: A `dict` with string key & integer value.
    """
    # Secondary criterion first: keys in increasing order.
    once = sorted( cat_count.keys() )
    # Primary criterion second: values in decreasing order.  The sort is
    # stable, also with `reverse = True`, so keys with equal values keep the
    # increasing key order established above.
    return sorted( once, key = lambda k: cat_count[ k ], reverse = True )
... reduces the sort time by ~10 times (~11-16 s; also produces the expected output) in my run environment.
time passes, so slowly🎶 ...
Putting the complete program here (~28-35 s) ...
#!/usr/local/bin/python3.10
# Source: https://perlmonks.org/index.pl?node_id=11148702
#
# This is one Python implementation based on the problem specification
+ ...
#
# Rosetta Code: Long List is Long, 20221130,
# by eyepopslikeamosquito
# https://perlmonks.org/?node_id=11148465
import sys
from collections import defaultdict
from hashlib import sha256
from pathlib import PosixPath
from timeit import default_timer
# Takes ~0.3 s.
def verify( output :str,
            # The digest of the output produced from eyepopslikeamosquito's
            # program, "gen-llil.pl", after generating 3 input files.
            # Update it as needed.
            expected_sum :str =
              '70a1644743e9b9d8d73094ed1826527f27a7f3f131c3f28a63aaeb85e1af8fef'
          ) ->None:
    """
    Prints a message on standard error about output being different than
    expected; stays silent when the digests match.

    Args:
      output      : Stringified output of sorted input.
      expected_sum: SHA-256 digest sum in hexadecimal of expected ASCII output.
    Raises:
      UnicodeEncodeError: If `output` has a character outside of ASCII.
    """
    # Need to encode the resulting string with encoding of "ascii" to match up
    # with that of the input.
    # Also make sure that each category-count pair ends with a newline.
    digest = sha256( output.encode( encoding = 'ascii' ) ).hexdigest()
    if digest != expected_sum:
        sys.stderr.write( f"OUTPUT is DIFFERENT!\n {digest}\n" )
# Takes ~7-11 s.
def sort_val_desc_key_asc( cat_count :dict[ str, int ] ) ->list[ str ]:
    """
    Returns:
      A `list` of keys sorted by decreasing order of values & increasing
      order of keys.
    Args:
      cat_count: A `dict` with string key & integer value.
    """
    # Secondary criterion first: keys in increasing order.
    once = sorted( cat_count.keys() )
    # Primary criterion second: values in decreasing order.  The sort is
    # stable, also with `reverse = True`, so keys with equal values keep the
    # increasing key order established above.
    return sorted( once, key = lambda k: cat_count[ k ], reverse = True )
# Takes ~12-16 s.
def collect( data_list :list ) ->dict[ str, int ]:
    """
    Returns:
      A `dict` with category as key & total count as value, summed over
      all the given files.
    Args:
      data_list: list of file paths as `pathlib.PosixPath` objects.  Each
                 line is read as "category<TAB>count".
    Raises:
      ValueError: If a line lacks the tab delimiter, or the text after the
                  first tab is not an integer.
    """
    # `int` as the default factory yields 0 for a category not seen before.
    cat_count = defaultdict( int )
    delimiter = '\t'
    open_prop = { 'mode'     : 'rt',
                  'encoding' : 'ascii',
                  'newline'  : '\n'
                }
    for path in data_list:
        with path.open( **open_prop ) as fh:
            for line in fh:
                # Split on the first tab only; `int()` ignores the newline
                # still attached to the count.
                category, number = line.split( delimiter, 1 )
                cat_count[ category ] += int( number )
    return cat_count
# Collect file paths.
if sys.argv[1:]:
    data_list = [ PosixPath( p ) for p in sys.argv[1:] ]
else:
    # A string argument makes `sys.exit` print it on standard error & exit
    # with a failure status.
    sys.exit( 'Give a file list with data to process' )

start = default_timer()

# Process.
cat_count   = collect( data_list )
end_collect = default_timer()

# Sort.
sorted_key = sort_val_desc_key_asc( cat_count )
end_sort   = default_timer()

# Format; takes ~7 s.
stringified = ''.join( f'{k}\t{cat_count[ k ]}\n' for k in sorted_key )
end_stringification = default_timer()

# Either verify or print to verify outside of the program.
# Verification is slightly slower than dumping string on standard output.
# Set `verify_or_print` to 'verify' to check the digest in the program itself.
verify_or_print = 'NOT verify'
if verify_or_print == 'verify':
    output_label = 'sha256 verification'
    verify( stringified )
else:
    output_label = 'output'
    print( stringified, end = '' )
end = default_timer()

# Print time taken.
stat = { 'collect'         : end_collect - start,
         'sort'            : end_sort - end_collect,
         'stringification' : end_stringification - end_sort,
         output_label      : end - end_stringification,
         # ~28-36 s.
         'total'           : end - start
       }
# Widths of the widest label & the widest formatted time, to align columns.
max_label_width = max( len( k ) for k in stat )
#
decimal_place  = 1
max_time_width = max( len( f'{t:0.{decimal_place}f}' ) for t in stat.values() )
time_format    = f'{max_time_width}.{decimal_place}f'
#
out_format = '{label:{label_pad}}: {time:{time_format}} s\n'
#
# The report goes to standard error to keep it apart from the sorted output
# on standard output.
sys.stderr.write( f'# {__file__}\n' )
for step, time in stat.items():
    mess = out_format.format( label = step, label_pad = max_label_width,
                              time = time, time_format = time_format
                            )
    sys.stderr.write( mess )
# Time.
# parv-20221210-two-sorts.py
collect : 15.9 s
sort : 7.8 s
stringification: 7.5 s
output : 0.1 s
total : 31.3 s