llil start
get_properties : 44 secs
sort + output : 429 secs
total : 473 secs
####
start
get properties: 60 secs
sort + output: 63 secs
total: 123 secs
##
##
# Manually rounded off after posting.
collect time : 37.6 s
sort_via_cmp_to_key time : 115.1 s
sort+format time : 115.1 s
total processing time : 152.7 s
output time : 22.0 s
total time : 174.7 s
##
##
#!/usr/local/bin/python3.10
# This is one Python implementation based on the problem specification ...
#
# Rosetta Code: Long List is Long, 20221130,
# by eyepopslikeamosquito
# https://perlmonks.org/?node_id=11148465
#
# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name, a TAB character, and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
# camel\t42
# pearl\t94
# dromedary\t69
# and tt2.txt containing:
# camel\t8
# hello\t12345
# dromedary\t1
# returns this hashref:
# $hash_ret{"camel"} = 50
# $hash_ret{"dromedary"} = 70
# $hash_ret{"hello"} = 12345
# $hash_ret{"pearl"} = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
# hello\t12345
# pearl\t94
# dromedary\t70
# camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------
import sys
from functools import cmp_to_key
from pathlib import PosixPath
from timeit import default_timer
from typing import Generator
# Timing registry: maps a stage label to the number of seconds that stage took.
# Entries are added as each stage finishes & are reported on stderr at the end.
time_stat :dict[ str, float ] = {}
# Takes ~35-46 s.
def collect( data_list :list ) ->dict[ str, int ]:
    """
    Read every file in `data_list` & add up the counts per category.

    Returns:
      A `dict` with category as key & total count as value.
    Args:
      data_list: list of file paths as `pathlib.PosixPath` objects; each file
        is read as ASCII text with one `category<TAB>count` record per line.
    Side effect:
      Updates `time_stat` with the time taken to collect the data.
    Raises:
      ValueError: if a line has no TAB separator or its count is not an
        integer.
    """
    start = default_timer()
    cat_count :dict[ str, int ] = {}
    delimiter = '\t'
    open_prop = { 'mode'     : 'rt',
                  'encoding' : 'ascii',
                  'newline'  : '\n'
                }
    for path in data_list:
        with path.open( **open_prop ) as fh:
            for line in fh:
                # Split once only: everything after the first TAB is the count.
                category, number = line.split( delimiter, 1 )
                # `int` ignores the trailing newline. A single `get` with a
                # default replaces the membership test + insert + update.
                cat_count[ category ] = cat_count.get( category, 0 ) + int( number )
    time_stat['collect time'] = default_timer() - start
    return cat_count
# The time recorded here is almost entirely the time of `sort_via_cmp_to_key`:
# building the generator object is too quick to show up at the reported
# precision, & the strings themselves are only formatted once the generator is
# consumed.
def process( cat_count :dict[ str, int ] ) ->Generator[ str, None, None ]:
    """
    Sort the categories & prepare the output lines.

    Returns:
      A generator of `category<TAB>count` strings, ordered by total count in
      descending order & by category in ascending order.
    Args:
      cat_count: `dict` of category as key & total count as value.
    Side effect:
      Updates `time_stat` dict under 'sort+format time'. The sort runs
      immediately; the formatting of each string is deferred until the
      returned generator is iterated, so it is not part of the recorded time.
    """
    begin = default_timer()
    ordered_keys = sort_via_cmp_to_key( cat_count )
    lines = ( f'{name}\t{cat_count[ name ]}' for name in ordered_keys )
    time_stat['sort+format time'] = default_timer() - begin
    return lines
# Takes ~103-115 s.
def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->list[ str ]:
    """
    Sort the categories with an old-style comparison function.

    Returns:
      A `list` of sorted keys; sorted by total count in descending order & by
      category in ascending order.
    Args:
      cat_count: `dict` of category as key & total count as value.
    Side effect:
      Updates `time_stat` with the time taken to sort.
    """
    start = default_timer()
    # Set up old fashioned comparison function.
    def compare_desc_count_asc_cat( a :str, b :str ) ->int:
        # key: category;
        # value: total count.
        # Look each count up once; this function is called for every
        # comparison the sort makes.
        count_a = cat_count[ a ]
        count_b = cat_count[ b ]
        # Descending count order.
        if count_a != count_b:
            return count_b - count_a
        # Ascending category order.
        if a < b:
            return -1
        if a > b:
            return 1
        return 0
    #
    # NOTE(review): a plain key of `( -count, category )` should give the same
    # order without a Python-level comparator; kept as is because this
    # function is specifically the `cmp_to_key` variant.
    comparator = cmp_to_key( compare_desc_count_asc_cat )
    # Iterating a `dict` yields its keys.
    out = sorted( cat_count, key = comparator )
    time_stat['sort_via_cmp_to_key time'] = default_timer() - start
    return out
# Collect file paths; bail out early when none were given.
if not sys.argv[1:]:
    sys.exit( 'Give a file list with data to process' )
data_list = [ PosixPath( arg ) for arg in sys.argv[1:] ]
# Collect & sort. The output lines are still an unconsumed generator here.
process_start_time = default_timer()
processed = process( collect( data_list ) )
process_end_time = default_timer()
# A single joined write is faster than printing one by one.
# Takes ~14-25 s.
print( '\n'.join( processed ) )
output_end_time = default_timer()
# Overall figures, derived from the three time stamps taken above.
time_stat['total processing time'] = process_end_time - process_start_time
time_stat['output time'] = output_end_time - process_end_time
time_stat['total time'] = output_end_time - process_start_time
# Column widths for the report: room for the widest formatted time plus some
# padding, & for the longest label.
decimal_place = 1
round_format = f'.{decimal_place}f'
time_width = ( max( len( format( t, round_format ) ) for t in time_stat.values() )
               + decimal_place + 1
             )
label_width = max( len( name ) for name in time_stat )
# Timing report goes to stderr so that it stays out of the data on stdout.
for label, time_taken in time_stat.items():
    sys.stderr.write( f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n' )
##
##
@@ -98,20 +98,20 @@
"""
start = default_timer()
- formatted = ( f'{sk}\t{sv}'
- for sk, sv in sort_via_cmp_to_key( cat_count )
+ formatted = ( f'{k}\t{cat_count[ k ]}'
+ for k in sort_via_cmp_to_key( cat_count )
)
time_stat['sort+format time'] = default_timer() - start
return formatted
-# Takes ~115-120 s.
-def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->Generator[ tuple[ str, int ], None, None ]:
+# Takes ~103-115 s.
+def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->list[ str ]:
"""
Returns:
- A generator of 2-element `tuple`s of category & total count, sorted by
- total count in descending order & by category in ascending order.
+ A a `list` of sorted keys; sorted by total count in descending order & by
+ category in ascending order.
Args:
cat_count: `dict` of category as key & total count as value.
@@ -137,9 +137,7 @@
#
comparator = cmp_to_key( compare_desc_count_asc_cat )
- out = ( ( k, cat_count[ k ] )
- for k in sorted( cat_count.keys(), key = comparator )
- )
+ out = sorted( cat_count.keys(), key = comparator )
time_stat['sort_via_cmp_to_key time'] = default_timer() - start
return out
@@ -165,6 +163,12 @@
time_stat['output time'] = output_end_time - process_end_time
time_stat['total time'] = output_end_time - process_start_time
#
+decimal_place = 1
+round_format = f'.{decimal_place}f'
+time_width = max( len( f'{t:{round_format}}' ) for t in time_stat.values() ) \
+ + decimal_place +1
+label_width = max( len( label ) for label in time_stat.keys() )
+#
for label, time_taken in time_stat.items():
- sys.stderr.write( f'{label} : {time_taken:0.1f}\n' )
+ sys.stderr.write( f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n' )