# ----------------------------------------------------------------------
# Sample timing output, as posted. Kept verbatim as comments so that the
# file parses as Python; sections are separated by "####" as in the post.
# ----------------------------------------------------------------------
# llil start
# get_properties : 44 secs
# sort + output : 429 secs
# total : 473 secs
####
# start
# get properties: 60 secs
# sort + output: 63 secs
# total: 123 secs
####
# Manually rounded off after posting.
# collect time : 37.6 s
# sort_via_cmp_to_key time : 115.1 s
# sort+format time : 115.1 s
# total processing time : 152.7 s
# output time : 22.0 s
# total time : 174.7 s
####
#!/usr/local/bin/python3.10

# This is one Python implementation based on the problem specification ...
#
#   Rosetta Code: Long List is Long, 20221130,
#   by eyepopslikeamosquito
#   https://perlmonks.org/?node_id=11148465
#
# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name, a TAB character and a
# non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
#   camel\t42
#   pearl\t94
#   dromedary\t69
# and tt2.txt containing:
#   camel\t8
#   hello\t12345
#   dromedary\t1
# returns this hashref:
#   $hash_ret{"camel"} = 50
#   $hash_ret{"dromedary"} = 70
#   $hash_ret{"hello"} = 12345
#   $hash_ret{"pearl"} = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
#   hello\t12345
#   pearl\t94
#   dromedary\t70
#   camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------

import sys
from functools import cmp_to_key
from pathlib import Path
from timeit import default_timer
from typing import Generator

# Timing statistics, keyed by a human-readable label. Insertion order is the
# order in which the report is printed at the end of the run.
time_stat: dict[str, float] = {}


# Author's measurement: takes ~35-46 s.
def collect(data_list: list[Path]) -> dict[str, int]:
    """
    Returns:
        A `dict` with category as key & total count as value.

    Args:
        data_list: list of file paths as `pathlib.Path` objects.

    Side effect:
        Updates `time_stat` with the time taken to collect the data.
    """
    start = default_timer()

    cat_count: dict[str, int] = {}
    delimiter = '\t'
    open_prop = {'mode': 'rt', 'encoding': 'ascii', 'newline': '\n'}
    for path in data_list:
        with path.open(**open_prop) as fh:
            for line in fh:
                category, number = line.split(delimiter, 1)
                # int() tolerates the trailing newline left on `number`.
                cat_count[category] = cat_count.get(category, 0) + int(number)

    time_stat['collect time'] = default_timer() - start
    return cat_count


def process(cat_count: dict[str, int]) -> Generator[str, None, None]:
    """
    Returns:
        A generator of strings composed of category & total count, sorted by
        total count in descending order & by category in ascending order.

    Args:
        cat_count: `dict` of category as key & total count as value.

    Side effect:
        Updates `time_stat` dict with the 'sort+format time' entry.

    Note:
        The sort runs immediately, because the outermost iterable of a
        generator expression is evaluated when the expression is created.
        The per-line formatting is deferred until the generator is consumed,
        so its cost is NOT included in 'sort+format time'; it is paid
        wherever the caller iterates the result.
    """
    start = default_timer()
    formatted = (f'{k}\t{cat_count[k]}'
                 for k in sort_via_cmp_to_key(cat_count))
    time_stat['sort+format time'] = default_timer() - start
    return formatted


# Author's measurement: takes ~103-115 s.
def sort_via_cmp_to_key(cat_count: dict[str, int]) -> list[str]:
    """
    Returns:
        A `list` of sorted keys; sorted by total count in descending order &
        by category in ascending order.

    Args:
        cat_count: `dict` of category as key & total count as value.

    Side effect:
        Updates `time_stat` with the time taken to sort.
    """
    start = default_timer()

    # Old-fashioned comparison function; a & b are categories (dict keys).
    def compare_desc_count_asc_cat(a: str, b: str) -> int:
        count_a = cat_count[a]
        count_b = cat_count[b]
        if count_a != count_b:
            # Descending count order.
            return count_b - count_a
        # Equal counts: ascending category order.
        if a < b:
            return -1
        if a > b:
            return 1
        return 0

    comparator = cmp_to_key(compare_desc_count_asc_cat)
    out = sorted(cat_count, key=comparator)

    time_stat['sort_via_cmp_to_key time'] = default_timer() - start
    return out


def main() -> None:
    """
    Reads the LLiL files named on the command line, prints the merged &
    sorted result to standard output, and writes a timing report to
    standard error.

    Exits with an error message if no file is given.
    """
    # Collect file paths.
    if not sys.argv[1:]:
        sys.exit('Give a file list with data to process')
    data_list = [Path(p) for p in sys.argv[1:]]

    process_start_time = default_timer()
    processed = process(collect(data_list))
    process_end_time = default_timer()

    # Faster than printing one by one.
    # Author's measurement: takes ~14-25 s.
    print('\n'.join(processed))
    output_end_time = default_timer()

    time_stat['total processing time'] = process_end_time - process_start_time
    time_stat['output time'] = output_end_time - process_end_time
    time_stat['total time'] = output_end_time - process_start_time

    decimal_place = 1
    round_format = f'.{decimal_place}f'
    # Width of the widest formatted value plus extra padding; kept exactly as
    # in the posted version so that the report layout does not change.
    time_width = (max(len(f'{t:{round_format}}') for t in time_stat.values())
                  + decimal_place + 1)
    label_width = max(len(label) for label in time_stat)

    for label, time_taken in time_stat.items():
        sys.stderr.write(
            f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n'
        )


if __name__ == '__main__':
    main()

####
# Historical diff between the previous revision and the originally posted
# revision of the script above (kept for reference; blank context lines are
# reconstructed).
#
# @@ -98,20 +98,20 @@
#      """
#
#      start = default_timer()
# -    formatted = ( f'{sk}\t{sv}'
# -                  for sk, sv in sort_via_cmp_to_key( cat_count )
# +    formatted = ( f'{k}\t{cat_count[ k ]}'
# +                  for k in sort_via_cmp_to_key( cat_count )
#                  )
#      time_stat['sort+format time'] = default_timer() - start
#      return formatted
#
#
# -# Takes ~115-120 s.
# -def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->Generator[ tuple[ str, int ], None, None ]:
# +# Takes ~103-115 s.
# +def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->list[ str ]:
#      """
#      Returns:
# -        A generator of 2-element `tuple`s of category & total count, sorted by
# -        total count in descending order & by category in ascending order.
# +        A a `list` of sorted keys; sorted by total count in descending order & by
# +        category in ascending order.
#
#      Args:
#          cat_count: `dict` of category as key & total count as value.
# @@ -137,9 +137,7 @@
#      #
#      comparator = cmp_to_key( compare_desc_count_asc_cat )
#
# -    out = ( ( k, cat_count[ k ] )
# -            for k in sorted( cat_count.keys(), key = comparator )
# -          )
# +    out = sorted( cat_count.keys(), key = comparator )
#      time_stat['sort_via_cmp_to_key time'] = default_timer() - start
#      return out
#
# @@ -165,6 +163,12 @@
#  time_stat['output time'] = output_end_time - process_end_time
#  time_stat['total time'] = output_end_time - process_start_time
#  #
# +decimal_place = 1
# +round_format = f'.{decimal_place}f'
# +time_width = max( len( f'{t:{round_format}}' ) for t in time_stat.values() ) \
# +             + decimal_place +1
# +label_width = max( len( label ) for label in time_stat.keys() )
# +#
#  for label, time_taken in time_stat.items():
# -    sys.stderr.write( f'{label} : {time_taken:0.1f}\n' )
# +    sys.stderr.write( f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n' )