llil start
get_properties : 44 secs
sort + output  : 429 secs
total          : 473 secs

##</code><code>##

start
get properties: 60 secs
sort + output: 63 secs
total: 123 secs

##</code><code>##

# Manually rounded off after posting.
collect time : 37.6 s
sort_via_cmp_to_key time : 115.1 s
sort+format time : 115.1 s
total processing time : 152.7 s
output time : 22.0 s
total time : 174.7 s

##</code><code>##

#!/usr/local/bin/python3.10

# This is one Python implementation based on the problem specification ...
#
#   Rosetta Code: Long List is Long, 20221130,
#   by eyepopslikeamosquito
#   https://perlmonks.org/?node_id=11148465
#
# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name, a TAB character and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
#   camel\t42
#   pearl\t94
#   dromedary\t69
# and tt2.txt containing:
#   camel\t8
#   hello\t12345
#   dromedary\t1
# returns this hashref:
#   $hash_ret{"camel"}     = 50
#   $hash_ret{"dromedary"} = 70
#   $hash_ret{"hello"}     = 12345
#   $hash_ret{"pearl"}     = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
#   hello\t12345
#   pearl\t94
#   dromedary\t70
#   camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------


import sys
from functools import cmp_to_key
from pathlib import PosixPath
from timeit import default_timer
from typing import Generator


# Timing registry: label -> elapsed seconds; filled in by the functions below &
# reported on standard error at the end of the run.
time_stat :dict[ str, float ] = {}


# Takes ~35-46 s.
def collect( data_list :list ) ->dict[ str, int ]:
   """
   Sum up the counts per category over all the given LLiL-format files.

   Every line is split at the first TAB into category & count. A line without a
   TAB, or with a count that `int()` cannot parse, raises `ValueError`.

   Returns:
      A `dict` with category as key & total count as value.

   Args:
      data_list: list of file paths as `pathlib.PosixPath` objects.

   Side effect:
      Updates `time_stat` with the time taken to collect the data.
   """
   start = default_timer()

   cat_count :dict[ str, int ] = {}

   delimiter = '\t'
   open_prop = { 'mode' : 'rt',
                  'encoding' : 'ascii',
                  'newline' : '\n'
               }
   for path in data_list:
      with path.open( **open_prop ) as fh:
         for line in fh:
            # `number` still carries the line terminator; `int()` ignores the
            # surrounding whitespace.
            category, number = line.split( delimiter, 1 )
            # One lookup with a default, instead of a membership test on a fresh
            # `keys()` view followed by two more lookups for every line.
            cat_count[ category ] = cat_count.get( category, 0 ) + int( number )

   time_stat['collect time'] = default_timer() - start
   return cat_count


# Creating the "Generator" of formatted strings takes so little time that it gets
# lost in the decimal places; the time recorded here is contributed almost
# entirely by the "sort_via_cmp_to_key" function.
def process( cat_count :dict[ str, int ] ) ->Generator[ str, None, None ]:
   """
   Sort the categories & prepare the output lines.

   The keys are sorted right away via `sort_via_cmp_to_key`; each line is
   formatted only when the returned generator is consumed.

   Returns:
      A generator of "category TAB total count" strings, ordered by total count
      (descending), then by category (ascending).

   Args:
      cat_count: `dict` of category as key & total count as value.

   Side effect:
      Updates `time_stat` dict with the time taken to sort & to create the
      generator.
   """
   start = default_timer()

   ordered = sort_via_cmp_to_key( cat_count )
   lines = ( f'{cat}\t{cat_count[ cat ]}' for cat in ordered )

   time_stat['sort+format time'] = default_timer() - start
   return lines


# Takes ~103-115 s.
def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->list[ str ]:
   """
   Order the categories with a two-argument comparison function adapted through
   `functools.cmp_to_key`.

   Returns:
      A `list` of the keys of `cat_count`; highest total count first, keys with
      the same total count in ascending category order.

   Args:
      cat_count: `dict` of category as key & total count as value.

   Side effect:
      Updates `time_stat` with the time taken to sort.
   """
   start = default_timer()

   def by_count_desc_then_cat( left :str, right :str ):
      left_total = cat_count[ left ]
      right_total = cat_count[ right ]
      # Different totals: the result is positive when `right` has the larger
      # total, which places `right` first (descending count order).
      if left_total != right_total:
         return right_total - left_total
      # Same total: -1, 0 or 1 per ascending category order.
      return ( left > right ) - ( left < right )

   ordered = list( cat_count )
   ordered.sort( key = cmp_to_key( by_count_desc_then_cat ) )

   time_stat['sort_via_cmp_to_key time'] = default_timer() - start
   return ordered


# Collect file paths; bail out when none is given on the command line.
if not sys.argv[1:]:
   sys.exit( 'Give a file list with data to process' )
data_list = [ PosixPath( arg ) for arg in sys.argv[1:] ]


# Collect & sort; `process` returns a generator, so no line is formatted yet.
process_start_time = default_timer()
processed = process( collect( data_list ) )
process_end_time = default_timer()

# Faster than printing one by one.
# Takes ~14-25 s.
# The lines are formatted here, while the generator is being joined; that time
# is therefore part of 'output time'.
print( '\n'.join( processed ) )
output_end_time = default_timer()

time_stat['total processing time'] = process_end_time - process_start_time
time_stat['output time'] = output_end_time - process_end_time
time_stat['total time'] = output_end_time - process_start_time
#
# Report the timings on standard error as aligned columns.
decimal_place = 1
round_format = f'.{decimal_place}f'
# The length of a formatted value already includes the decimal point & the
# decimals, so the longest one is exactly the field width needed to right-align
# all the values; nothing more needs to be added.
time_width = max( len( f'{t:{round_format}}' ) for t in time_stat.values() )
label_width = max( len( label ) for label in time_stat )
#
for label, time_taken in time_stat.items():
   sys.stderr.write( f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n' )


##</code><code>##

@@ -98,20 +98,20 @@
    """
    start = default_timer()
 
-   formatted = ( f'{sk}\t{sv}'
-                  for sk, sv in sort_via_cmp_to_key( cat_count )
+   formatted = ( f'{k}\t{cat_count[ k ]}'
+                  for k in sort_via_cmp_to_key( cat_count )
                )
 
    time_stat['sort+format time'] = default_timer() - start
    return formatted
 
 
-# Takes ~115-120 s.
-def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->Generator[ tuple[ str, int ], None, None ]:
+# Takes ~103-115 s.
+def sort_via_cmp_to_key( cat_count :dict[ str, int ] ) ->list[ str ]:
    """
    Returns:
-      A generator of 2-element `tuple`s of category & total count, sorted by
-      total count in descending order & by category in ascending order.
+      A a `list` of sorted keys; sorted by total count in descending order & by
+      category in ascending order.
 
    Args:
       cat_count: `dict` of category as key & total count as value.
@@ -137,9 +137,7 @@
    #
    comparator = cmp_to_key( compare_desc_count_asc_cat )
 
-   out = ( ( k, cat_count[ k ] )
-            for k in sorted( cat_count.keys(), key = comparator )
-         )
+   out = sorted( cat_count.keys(), key = comparator )
 
    time_stat['sort_via_cmp_to_key time'] = default_timer() - start
    return out
@@ -165,6 +163,12 @@
 time_stat['output time'] = output_end_time - process_end_time
 time_stat['total time'] = output_end_time - process_start_time
 #
+decimal_place = 1
+round_format = f'.{decimal_place}f'
+time_width = max( len( f'{t:{round_format}}' ) for t in time_stat.values() ) \
+            + decimal_place +1
+label_width = max( len( label ) for label in time_stat.keys() )
+#
 for label, time_taken in time_stat.items():
-   sys.stderr.write( f'{label} : {time_taken:0.1f}\n' )
+   sys.stderr.write( f'{label:{label_width}}: {time_taken:{time_width}{round_format}} s\n' )