#!/usr/bin/perl # -*- CPerl -*- use strict; use warnings; # Goals: # * Find a set of shared substrings for a set of input strings such that: # ** Each substring is at least 3 characters long. # ** Minimize total substring length, counting each substring as extra 2. use constant MIN_SUBSTRING_LEN => 3; use constant PER_SUBSTRING_OVERHEAD => 2; # sample input: my @list=("set abcde-efghi 12345", "set abcde-ijkl 12345", "clr abcde-efghi+123", "clr abcde-ijkl 12345"); # sample output: my @expected_substrings=("set","clr"," abcde-","efghi", "ijkl"," 12345","+123"); # cost of a solution set sub cost (@) { my $cost = PER_SUBSTRING_OVERHEAD * scalar @_; $cost += length shift while @_; return $cost } # algorithm: # attempt to split common prefixes and suffixes into separate substrings; # terminate when this is no longer possible my @substrings = @list; my $made_progress = 1; my $last_output = ''; # find common prefixes # returns [ , ... ]... sub partition (@) { my @strings = sort @_; my @bins = (); my $prefix = $strings[0]; for (my $i = 0; $i < @strings; $i++) { next if $prefix eq substr($strings[$i], 0, length $prefix); my $new_prefix = $prefix; $new_prefix = substr $new_prefix, 0, -1 while length $new_prefix and $new_prefix ne substr($strings[$i], 0, length $new_prefix); if (length $new_prefix < MIN_SUBSTRING_LEN and @strings) { push @bins, [$prefix, map {substr $_, length $prefix} splice @strings, 0, $i]; $i = 0; $prefix = $strings[0]; } else { $prefix = $new_prefix; } } push @bins, [$prefix, map {substr $_, length $prefix} splice @strings] if @strings; return @bins } while ($made_progress) { # find prefixes my %new_substrings = (); my @bins = partition @substrings; $new_substrings{$_}++ for map {@$_} @bins; @substrings = sort keys %new_substrings; # repeat for suffixes %new_substrings = (); @bins = partition map scalar reverse, @substrings; $new_substrings{$_}++ for map {@$_} @bins; @substrings = grep length, sort map scalar reverse, keys %new_substrings; $made_progress = ($last_output ne join(':', @substrings)); $last_output = join(':', @substrings); } print "results: (cost ",cost(@substrings),")\n"; print $_, "\n" for @substrings;