Challenge: "Words" In A String

There.

A trie is not only smaller, it's faster than a hash in this situation.

Then I build a tree that differentiates word branches from non-word branches. There are comments, but they're rather cryptic. I included debug output to help understand.

use strict;
use warnings;


use constant DEBUG => 1;
use constant DICT  => "2of4brif.txt";


my $trie;


sub load_dict {
   #
   # Constructs a trie from the dictionary.
   #

   open(my $fh, '<', DICT)
      or die("Unable to open dictionary \"" . DICT . "\": $!\n");

   while (<$fh>) {
      chomp;
      my $p = \$trie;
      for ( split(//, $_), "\0\0" ) {
         $p = \( $$p->{$_} );
      }
   }
}


sub words_from {
   my ($str) = @_;
   my @letters = split(//, $str);
   my @lengths;

   my $p = $trie;
   my $i = 0;
   for my $i ( 0 .. $#letters ) {
      last if !exists( $p->{ $letters[$i] } );
      $p = $p->{ $letters[$i] };
      push @lengths, $i+1 if exists( $p->{ "\0\0" } );
   }

   return @lengths;
}


sub find_substrs {
   my ($str) = @_;

   my @w_substrs;

   {
      #
      # First, construct the following structure from the input:
      #
      #     p e n i s l a n d
      #    -------------------
      #    [p e n]
      #    [p e n i s]
      #          [i s]
      #          [i s l a n d]
      #              [l a n d]
      #                [a n]
      #                [a n d]
      #    -------------------
      #     3     2   4 2
      #     5     6     3
      #

      for my $i ( 0 .. length( $str )-1 ) {
         $w_substrs[$i] = [ words_from( substr( $str, $i ) ) ];
      }
   }

   if (DEBUG) {
      require Data::Dumper;
      Data::Dumper->import(qw( Dumper ));
      no warnings 'once';
      local $Data::Dumper::Terse  = 1;
      local $Data::Dumper::Indent = 0;
      print( 'w_substrs: ', Dumper(\@w_substrs), "\n" );
   }


   my @n_substrs;
   {
      #
      # Then, construct the following structure from the input:
      #
      #    -------------------
      #     p e n i s l a n d
      #    -------------------
      #    [p]                   <-- Delete (Leads to nothing)
      #    [p e]                 <-- Delete (Leads to nothing)
      #    [p e n]               <-- Delete and proceed (Word)
      #      [e]                 <-- Delete (Leads to nothing)
      #      [e n]               <-- Keep   (Leads to "is")
      #      [e n i]             <-- Delete (Leads to nothing)
      #      [e n i s]           <-- Delete and proceed (Word)
      #        [n]               <-- Keep   (Leads to "is")
      #        [n i]             <-- Delete (Leads to nothing)
      #        [n i s]           <-- Delete and proceed (Word)
      #            [s]           <-- Keep   (Leads to "land")
      #            [s l]         <-- Keep   (Leads to "an")
      #            [s l a]       <-- Delete (Leads to nothing)
      #            [s l a n]     <-- Delete and proceed (Word)
      #              [l]         <-- Delete (Leads to nothing)
      #              [l a]       <-- Delete (Leads to nothing)
      #              [l a n]     <-- Delete and proceed (Word)
      #                [a]       <-- Delete (Leads to nothing)
      #                [a n]     <-- Delete and proceed (Word)
      #                  [n]     <-- Delete (Leads to nothing)
      #                  [n d]   <-- Keep   (Leads to end)
      #                    [d]   <-- Keep   (Leads to end)
      #    -------------------
      #       2 1   1 1   2 1
      #             2
      #
      #  The actual implementation differs from above.
      #  While the worse case is O(N^2), the usual
      #  case is far more likely to resemble O(N).
      # 
      
      my $j = @w_substrs;
      for my $i ( reverse 0 .. $#w_substrs ) {
         if ( @{$w_substrs[$i]} && $j-$i >= $w_substrs[$i][0] ) {
            $n_substrs[$i] = [ ];
         } elsif ( $j == @w_substrs ) {
            $n_substrs[$i] = [ $j-$i ];
         } else {
            $n_substrs[$i] = [ map { $_+($j-$i) } 0, @{ $n_substrs[$j]
+ } ];
         }
         
         if ( @{$w_substrs[$i]} ) {
            $j = $i;
         }
      }
   }

   if (DEBUG) {
      require Data::Dumper;
      Data::Dumper->import(qw( Dumper ));
      no warnings 'once';
      local $Data::Dumper::Terse  = 1;
      local $Data::Dumper::Indent = 0;
      print( 'n_substrs: ', Dumper(\@n_substrs), "\n" );
   }

   return [ \@w_substrs, \@n_substrs ];
}


sub list_substrs {
   my ($str, $substrs) = @_;

   my ($w_substrs, $n_substrs) = @$substrs;

   local *w_helper = sub {
      my ($i) = @_;

      my @results;
      for my $l ( @{ $w_substrs->[$i] } ) {
         my $substr = substr( $str, $i, $l );
         if ($i + $l == @$w_substrs) {
            push @results, [ $substr ];
         } else {
            push @results, map [ $substr, @$_ ], n_helper( $i + $l );
         }
      }

      return @results;
   };

   local *n_helper = sub {
      my ($i) = @_;

      my @results = w_helper( $i );
      for my $l ( @{ $n_substrs->[$i] } ) {
         my $substr = "[" . substr( $str, $i, $l ) . "]";
         if ($i + $l == @$n_substrs) {
            push @results, [ $substr ];
         } else {
            push @results, map [ $substr, @$_ ], w_helper( $i + $l );
         }
      }

      return @results;
   };

   return map join( ' ', @{$_->[0]} ),
          sort {  $a->[1] <=> $b->[1]
               || $a->[2] <=> $b->[2] }
          map [ $_, scalar(grep /^\[/, @$_), scalar(@$_) ],
          n_helper(0);
}


{
   load_dict();
   for my $input (qw( penisland zatxtaz xapenx )) {
      print( "$input\n" );
      print( ( "-" x length($input) ), "\n" );
      my $substrs = find_substrs( $input );
      for ( list_substrs( $input, $substrs ) ) {
         print( "$_\n" );
      }
      print( "\n" );
   }
}
[download]

penisland
---------
w_substrs: [[3,5],[],[],[2,6],[],[4],[2,3],[],[]]
n_substrs: [[],[2],[1],[],[1,2],[1],[],[2],[1]]
pen island
penis land
pen is land
penis [l] and
pen is [l] and
penis [l] an [d]
pen is [l] an [d]

zatxtaz
-------
w_substrs: [[],[2],[],[],[2],[],[]]
n_substrs: [[1],[],[2],[1],[],[2],[1]]
[z] at [x] ta [z]

xapenx
------
w_substrs: [[],[3],[3],[],[],[]]
n_substrs: [[1,2],[1],[],[3],[2],[1]]
[x] ape [nx]
[xa] pen [x]
[download]

[reply]
[d/l]
[select]

A trie ! Beautiful :-)

For repeated use it's obvious to consider preprocessing the dictionary into the trie, and reading that each time.

Of interest is how beautifully this trie will compress, by folding the matching sub-tries. Code to illustrate this given below. On my machine the result is:

  Loading dictionary '2of4brif.txt'... done -- 1.30 Secs
  Writing trie '2of4brif.trie'... 22599/138676 done -- 1.35 Secs
  Reading trie '2of4brif.trie'... done -- 0.25 Secs
  Walking the tries... done -- 1.94 Secs
  Original trie is: 27.1M, new trie is: 5.4M
[download]

(The memory footprint is also reduced, but that's not hugely significant.)

  use strict;
  use warnings;
  
  use constant DICT  => "2of4brif.txt";
  use constant TRIE  => "2of4brif.trie" ;
  
  sub get_cpu {
    my $t = (times)[0] ;
    if (@_) { $t = sprintf('%4.2f Secs', $t - $_[0]) ; } ;
    return $t ;
  } ;
  
  my $took ;
  
  print STDERR "Loading dictionary '", DICT, "'" ;
  $took = get_cpu() ;
  my $trie  = load_dict(DICT) ;
  print STDERR "... done -- ", get_cpu($took), "\n" ;
  
  print STDERR "Writing trie '", TRIE, "'" ;
  $took = get_cpu() ;
  my ($now, $was) = write_trie(TRIE, $trie) ;
  print STDERR "... done -- ", get_cpu($took), "\n" ;
  
  print STDERR "Reading trie '", TRIE, "'" ;
  $took = get_cpu() ;
  my $check = read_trie(TRIE) ;
  print STDERR "... done -- ", get_cpu($took), "\n" ;
  
  print STDERR "Walking the tries" ;
  $took = get_cpu() ;
  walk('', $trie, $check) ;
  print STDERR "... done -- ", get_cpu($took), "\n" ;

  use Devel::Size qw(total_size) ;

  printf STDERR "Original trie is: %3.1fM, new trie is: %3.1fM\n",
                      total_size( $trie)/(1024 * 1024),
                      total_size($check)/(1024 * 1024) ;

  #===================================================================
+======================
  # Loading the Dictionary into the Trie.
  
  sub load_dict {
    my ($d_name) = @_ ;
  
    # Constructs a trie from the dictionary.
  
    open(my $fh, '<', $d_name)
           or die("Unable to open dictionary '$d_name': $!\n");
  
    my $trie = undef ;
  
    while (<$fh>) {
      s/\s+$// ;  # chomp;
      my $p = \$trie;
      for ( split(//, $_), '!' ) {
        $p = \( $$p->{$_} ) ;
      }
    }
  
    return $trie ;
  }
  
  #===================================================================
+======================
  # Writing the Trie.
  
  my %node_map ;
  my $idx ;
  
  my $node_count ;
  
  sub write_trie {
    my ($t_name, $trie) = @_ ;
  
    open(my $fh, '>', $t_name)
           or die("Unable to create trie '$t_name': $!\n");
  
    $node_count = 0 ;
  
    $idx = 1 ;        # Index 0 reserved for what '!' points at !
    %node_map = ('!' => "$idx") ; # Preset end of word node
  
    write_node($fh, $trie) ;
  
    close $fh ;
  
    return ($idx, $node_count) ;
  } ;
  
  sub write_node {
    my ($fh, $node) = @_ ;
  
    $node_count++ ;
  
    my @chs = sort keys %$node ;  # NB '!' sorts to the front !
  
    my @n = $chs[0] eq '!' ? (shift @chs) : () ;
  
    foreach my $ch (@chs) {
      my $p = write_node($fh, $node->{$ch}) ;
      push @n, $ch.$p ;
    } ;
  
    my $n = join(' ', @n) ;
    my $p ;
    unless (defined($p = $node_map{$n})) {
      $p = $node_map{$n} = sprintf('%X', ++$idx) ;
            # Assign new index and record
      print $fh $n, "\n" ;    # Output new node
    } ;
  
    return $p ;
  } ;
  
  #===================================================================
+======================
  # Reading the Trie.
  
  my @nodes ;
  
  sub read_trie {
    my ($t_name) = @_ ;
  
    open(my $fh, '<', $t_name)
           or die("Unable to open trie '$t_name': $!\n");
  
    @nodes = (undef, {'!' => undef}) ;
  
    while (<$fh>) {
      push @nodes, { map { my ($c, $p) = unpack('AA*', $_) ;
                     ($c, $nodes[hex($p)]) } split } ; # Note hex('') 
+= 0
    } ;
  
    close $fh ;
  
    my $trie = pop @nodes ;
    @nodes = () ;
  
    return $trie ;
  } ;
  
  #===================================================================
+======================
  # Walk two tries to check they are identical

  sub walk {
    my ($w, $ra, $rb) = @_ ;
  
    my @ac = sort keys %$ra ;
    my @bc = sort keys %$rb ;
  
    if (@ac != @bc) { die "node length mismatch \@ '$w': (@ac) vs (@bc
+)" ; } ;
    for my $i (0..$#ac) {
      if ($ac[$i] ne $bc[$i]) { die "node mismatch \@ '$w': (@ac) vs (
+@bc)" ; } ;
    } ;
  
    foreach my $ch (@ac) {
      my $ad = $ra->{$ch} ;
      my $bd = $rb->{$ch} ;
      if ($ch eq '!') {
        if (defined($ad)) { die "'!' with defined down ($ad) in 'a' \@
+ '$w'" ; } ;
        if (defined($bd)) { die "'!' with defined down ($bd) in 'b' \@
+ '$w'" ; } ;
      }
      else {
        walk($w.$ch, $ad, $bd) ;
      } ;
    } ;
  } ;
[download]

[reply]
[d/l]
[select]

ikegami

here

Another idea I had but passed on was a two pass approach. The first pass would mark where each word in the dictionary overlapped the target word. The second pass would use a heuristic approach to a variation of the bin packing problem. I abandoned it because it seemed unnecessary given the expected length of input strings would usually be less than 30 characters and contain 2 to 3 words from the dictionary.

Cheers - L~R

[reply]

penisland
---------
pen island
penis land
pen is land
penis l and
pen is l and
penis l an d
pen is l an d

zatxtaz
-------
z at x ta z

xapenx
------
xa pen x
x ape nx
[download]

I get all but "xa pen x" at the moment — it prints "x a pen x" — and I haven't attempted sorting yet.

[reply]
[d/l]

sub wordify($string) {
    my $results = [];
    $min_end = length($string)
    for $pos (0 .. $min_end) {
        for $len (1 .. length($string)) {
            $word = substr($string, $pos, $len)
            if exists($dictionary{$word}) {
                $min_end = $pos+$len if $pos+$len < $min_end
                $prefix = substr($string, 0, $pos) . " " . $word # nee
+ds special treatment for $pos = 0
                $endings = wordify(substr($string, $pos+1))
                push(@$results, map("$prefix $_", @$endings))
            }
        }
    }
    return [$string] unless @$results 
    return $results
}
[download]

dynamic programming

[reply]
[d/l]

ikegami

#!/usr/bin/perl

use warnings;
use strict;

my %dictionary;
open(my $dict_file, '<', '2of12inf.txt') or die ("Can't open dictionar
+y: $!");
while (<$dict_file>) {
        $_ = lc($_);
        tr/a-z//dc;
        $dictionary{$_} = 1;
}

my @partials;
while (<DATA>) {
        $_ = lc($_);
        tr/a-z//dc;
        @partials = (); $partials[0] = [''];
        my $results = wordify($_);
        print "$_:\n";
        print "$_\n" for @$results;
}

sub wordify {
        my ($string) = @_;
        my $results = [];
        my $min_end = length($string);
        for (my $pos = 0; $pos < $min_end; $pos++) {
                for my $len (1 .. length($string) - $pos) {
                        my $word = substr($string, $pos, $len);
                        if (exists $dictionary{$word}) {
                                $min_end = $pos+$len if $pos+$len < $m
+in_end;
                                my $prefix = '';
                                $prefix = '[' . substr($string, 0, $po
+s) . '] ' unless $pos == 0;
                                $prefix .= $word;
                                my $rest = substr($string, $pos + $len
+);
                                wordify($rest) unless (defined $partia
+ls[length($rest)]);
                                my $endings = $partials[length($rest)]
+;
                                push(@$results, map("$prefix $_", @$en
+dings));
                        }
                }
        }
        $results = ["[$string]"] unless @$results;
        $partials[length($string)] = $results;
        return $results;
}

__DATA__
penisland
zatxtaz
xapenx
[download]

penisland:
pen is la [nd]
pen is land 
pen is [l] an [d]
pen is [l] and 
pen island 
penis la [nd]
penis land 
penis [l] an [d]
penis [l] and 
[p] en is la [nd]
[p] en is land 
[p] en is [l] an [d]
[p] en is [l] and 
[p] en island 
zatxtaz:
[z] at [xtaz]
xapenx:
[x] ape [nx]
[xa] pen [x]
[xap] en [x]
[download]

2of4brif

penisland:
pen is land 
pen is [l] an [d]
pen is [l] and 
pen island 
penis land 
penis [l] an [d]
penis [l] and 
zatxtaz:
[z] at [x] ta [z]
xapenx:
[x] ape [nx]
[xa] pen [x]
[download]

[reply]
[d/l]
[select]

pjotrik

Cheers - L~R

[reply]

Challenge: "Words" In A String

The Dictionary:

The Input:

The Output:

Example:


We don't bite newbies here... much
	PerlMonks