print compare( 'Hello', 'hello' ), $/; # 1 print compare( 'Hello', 'HELLO WORLD' ), $/; # 0.5 print compare( 'The quick brown fox jumped over the lazy dogs.', 'The quick brown dogs jumped over the lazy fox.' ), $/; # 1 print compare( 'The quick brown fox jumped over the lazy dogs.', 'The quick brown dogs jumped over the lazy kangaroo.' ); # 0.888 sub compare { my ( $str1, $str2 ) = @_; my $tok_str1 = tokenize($str1); my $tok_str2 = tokenize($str2); # swap unless @$tok_str1 contains the most tokens ($tok_str1, $tok_str2) = ($tok_str2, $tok_str1) if @$tok_str2 > @$tok_str1; # make a lookup hash for the smaller numer of tokens in str2 my %h; @h{@$tok_str2} = (); # slice syntax if fastest # now scan str1 for these tokens and count my $found = 0; for my $tok ( @$tok_str1 ) { $found++ if exists $h{$tok}; } my $similarity = $found/@$tok_str1; return $similarity; } sub tokenize { my ($str) = @_; # remove punctuation stuff $str =~ s/[^A-Za-z0-9 ]+//g; # lowercase $str = lc $str; # magic whitespace split and return array ref return [split ' ', $str]; }