my $text = Text::Plagiarized->new;
$text->original($original_text);
foreach my $comparison (@comparison_texts) {
# the following two lines should probably be collapsed to
# $text->compare($comparison);
$text->comparison($comparison);
$text->analyze;
print $text->percent, $/; # percent of matching sentences
if ($text->percent > $some_threshold) {
# arrayref of array refs with [$sentence, $possible_match]
print Dumper($text->matches);
}
}
####
package Text::Plagiarized;
$REVISION = '$Id: Plagiarized.pm,v 1.0 2003/07/13 19:15:57 ovid Exp $';
$VERSION = '0.01';
use 5.006;
use strict;
use warnings;
use String::Approx qw/amatch/;
use Text::Sentence qw/split_sentences/;
sub new {
my $class = shift;
my $self = bless {
original => {},
comparison => {},
matches => [],
threshold => 80,
} => $class;
}
sub original {
my ($self, $text) = @_;
local $_ = $text;
$self->{original} = {
text => $text,
sentences => [split_sentences($text)],
};
return $self;
}
sub comparison {
my ($self, $text) = @_;
local $_ = $text;
$self->{comparison} = {
text => $text,
sentences => [split_sentences($text)],
};
return $self;
}
my %percentage = map { $_ => 1 } 0 .. 100; # wow. This is a cheap hack
sub threshold {
my $self = shift;
if (@_) {
my $num = shift;
unless (exists $percentage{$num}) {
require Carp;
Carp::croak("threshold must be an integer between 0 and 100, inclusive");
}
$self->{threshold} = 100 - $num;
}
$self->{threshold};
}
sub analyze {
my $self = shift;
my @sentences;
my $threshold = $self->threshold;
foreach my $sentence1 (@{$self->{original}{sentences}}) {
foreach my $sentence2 (@{$self->{comparison}{sentences}}) {
my ($hash1, $hash2) = _hash($sentence1, $sentence2);
if ($hash1 eq $hash2 || amatch($hash1, ["$threshold%"], $hash2)) {
push @sentences => [$sentence1 => $sentence2];
last;
}
}
}
$self->{matches} = \@sentences;
}
sub matches { shift->{matches} }
sub percent {
my $self = shift;
my $precision = shift || 0;
my $matches = @{$self->matches};
my $sentences = @{$self->{original}{sentences}};
sprintf "%.${precision}f" => ($matches/$sentences) * 100;
}
# starts to break down if we have more than 26 different words
# use Unicode characters?
# stop words?
# memoize this
sub _hash {
my @string = map lc $_ => @_;
s/[^[:alnum:][:space:]]//g foreach @string;
s/[[:space:]]+/ /g foreach @string;
my %words;
my $letter = 'a';
s/(\S+)/
unless (exists $words{$1}) {
$words{$1} = $letter;
$letter++;
}
$words{$1}
/eg foreach @string;
s/ //g foreach @string;
return @string;
}
1;
##
##
Macbeth is presented as a mature man of definitely established character.
Macbeth is shown as an empowered man of well- established character.