#!/usr/local/bin/perl -w
######################################################################
+########################
# ngram.plx
# v 0.2
# by Mike Phenicie (TheEnigma on PerlMonks - www.perlmonks.org)
# 09-18-04
######################################################################
+########################
# Background: An analysis can be made of text samples, in which the
+frequency of the various
# ngrams is calulated. An ngram is my term for character sequences o
+f length n. I have seen
# 2 character sequences refered to as both digrams and digraphs; and
+3 character sequences
# refered to as trigrams and trigraphs. So to refer in a generic way
+ to sequences of any
# length, I chose ngram.
#
# This program scans through a text file, and looks at every sequence
+ of length n-1, noting
# what character comes after it. It will create a hash of hashes tha
+t contains the number
# of occurances of a particular character following a particular n-1
+length sequence. In
# other words, if there are 10 places in the text where 'te' is follo
+wed by 'r', then
# $freq{'te'}->{'r'} == 10.
#
# This frequency table can be used to create a new text that has simi
+larities to the original
# text file. The new file will contain text with a very similar rati
+o of ngrams. For instance,
# if the original text had a lot of occasions where the letters 'ne'
+were followed by
# an 's' (with n=3), but not many occasions where it was followed by
+a 'w', then that should
# be true in the new text as well.
#
# With low numbers of n (three or four, for instance), the only simil
+arity might be that it
# seems to be in the same language as the input. With higher values
+for n, the output will
# start to resemble the input more and more. With a high enough valu
+e it's possible you might
# get the exact same text for output as was input.
######################################################################
+########################
# Switches
#
# -i <input text file> file to analyze
# -o <output text file> computer generated tex
+t
# -d <depth (2-10)> default 3
# -v <verbosity> (1 to 10k) default 100 (how long
+the output is)
# -s <frequency file> save frequency hash to
+ a file
# -l <frequency file> use existing frequency
+ file for input
# (can't specify -i or -
+s with this)
# -t <"starting text"> default is random by s
+cript
# -a <human readable freq file> a human readable frequ
+ency file
# -e <frequency file> examine - ignores all
+other switches
# -D (no display on monitor) if switch present, no
+display on monitor
# -A (append) append to output file
#
# Usage: You must specify an -i or an -l (but not both) to tell the
+program what to use as input
# You may specify an -s, but if you do, you can't use -l
# You may specify an -o, and if you optionally also use -A, t
+he file will be appended
# The generated text will by default go to the monitor, turn
+this off with -D
# You may specify -t to tell it what text you want to start w
+ith
# You may specify -a to store the frequency data in a human r
+eadable file
# You may specify an -e (which will ignore all other switches
+) to examine a freq file
# If you don't specify -d, a default of 3 will be used
# If you don't specify -v, a default of 100 will be used
######################################################################
+#########################
# to do
# add help
# make line breaks occur on spaces
# try mixing two or more frequency files to create one output
# prevent it from printing bad start text to the output file
#
# and...?
######################################################################
+####
use strict;
use Getopt::Std;
use Storable qw(nstore);
$| = 1;
srand;
my $name = "ngram.plx v 0.2";
my $wrap = 80;
my(%freq, $ngram, $next_char, $freq_file, $freq_depth, $freq_name, $fr
+eq, @freq);
# Get command line options.
my($input, $output, $depth, $verbosity, $save, $load, $start_text, $as
+cii, $examine, $no_display, $append) = get_options();
if($examine){
$freq = Storable::retrieve("$examine") or die "Can't load frequency
+analysis data from $examine: $!";
($freq_name, $freq_file, $freq_depth, %freq) = @$freq;
print "\n\nFrequency file $examine created by $freq_name\n";
print "Based on text from $freq_file\n";
print "Depth = $freq_depth\n\n";
exit;
}
##################################################################
# Put frequency data in %freq, either by analyzing an input file,
# or reading a previously saved frequency file.
##################################################################
if($input){
print "\n\nOutput based on the file $input. Depth = $depth.\n\n";
%freq = create_freq_hash($input, $depth);
} elsif($load){
eval{$freq = Storable::retrieve("$load") or die "Can't load frequenc
+y analysis data from $load: $!";};
if($@){print "ERROR: $load does not appear to be a valid frequency f
+ile\n"; exit;}
($freq_name, $freq_file, $freq_depth, %freq) = @$freq;
$depth = $freq_depth;
if($freq_name !~ /ngram\.plx/){
print "\n\nERROR: $load was not created by this script\n";
exit;
}
print "\n\nOutput based on the file $freq_file. Depth = $freq_depth
+.\n\n";
}
##############################################
# If -s, save %freq to a file.
##############################################
if($save){
@freq = ($name, $input, $depth, %freq);
nstore(\@freq, "$save") or die "Can't store frequency analysis data
+to $save: $!";
}
########################################################
# If -a, store human readable version of frequency file
########################################################
if($ascii){
my($key, $value, $k, $v);
open (DBG, ">$ascii");
while(($key, $value) = each %freq){
print DBG "\n\n#$key#\n";
while(($k, $v) = each %$value){
print DBG "#$k $v# ";
}
}
close(DBG);
}
##################################
# Create and print the start text
##################################
$start_text = create_start_text() unless $start_text;
$ngram = substr($start_text, 1-$depth, $depth-1);
open(OFH, ">$output") or die "Can't open $output for writing: $!" if $
+output && ! $append;
open(OFH, ">>$output") or die "Can't open $output for appending: $!" i
+f $output && $append;
print "\n\n$start_text" unless $no_display;
print OFH "$start_text" if $output;
############################
# Let's create some text!
############################
for my $i (length($start_text)+1 .. $verbosity-1){
$next_char = get_next_char($ngram);
print "$next_char" unless $no_display;
print OFH "$next_char" if $output;
print OFH "\n" if($output && !($i % $wrap));
$ngram = substr($ngram, 1) . $next_char;
}
print "\n\n" unless $no_display;
close(OFH);
######################################################################
+######################
# SUBROUTINES
+ #
######################################################################
+######################
######################################################################
+###########
# Get next character
#
# Given a $depth-1 length character string, and the frequency hash (%
+freq), this
# routine will pick the next character in the output. It finds the v
+alue stored
# in %freq that is keyed by $ngram. This value is another hash that
+has as its
# keys all the possible letters that followed $ngram in the original
+text. The
# value of each of these keys is the number of times that letter foll
+owed $ngram
# in the original text. One of these letters is picked at random, wi
+th a higher
# probability of picking a letter that appeared more often in the ori
+ginal text.
#
# There are several ways to do this (TMTOWTDI). This routine does it
+as follows:
# Let's say the ngram is 'ceed'; and that 'ceed' is followed by 'e' 2
+ times, 's'
# 1 time, and 'i' 5 times. An array with 8 elements (2+1+5) will be
+created.
# 2 elements will contain 'e', 1 element will contain 's', and 5 elem
+ents will
# contain 'i'. An element is picked at random to be returned. Thus,
+ letters
# that were more likely to follow the ngram in the original text will
+ be more
# likely to follow in the created text.
######################################################################
+###########
sub get_next_char {
my($ngram) = @_;
my $ptr = 0;
my(@ary, $sub_hash);
if(defined $freq{$ngram}){
$sub_hash = $freq{$ngram};
} else {
print "\n\nERROR: The text you entered does not end with characte
+rs\n";
print "that are in the frequency hash.\n\n";
exit;
}
while(my($key, $value) = each %$sub_hash){
for($ptr..$value+$ptr-1){
$ary[$_] = $key;
}
$ptr += $value;
}
return $ary[rand($ptr)];
}
######################################################################
+###########
# Create start text
#
# This routine will first attempt to create an array, @keys, consisti
+ng of all
# the first level keys of %freq that start with an uppercase letter f
+ollowed
# by a lowercase letter. This is so the output text will have a 'pro
+per' start.
# If this array is empty, meaning there are no capitalized words in t
+he input
# text, it will repopulate @keys with all the keys of %freq. In eith
+er case,
# it will select one of the keys at random, to be used as the startin
+g text
# for the output.
######################################################################
+###########
sub create_start_text {
my(@keys);
for(keys %freq){
push(@keys, $_) if /^[A-Z][a-z]/;
}
@keys = keys %freq unless @keys;
return $keys[rand(@keys)];
}
######################################################################
+###########
# Create frequency hash
+
#
# This routine opens an input file, and puts the entire contents into
+ $text,
# converting instances of more than one whitespace character in a row
+ into one
# whitespace character. It creates a hash with keys consisting of al
+l the
# ngrams (of length $depth-1) in the input text. The value will be a
+nother hash
# with keys consisting of all the possible characters that follow tha
+t ngram in
# the text. The value of each of those keys will be the number of ti
+mes that
# particular combination of ngram and following letter occur in the t
+ext.
######################################################################
+###########
sub create_freq_hash {
my($input, $depth) = @_;
my(@input, %freq, @text, $text, $ngram, $ptr, $next_char);
# Make one long string, collapsing all multiple whitespace into one
+space
open (IFH, "$input") or die "Can't open $input for reading: $!";
@text = <IFH>;
$text = join('', @text);
$text =~ s/\s+/ /g;
close (IFH);
$ngram = substr($text,0,$depth-1);
$ptr = $depth-1;
for(0..length($text)-$depth){
$next_char = substr($text,$ptr,1);
$freq{"$ngram"}->{"$next_char"}++;
$ngram = substr($ngram,1) . $next_char;
$ptr++;
}
return %freq;
}
######################################################################
+###########
# Get command line switches
#
# -i: Specifies the input file to run frequency analysis on. Mutually
+ exclusive
# with -l, so if -l is also specified, it will quit with an error
+ message.
#
# -o: Specifies an output file to write results to. If switch is not
+ present
# results will not be written to a file. If -A is specified, res
+ults will
# be appended to whatever the output file is.
#
# -d: Specifies the depth of analysis. In other words, if depth is '
+3', the
# program will look at the last two letters output, and based on
+the
# frequency analysis for that digram, pick a letter to follow.
#
# -v: Specifies the length of the output (verbosity).
#
# -s: Specifies a file in which to store the frequency analysis. Als
+o stored
# in the file will be the name and version of this script, the na
+me of the
# input file the analysis was based on, and the depth of the anal
+ysis.
# Mutually exclusive with -l, so if -l is also specified, it will
+ quit with
# an error message.
#
# -l: Specifies a frequency file to load and use. Mutually exclusive
+ with -i
# and -s, so if either -i or -s is specified, it will quit with a
+n error message.
#
# -t: Specifies starting text for the output. The script will start
+with the
# last d-1 characters of the text (where d is the depth), and do
+its thing
# from there. If -t is not specified, the script will start with
+ a random
# d-1 character sequence from the frequency hash. If you specify
+ your own
# text and it contains spaces, then the text must be enclosed in
+"".
#
# -a: Specifies the name of a file in which to store a human readable
+ version
# of the frequency analysis. This works just like the -s switch,
+ except
# the frequency data is human readable. An excerpt of a file fol
+lows:
#
# #ki#
# #l 1# #n 20#
#
# #tr#
# #y 3# #e 22# #u 17# #a 21# #i 18# #o 5#
#
# This would mean that:
#
# sequence is followed by this many times in the original te
+xt
# ki l 1
# ki n 20
# tr y 3
# tr e 22
# tr u 17
# tr a 21
# tr i 18
# tr o 5
# (for $64,000, Name That Text! ;)
#
# -e: Specifies the name of a frequency file to examine. It will cau
+se
# the program to print out the following embedded information fro
+m the
# frequency file: the name and version of this script, the name o
+f the input
# file the frequency analysis is based on, and the depth used in
+the
# analysis. If this switch is present, all other switches are ig
+nored.
#
# -D: If specified, will not display output on monitor.
#
# -A: Output file will be appended to, not overwritten.
######################################################################
+###########
sub get_options {
my(%opts, $input, $output, $depth, $verbosity, $save, $load, $start_
+text, $ascii, $examine, $no_display, $append);
getopts('i:o:d:v:s:l:t:a:e:DA', \%opts);
#### check for mutually exclusive cases and lack of both -i and -l
if(defined $opts{i} && defined $opts{l}){
print "You may not specify both the -i and -l switches\n";
exit
}
if(! defined $opts{i} && ! defined $opts{l}){
print "You have to specify one of the -i or -l switches\n";
exit
}
if(defined $opts{s} && defined $opts{l}){
print "You may not specify both the -s and -l switches\n";
exit;
}
########################### check for switches ###################
if(defined $opts{e}){
$examine = $opts{e};
if(! -e $examine){
print "Frequency file $examine does not exist.\n";
exit;
}
return ($input, $output, $depth, $verbosity, $save, $load, $start_
+text, $ascii, $examine, $no_display, $append);
}
$no_display = $opts{D} || 0;
$append = $opts{A} || 0;
if(defined $opts{i}){
$input = $opts{i};
do {print "Input file $input does not exist\n"; exit} unless -e $i
+nput;
do {print "Input file $input does not appear to be a text file\n";
+ exit} unless -T "$input";
}
if(defined $opts{l}){
$load = $opts{l};
if(! -e $load){
print "Frequency file $load does not exist.\n";
exit;
}
}
if(defined $opts{s}){
$save = "$opts{s}";
if(-e $save){
print "$save already exists. Overwrite? n\b";
my $answer = <STDIN>;
chomp($answer);
$answer = lc($answer);
exit unless $answer eq "y";
}
}
if(defined $opts{o}){
$output = $opts{o};
if(-e $output && ! $append){
print "$output already exists. Overwrite? n\b";
my $answer = <STDIN>;
chomp($answer);
$answer = lc($answer);
exit unless $answer eq "y";
}
}
if(defined $opts{d}){
$depth = $opts{d};
if($depth < 2 || $depth > 10){
print "Depth must be from 2 to 10, inclusive\n";
exit;
}
} else {
$depth = 3;
}
if(defined $opts{v}){
$verbosity = $opts{v};
if($verbosity < 10 || $verbosity > 10000){
print "Length must be from 10 to 10,000 characters, inclusive\n"
+;
exit;
}
} else {
$verbosity = 100;
}
if(defined $opts{t}){
$start_text = $opts{t};
if(length($start_text) < $depth){
print "\n\nERROR: The length of the text you supplied with the -
+t option must be\n";
print "at least $depth characters long, because that is what you
+ set depth to.\n\n";
exit;
}
}
if(defined $opts{a}){
$ascii = "$opts{a}";
if(-e $ascii){
print "$ascii already exists. Overwrite? n\b";
my $answer = <STDIN>;
chomp($answer);
$answer = lc($answer);
exit unless $answer eq "y";
}
}
return ($input, $output, $depth, $verbosity, $save, $load, $start_te
+xt, $ascii, $examine, $no_display, $append);
}
In reply to ngram
by TheEnigma
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.