#!/usr/bin/perl
#Author: John Pate
#Written for a project that explored Probabilistic Context Free Gramma
+r
#contextualization possibilites (per Johnson 1998).
#
#counts the expansions of a given node in every Penn-Treebank style
#file of a given directory, sub-categorizing nodes according to
#local left-sister or mother context depending on the flag
#
#sister context is default
#
#Performs clever string handling rather than actual tree traversals
#to save time and space.
#identifies flags
#true extraCounts precedes each LHS with the number of RHSs and
#total instances
#true relevant separates each LHS section with a blank line
if($ARGV[0] =~/^-[a-z]+/)
{
$extraCounts = 1 if $ARGV[0] =~ /c/;
$relevant = 1 if $ARGV[0] =~ /r/;
$mother=1 if $ARGV[0]=~/m/;
shift @ARGV;
}
if($#ARGV<2)
{
print STDERR "\nUsage: ./CPCFG_count.pl [-crm] node collapsings in
+putdir [countmin]\n\n";
print STDERR "Options:\n";
print STDERR "\tc: print extra counts\n";
print STDERR "\tr: print LHS separating newline\n";
print STDERR "\tm: subcategorise according to mother rather than l
+ocal left-sister\n\n";
exit;
}
my ($node, $basicColl, $inputDir, $countmin) = @ARGV;
@aunties=();
@mothers=();
$depth=-1;
#Read in file containing nodes to treat the same
#for example, could treat all verb forms as just verb
#Format: rawCategory:preProcessedCategory
open(COLL,"$basicColl") or die ("Error: $!");
while($line=<COLL>)
{
chomp $line;
@coll = split /:/,$line;
$basicColl{$coll[0]}=$coll[1];
}
close(COLL);
opendir(INPUTCORP,"$inputDir") or die ("Error: $!");
@inputCorpora=readdir(INPUTCORP);
closedir(INPUTCORP);
foreach $file (@inputCorpora)
{
#skip system files
if($file=~/^\./)
{next;}
$path="$inputDir/$file";
open(CORP,"$path") or die ("Error: $!");
$char = getc CORP;
while(defined $char)
{
#holdover from debugging
#should never actually happen
#uncomment if you have problems
#if($depth<-1)
#{
# print STDERR "|$depth|$curNode|$path|\n";
# exit;
#}
#left parenthesis signals 1) new node coming and
#2) one constituent deeper
if($char eq '(')
{
$depth++;
#Any information about these daughters has already been re
+corded elsewhere
delete $daughters[$depth+1];
#new node coming
$curNode='';
#read new node (ignore grammatical function labels, which
+are preceded by '-')
$char = getc CORP;
while($char =~ /[\w0-9]/)
{
$curNode .= $char;
$char = getc CORP;
}
#if this node is to be collapsed, collapse it
if(defined $basicColl{$curNode})
{
$trueCurNode=$basicColl{$curNode}
}else{
$trueCurNode=$curNode;
}
#add to the expansion of the node just read in (ignoring p
+unctuation)
push @{$daughters[$depth]}, $trueCurNode if $depth>-1 && $
+trueCurNode =~ /\w/;
#remember the current dominating node as the left sister o
+f the node to be read in next
$aunties[$depth]=$mothers[$depth];
$mothers[$depth]=$trueCurNode;
$curPos=tell CORP;
$curPos--;
seek(CORP, $curPos, 0);
}elsif($char eq ')'){
#any information about these has already been recorded els
+ewhere
$aunties[$depth+1]='';
$mothers[$depth+1]='';
#form the components of the rule
$rhs=join(', ',@{$daughters[$depth+1]});
if($mother)
{
$lhs="$mothers[$depth]"."_$mothers[$depth-1]";
}else{
$lhs="$mothers[$depth]"."_$aunties[$depth]";
}
#increment the count for the rule seen
$rules{$lhs}{$rhs}++ if $rhs =~/\w/ && $mothers[$depth] eq
+ $node;
$depth--;
}
$char = getc CORP;
}
close(CORP);
}
#print out actual counts
foreach $lhs (keys %rules)
{
$presentation='';
$instances=0;
$rules=0;
foreach $rhs (keys %{$rules{$lhs}})
{
#print STDERR "\t$rhs\n";
$instances+=$rules{$lhs}{$rhs};
$rules++;
$presentation.= "$rules{$lhs}{$rhs}: $lhs --> $rhs\n";
}
$presentation="$instances\n$rules\n".$presentation if ($extraCount
+s && $instances >= $countmin);
$presentation.="\n" if ($relevant && $instances >= $countmin);
print $presentation if $instances >= $countmin;
}
if($depth==-1)
{
print STDERR "\nBrackets match!\n\n";
}else{
print STDERR "\nWARNING: Brackets do NOT match!\n\n";
}
print STDERR "\a";
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.
|