#!/usr/bin/perl #Author: John Pate #Written for a project that explored Probabilistic Context Free Grammar #contextualization possibilites (per Johnson 1998). # #counts the expansions of a given node in every Penn-Treebank style #file of a given directory, sub-categorizing nodes according to #local left-sister or mother context depending on the flag # #sister context is default # #Performs clever string handling rather than actual tree traversals #to save time and space. #identifies flags #true extraCounts precedes each LHS with the number of RHSs and #total instances #true relevant separates each LHS section with a blank line if($ARGV[0] =~/^-[a-z]+/) { $extraCounts = 1 if $ARGV[0] =~ /c/; $relevant = 1 if $ARGV[0] =~ /r/; $mother=1 if $ARGV[0]=~/m/; shift @ARGV; } if($#ARGV<2) { print STDERR "\nUsage: ./CPCFG_count.pl [-crm] node collapsings inputdir [countmin]\n\n"; print STDERR "Options:\n"; print STDERR "\tc: print extra counts\n"; print STDERR "\tr: print LHS separating newline\n"; print STDERR "\tm: subcategorise according to mother rather than local left-sister\n\n"; exit; } my ($node, $basicColl, $inputDir, $countmin) = @ARGV; @aunties=(); @mothers=(); $depth=-1; #Read in file containing nodes to treat the same #for example, could treat all verb forms as just verb #Format: rawCategory:preProcessedCategory open(COLL,"$basicColl") or die ("Error: $!"); while($line=) { chomp $line; @coll = split /:/,$line; $basicColl{$coll[0]}=$coll[1]; } close(COLL); opendir(INPUTCORP,"$inputDir") or die ("Error: $!"); @inputCorpora=readdir(INPUTCORP); closedir(INPUTCORP); foreach $file (@inputCorpora) { #skip system files if($file=~/^\./) {next;} $path="$inputDir/$file"; open(CORP,"$path") or die ("Error: $!"); $char = getc CORP; while(defined $char) { #holdover from debugging #should never actually happen #uncomment if you have problems #if($depth<-1) #{ # print STDERR "|$depth|$curNode|$path|\n"; # exit; #} #left parenthesis signals 1) new node coming and #2) one constituent deeper if($char eq '(') { $depth++; #Any information about these daughters has already been recorded elsewhere delete $daughters[$depth+1]; #new node coming $curNode=''; #read new node (ignore grammatical function labels, which are preceded by '-') $char = getc CORP; while($char =~ /[\w0-9]/) { $curNode .= $char; $char = getc CORP; } #if this node is to be collapsed, collapse it if(defined $basicColl{$curNode}) { $trueCurNode=$basicColl{$curNode} }else{ $trueCurNode=$curNode; } #add to the expansion of the node just read in (ignoring punctuation) push @{$daughters[$depth]}, $trueCurNode if $depth>-1 && $trueCurNode =~ /\w/; #remember the current dominating node as the left sister of the node to be read in next $aunties[$depth]=$mothers[$depth]; $mothers[$depth]=$trueCurNode; $curPos=tell CORP; $curPos--; seek(CORP, $curPos, 0); }elsif($char eq ')'){ #any information about these has already been recorded elsewhere $aunties[$depth+1]=''; $mothers[$depth+1]=''; #form the components of the rule $rhs=join(', ',@{$daughters[$depth+1]}); if($mother) { $lhs="$mothers[$depth]"."_$mothers[$depth-1]"; }else{ $lhs="$mothers[$depth]"."_$aunties[$depth]"; } #increment the count for the rule seen $rules{$lhs}{$rhs}++ if $rhs =~/\w/ && $mothers[$depth] eq $node; $depth--; } $char = getc CORP; } close(CORP); } #print out actual counts foreach $lhs (keys %rules) { $presentation=''; $instances=0; $rules=0; foreach $rhs (keys %{$rules{$lhs}}) { #print STDERR "\t$rhs\n"; $instances+=$rules{$lhs}{$rhs}; $rules++; $presentation.= "$rules{$lhs}{$rhs}: $lhs --> $rhs\n"; } $presentation="$instances\n$rules\n".$presentation if ($extraCounts && $instances >= $countmin); $presentation.="\n" if ($relevant && $instances >= $countmin); print $presentation if $instances >= $countmin; } if($depth==-1) { print STDERR "\nBrackets match!\n\n"; }else{ print STDERR "\nWARNING: Brackets do NOT match!\n\n"; } print STDERR "\a";