http://qs321.pair.com?node_id=618629
Category: Text Processing
Author/Contact Info raptur jpate@ling.ohio-state.edu
Description: Reads in a directory of corpus files in the format used by the Penn Treebank:

((NP
(ADJP (ADV just) (ADJ another))
(NN perl)(NN hacker)))

finds all the grammar rules demonstrated in the corpus files for a given node, contextualized relative to the local left-sister (or mother if -m is passed). Moreover, it counts how many times each rule is observed, and produces a weighted grammar for the given node only. Designed for exploring the sort of information contained in local context in natural language, and whether there are meaningful clusters of context that improve the accuracy of natural language parsing. For earlier work in this area, see:
Johnson, Mark (1998). The effect of alternative tree representations on tree bank grammars. In D.M.W. Powers (ed.) New Methods in Language Processing and Computational Natural Language Learning, ACL, pp. 39-48.

This is a revised title and description for the earlier post entitled CPCFG_count.pl in response to others' suggestions. This post is otherwise unchanged.

#!/usr/bin/perl
#Author: John Pate
#Written for a project that explored Probabilistic Context Free Gramma
+r
#contextualization possibilites (per Johnson 1998).
#
#counts the expansions of a given node in every Penn-Treebank style 
#file of a given directory, sub-categorizing nodes according to 
#local left-sister or mother context depending on the flag
#
#sister context is default
#
#Performs clever string handling rather than actual tree traversals
#to save time and space.


#identifies flags
#true extraCounts precedes each LHS with the number of RHSs and
#total instances
#true relevant separates each LHS section with a blank line
if($ARGV[0] =~/^-[a-z]+/)
{
    $extraCounts = 1 if $ARGV[0] =~ /c/;
    $relevant = 1 if $ARGV[0] =~ /r/;
    $mother=1 if $ARGV[0]=~/m/;
    shift @ARGV;
}

if($#ARGV<2)
{
    print STDERR "\nUsage: ./CPCFG_count.pl [-crm] node collapsings in
+putdir [countmin]\n\n";
    print STDERR "Options:\n";
    print STDERR "\tc: print extra counts\n";
    print STDERR "\tr: print LHS separating newline\n";
    print STDERR "\tm: subcategorise according to mother rather than l
+ocal left-sister\n\n";
    exit;
}

my ($node, $basicColl, $inputDir, $countmin) = @ARGV;


@aunties=();
@mothers=();
$depth=-1;

#Read in file containing nodes to treat the same
#for example, could treat all verb forms as just verb
#Format: rawCategory:preProcessedCategory
open(COLL,"$basicColl") or die ("Error: $!");
while($line=<COLL>)
{
    chomp $line;
    @coll = split /:/,$line;
    $basicColl{$coll[0]}=$coll[1];
}
close(COLL);

opendir(INPUTCORP,"$inputDir") or die ("Error: $!");
@inputCorpora=readdir(INPUTCORP);
closedir(INPUTCORP);



foreach $file (@inputCorpora)
{
    #skip system files
    if($file=~/^\./)
    {next;}
    
    $path="$inputDir/$file";
    
    open(CORP,"$path") or die ("Error: $!");
    
    $char = getc CORP;
    
    while(defined $char)
    {
        #holdover from debugging
        #should never actually happen
        #uncomment if you have problems
        #if($depth<-1)
        #{
        #    print STDERR "|$depth|$curNode|$path|\n";
        #    exit;
        #}
        
        #left parenthesis signals 1) new node coming and
        #2) one constituent deeper
        if($char eq '(')
        {
            $depth++;
            
            #Any information about these daughters has already been re
+corded elsewhere
            delete $daughters[$depth+1];
            #new node coming
            $curNode='';
        
        
            #read new node (ignore grammatical function labels, which 
+are preceded by '-')
            $char = getc CORP;
            while($char =~ /[\w0-9]/)
            {
                $curNode .= $char;
                $char = getc CORP;
            }
            
            #if this node is to be collapsed, collapse it
            if(defined $basicColl{$curNode})
            {
                $trueCurNode=$basicColl{$curNode}
            }else{
                $trueCurNode=$curNode;
            }
            
            #add to the expansion of the node just read in (ignoring p
+unctuation)
            push @{$daughters[$depth]}, $trueCurNode if $depth>-1 && $
+trueCurNode =~ /\w/;
            
            #remember the current dominating node as the left sister o
+f the node to be read in next
            $aunties[$depth]=$mothers[$depth];
            $mothers[$depth]=$trueCurNode;
            
            $curPos=tell CORP;
            $curPos--;
            seek(CORP, $curPos, 0);
        }elsif($char eq ')'){
            
            #any information about these has already been recorded els
+ewhere
            $aunties[$depth+1]='';
            $mothers[$depth+1]='';
            
            #form the components of the rule
            $rhs=join(', ',@{$daughters[$depth+1]});

            if($mother)
            {
                $lhs="$mothers[$depth]"."_$mothers[$depth-1]";
            }else{
                $lhs="$mothers[$depth]"."_$aunties[$depth]";
            }

            #increment the count for the rule seen
            $rules{$lhs}{$rhs}++ if $rhs =~/\w/ && $mothers[$depth] eq
+ $node;
            $depth--;
            
        }
        $char = getc CORP;
    }
    close(CORP);
}


#print out actual counts
foreach $lhs (keys %rules)
{
    $presentation='';
    $instances=0;
    $rules=0;
    foreach $rhs (keys %{$rules{$lhs}})
    {
        #print STDERR "\t$rhs\n";
        $instances+=$rules{$lhs}{$rhs};
        $rules++;
        $presentation.= "$rules{$lhs}{$rhs}: $lhs --> $rhs\n";
    }
    
    $presentation="$instances\n$rules\n".$presentation if ($extraCount
+s && $instances >= $countmin);
    $presentation.="\n" if ($relevant && $instances >= $countmin);
    print $presentation if $instances >= $countmin;
}


if($depth==-1)
{
    print STDERR "\nBrackets match!\n\n";
}else{
    print STDERR "\nWARNING: Brackets do NOT match!\n\n";
}

print STDERR "\a";