comment on

#!/usr/bin/perl
#Author: John Pate
#Written for a project that explored Probabilistic Context Free Gramma
+r
#contextualization possibilites (per Johnson 1998).
#
#counts the expansions of a given node in every Penn-Treebank style 
#file of a given directory, sub-categorizing nodes according to 
#local left-sister or mother context depending on the flag
#
#sister context is default
#
#Performs clever string handling rather than actual tree traversals
#to save time and space.


#identifies flags
#true extraCounts precedes each LHS with the number of RHSs and
#total instances
#true relevant separates each LHS section with a blank line
if($ARGV[0] =~/^-[a-z]+/)
{
    $extraCounts = 1 if $ARGV[0] =~ /c/;
    $relevant = 1 if $ARGV[0] =~ /r/;
    $mother=1 if $ARGV[0]=~/m/;
    shift @ARGV;
}

if($#ARGV<2)
{
    print STDERR "\nUsage: ./CPCFG_count.pl [-crm] node collapsings in
+putdir [countmin]\n\n";
    print STDERR "Options:\n";
    print STDERR "\tc: print extra counts\n";
    print STDERR "\tr: print LHS separating newline\n";
    print STDERR "\tm: subcategorise according to mother rather than l
+ocal left-sister\n\n";
    exit;
}

my ($node, $basicColl, $inputDir, $countmin) = @ARGV;


@aunties=();
@mothers=();
$depth=-1;

#Read in file containing nodes to treat the same
#for example, could treat all verb forms as just verb
#Format: rawCategory:preProcessedCategory
open(COLL,"$basicColl") or die ("Error: $!");
while($line=<COLL>)
{
    chomp $line;
    @coll = split /:/,$line;
    $basicColl{$coll[0]}=$coll[1];
}
close(COLL);

opendir(INPUTCORP,"$inputDir") or die ("Error: $!");
@inputCorpora=readdir(INPUTCORP);
closedir(INPUTCORP);



foreach $file (@inputCorpora)
{
    #skip system files
    if($file=~/^\./)
    {next;}
    
    $path="$inputDir/$file";
    
    open(CORP,"$path") or die ("Error: $!");
    
    $char = getc CORP;
    
    while(defined $char)
    {
        #holdover from debugging
        #should never actually happen
        #uncomment if you have problems
        #if($depth<-1)
        #{
        #    print STDERR "|$depth|$curNode|$path|\n";
        #    exit;
        #}
        
        #left parenthesis signals 1) new node coming and
        #2) one constituent deeper
        if($char eq '(')
        {
            $depth++;
            
            #Any information about these daughters has already been re
+corded elsewhere
            delete $daughters[$depth+1];
            #new node coming
            $curNode='';
        
        
            #read new node (ignore grammatical function labels, which 
+are preceded by '-')
            $char = getc CORP;
            while($char =~ /[\w0-9]/)
            {
                $curNode .= $char;
                $char = getc CORP;
            }
            
            #if this node is to be collapsed, collapse it
            if(defined $basicColl{$curNode})
            {
                $trueCurNode=$basicColl{$curNode}
            }else{
                $trueCurNode=$curNode;
            }
            
            #add to the expansion of the node just read in (ignoring p
+unctuation)
            push @{$daughters[$depth]}, $trueCurNode if $depth>-1 && $
+trueCurNode =~ /\w/;
            
            #remember the current dominating node as the left sister o
+f the node to be read in next
            $aunties[$depth]=$mothers[$depth];
            $mothers[$depth]=$trueCurNode;
            
            $curPos=tell CORP;
            $curPos--;
            seek(CORP, $curPos, 0);
        }elsif($char eq ')'){
            
            #any information about these has already been recorded els
+ewhere
            $aunties[$depth+1]='';
            $mothers[$depth+1]='';
            
            #form the components of the rule
            $rhs=join(', ',@{$daughters[$depth+1]});

            if($mother)
            {
                $lhs="$mothers[$depth]"."_$mothers[$depth-1]";
            }else{
                $lhs="$mothers[$depth]"."_$aunties[$depth]";
            }

            #increment the count for the rule seen
            $rules{$lhs}{$rhs}++ if $rhs =~/\w/ && $mothers[$depth] eq
+ $node;
            $depth--;
            
        }
        $char = getc CORP;
    }
    close(CORP);
}


#print out actual counts
foreach $lhs (keys %rules)
{
    $presentation='';
    $instances=0;
    $rules=0;
    foreach $rhs (keys %{$rules{$lhs}})
    {
        #print STDERR "\t$rhs\n";
        $instances+=$rules{$lhs}{$rhs};
        $rules++;
        $presentation.= "$rules{$lhs}{$rhs}: $lhs --> $rhs\n";
    }
    
    $presentation="$instances\n$rules\n".$presentation if ($extraCount
+s && $instances >= $countmin);
    $presentation.="\n" if ($relevant && $instances >= $countmin);
    print $presentation if $instances >= $countmin;
}


if($depth==-1)
{
    print STDERR "\nBrackets match!\n\n";
}else{
    print STDERR "\nWARNING: Brackets do NOT match!\n\n";
}

print STDERR "\a";
[download]

In reply to Contextualized weighted grammar generator by raptur

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


There's more than one way to do things
	PerlMonks