#Last Updated 05.09.09

##</code><code>##

    if ( $file[$i] =~ m/(\\(gll|[abcdef(exg.)]g\.)|textsc\/)/ ) {

##</code><code>##

... Note: The script and the TeX file have to be in the same directory. ...

##</code><code>##

N-=non- (e.g. NSG nonsingular, NPST nonpast)

##</code><code>##

#!/usr/bin/perl

=head1 NAME

name-of-script

=head1 SYNOPSIS

 name-of-script [-l] filename.tex

=head1 DESCRIPTION

This script reads a given LaTeX file, finds everything in the text
that looks like an abbreviation, and then creates a new file in the
same directory (called "filename-abbrev.txt") that lists them all.

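In this process, only the line directly following a line that contains
one of the glossing commands '\gll', '\ag.' to '\fg.', '\exg' or
'\exg.' is searched. On that line, an abbreviation in LaTeX is
defined as:

  - a run of capital letters set off by '-', '=', '.', ':' or whitespace

  - one of the number markers SG, DU or PL

  - one of the person digits 1, 2 or 3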

If your LaTeX file uses abbreviations that are specified in the
'Leipzig Glossing Rules' (LGR), you can use the '-l' option to have
these abbreviations listed with the full terms that they represent.
In this case, the output file will list the non-LGR abbreviations
first, and then the LGR ones are given with their meanings.

=cut

use strict;
use Getopt::Long;

# Build the lookup table of Leipzig Glossing Rules abbreviations from the
# "ABBR=full term" records stored after __DATA__.
my %lgr;
while ( my $record = <DATA> ) {
    chomp $record;

    # A record needs a non-empty abbreviation before its first '='; blank
    # or malformed lines are skipped instead of being stored under ''.
    next unless $record =~ /\A[^=]+=/;

    # Split on the first '=' only, so that a term may itself contain '='.
    my ( $abbr, $term ) = split /=/, $record, 2;
    $lgr{$abbr} = $term;
}

# Command line: an optional -l switch followed by exactly one existing
# regular file. Anything else prints the usage text and exits.
my $opt_lgr;
my $Usage = "$0 [-l] filename.tex\n  (run 'perldoc $0' for help)\n";

my $opt_ok = GetOptions( 'l' => \$opt_lgr );
my $arg_ok = 0;
if ( @ARGV == 1 ) {
    $arg_ok = 1 if -f $ARGV[0];
}
if ( !$opt_ok or !$arg_ok ) {
    die $Usage;
}

my $filename = shift @ARGV;

# Read the whole TeX source into @texlines, one chomped line per element.
# ':encoding(UTF-8)' is used rather than bare ':utf8' because the latter
# accepts malformed byte sequences without checking them.
open( my $tex_fh, "<:encoding(UTF-8)", $filename )
    or die "$0: $filename: $!\n";
my @texlines = <$tex_fh>;
close $tex_fh;
chomp @texlines;

# Count every abbreviation found on a gloss line. A gloss line is the line
# directly after one that carries a glossing command: \gll, \ag. to \fg.,
# \exg or \exg.
my %abbr_seen;

my $gloss_cmd = qr/\\(?:gll|[abcdef]g\.|exg\.?)/;

# One capture group for all three kinds of token, so that $1 is always the
# matched abbreviation. The delimiters around a capital-letter run are
# checked with lookarounds rather than consumed: two abbreviations may share
# a single delimiter ("NOM.PL"), and the start or end of the line counts as
# a delimiter too.
my $abbr_token = qr/
    (
        (?: \A | (?<= [-=\s.:] ) ) [A-Z]+ (?= [-=\s.:] | \z )
      | SG | DU | PL
      | [123]
    )
/x;

# The last line has no successor, so it cannot introduce a gloss line.
for my $ln ( 0 .. $#texlines - 1 ) {
    next unless $texlines[$ln] =~ $gloss_cmd;

    my $gloss = $texlines[ $ln + 1 ];
    while ( $gloss =~ /$abbr_token/g ) {
        $abbr_seen{$1}++;
    }
}

# Write the report next to the input file as "<name>-abbrev.txt".
# Only a ".tex..." suffix in the last path component is stripped, so a
# directory whose name happens to contain ".tex" is left intact.
$filename =~ s{\.tex[^/\\]*\z}{};
$filename .= '-abbrev.txt';
open( my $abbr_fh, ">:utf8", $filename ) or die "$0: $filename: $!\n";

# An empty key is not a printable abbreviation, so it is left out.
my @found = grep { length } sort keys %abbr_seen;

# First section: one abbreviation per line. With -l, the abbreviations
# known to the LGR table are held back for the second section.
for my $abbr (@found) {
    next if ( $opt_lgr and exists $lgr{$abbr} );
    print {$abbr_fh} "$abbr\n";
}

# Second section (-l only): LGR abbreviations with their full terms, one
# \item per line.
if ($opt_lgr) {
    print {$abbr_fh} "\n";
    for my $abbr (@found) {
        next unless exists $lgr{$abbr};
        print {$abbr_fh} "\\item[$abbr] '$lgr{$abbr}'\n";
    }
}

# Buffered write errors only surface when the handle is closed.
close $abbr_fh or die "$0: $filename: $!\n";

__DATA__
1=first person
2=second person
3=third person
A=agent-like argument of canonical transitive verb
ABL=ablative
ABS=absolutive
ACC=accusative
ADJ=adjective
ADV=adverb(ial)
AGR=agreement
ALL=allative
ANTIP=antipassive
APPL=applicative
ART=article
AUX=auxiliary
BEN=benefactive
CAUS=causative
CLF=classifier
COM=comitative
COMP=complementizer
COMPL=completive
COND=conditional
COP=copula
CVB=converb
DAT=dative
DECL=declarative
DEF=definite
DEM=demonstrative
DET=determiner
DIST=distal
DISTR=distributive
DU=dual
DUR=durative
ERG=ergative
EXCL=exclusive
F=feminine
FOC=focus
FUT=future
GEN=genitive
IMP=imperative
INCL=inclusive
IND=indicative
INDF=indefinite
INF=infinitive
INS=instrumental
INTR=intransitive
IPFV=imperfective
IRR=irrealis
LOC=locative
M=masculine
N=neuter
N-=non- (e.g. NSG nonsingular, NPST nonpast)
NEG=negation, negative
NMLZ=nominalizer/nominalization
NOM=nominative
OBJ=object
OBL=oblique
P=patient-like argument of canonical transitive verb
PASS=passive
PFV=perfective
PL=plural
POSS=possessive
PRED=predicative
PRF=perfect
PRS=present
PROG=progressive
PROH=prohibitive
PROX=proximal/proximate
PST=past
PTCP=participle
PURP=purposive
Q=question particle/marker
QUOT=quotative
RECP=reciprocal
REFL=reflexive
REL=relative
RES=resultative
S=single argument of canonical intransitive verb
SBJ=subject
SBJV=subjunctive
SG=singular
TOP=topic
TR=transitive
VOC=vocative