http://qs321.pair.com?node_id=610410
Category: text processing
Author/Contact Info Chris Monahan aka Maze ForeverWatcher@googlemail.com
Description: this guesses the semantic structure from a text document, stripping the line endings and guessing where the paragraph breaks and headers should be. Good for processing Gutenburg 'plain vanilla ASCII' version 3 of txt2docbook, modularised ready for expansion

#!/usr/bin/perl
#--------------
#txtattack.pl
#
#this is a script which takes text input by default and outputs DocBoo
+k XML by default having guessed at the semantic structure of the text
+.
#At the moment it's arranged in such a way that allows for expansion, 
+including the development of a module based on this as a template
use strict;
use warnings;
use vars qw($articlename $headertest $nextline $lnapply $writestart $w
+ritetitle $writeelement $writeheader $writeend $lineallowance $inform
+at $outformat $val $marker $line $isheader $string $paranumber $artic
+lename);
$informat = "text";
$outformat = "DocBook";
$lineallowance = 0;

#here should go the code for overriding the defaults
#
#but in the meantime i'll happily setup blind defaults and do the modu
+larity bit later, as i've seperated it all cool like
#good for testing ;-)

if($informat eq "text"){
    $articlename = sub{
        my $val = <SOURCE>;
        chomp($val);
        return $val;
    };
    $nextline = sub{
        return <SOURCE>;
    };
    $headertest = sub{
        if($string eq "\n" and $marker > $lineallowance){
            $isheader = 1;
        }
    };
}

if($outformat eq "DocBook"){
    $writestart = sub{
         print '<?xml version="1.0" encoding="UTF-8"?>';
         print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4
+.1.2//EN" "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">
+';
         print "\n\n";
    };
    $writetitle = sub{
        print "<article>\n <title>";
        print &$articlename;
        print "</title>";
    };
    $writeelement = sub {
        print "\n<para>\n$line\n</para>\n";
    };
    $writeheader = sub {
#         print "</chapter>";
#         print "\n<chapter id=\"$element\">\n";
        print "<title>$line</title>\n";
    };
    $writeend = sub{
        print "\n</article>";
    };
}
$lnapply = sub {
    if($isheader == 0){
    track("break isn't header");
        &$writeelement($line);
    }
    elsif($isheader == 1){
    track("break is header");
        &$writeheader($line);
     }
};

#---------------------
#sort out all function aliases before here
#--------------------
#and here we have the actual algorithm

sub liberate{
    
if (defined $_[0]){
    open SOURCE, $ARGV[0] or return("$!");
}
else{
    print "usage: semget [file] > [outfile]" and return;
}

$marker = 0;
$isheader = 0;
$paranumber = 0;

&$writestart;
&$writetitle;

while(defined($string = &$nextline)){
    &$headertest($string);
    if($string eq "\n") {
        track("found break");
        if($marker == $lineallowance){
            track("hit line allowance");
            $paranumber++;
            &$lnapply;
            $isheader = 0;
            $line = undef;
        }
        track("redundant break");
        $marker++;
    }
    else{
        chomp($string);
        #track("found text");
        if (defined $line){
            $line = "${line} $string";
        }
        else{
            $line = $string;
        }
        $marker = 0;
    }
#print "$string";
}

&$writeend;

}

liberate($ARGV[0]);

sub track{
    warn "\ntrack:$_[0] at $paranumber";
}