Category: | text processing |
Author/Contact Info | Chris Monahan aka Maze ForeverWatcher@googlemail.com |
Description: | this guesses the semantic structure from a text document, stripping the line endings and guessing where the paragraph breaks and headers should be. Good for processing Gutenburg 'plain vanilla ASCII' version 3 of txt2docbook, modularised ready for expansion |
#!/usr/bin/perl #-------------- #txtattack.pl # #this is a script which takes text input by default and outputs DocBoo +k XML by default having guessed at the semantic structure of the text +. #At the moment it's arranged in such a way that allows for expansion, +including the development of a module based on this as a template use strict; use warnings; use vars qw($articlename $headertest $nextline $lnapply $writestart $w +ritetitle $writeelement $writeheader $writeend $lineallowance $inform +at $outformat $val $marker $line $isheader $string $paranumber $artic +lename); $informat = "text"; $outformat = "DocBook"; $lineallowance = 0; #here should go the code for overriding the defaults # #but in the meantime i'll happily setup blind defaults and do the modu +larity bit later, as i've seperated it all cool like #good for testing ;-) if($informat eq "text"){ $articlename = sub{ my $val = <SOURCE>; chomp($val); return $val; }; $nextline = sub{ return <SOURCE>; }; $headertest = sub{ if($string eq "\n" and $marker > $lineallowance){ $isheader = 1; } }; } if($outformat eq "DocBook"){ $writestart = sub{ print '<?xml version="1.0" encoding="UTF-8"?>'; print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4 +.1.2//EN" "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd"> +'; print "\n\n"; }; $writetitle = sub{ print "<article>\n <title>"; print &$articlename; print "</title>"; }; $writeelement = sub { print "\n<para>\n$line\n</para>\n"; }; $writeheader = sub { # print "</chapter>"; # print "\n<chapter id=\"$element\">\n"; print "<title>$line</title>\n"; }; $writeend = sub{ print "\n</article>"; }; } $lnapply = sub { if($isheader == 0){ track("break isn't header"); &$writeelement($line); } elsif($isheader == 1){ track("break is header"); &$writeheader($line); } }; #--------------------- #sort out all function aliases before here #-------------------- #and here we have the actual algorithm sub liberate{ if (defined $_[0]){ open SOURCE, $ARGV[0] or return("$!"); } else{ print "usage: semget [file] > [outfile]" and return; } $marker = 0; $isheader = 0; $paranumber = 0; &$writestart; &$writetitle; while(defined($string = &$nextline)){ &$headertest($string); if($string eq "\n") { track("found break"); if($marker == $lineallowance){ track("hit line allowance"); $paranumber++; &$lnapply; $isheader = 0; $line = undef; } track("redundant break"); $marker++; } else{ chomp($string); #track("found text"); if (defined $line){ $line = "${line} $string"; } else{ $line = $string; } $marker = 0; } #print "$string"; } &$writeend; } liberate($ARGV[0]); sub track{ warn "\ntrack:$_[0] at $paranumber"; } |
Back to
Code Catacombs