Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine

txt2docbook 2

by Maze (Sexton)
on Apr 16, 2007 at 19:14 UTC ( #610410=sourcecode: print w/replies, xml ) Need Help??
Category: text processing
Author/Contact Info Chris Monahan aka Maze
Description: This guesses the semantic structure of a text document, stripping the line endings and guessing where the paragraph breaks and headers should be. Good for processing Gutenberg 'plain vanilla ASCII' files. Version 3 of txt2docbook, modularised ready for expansion.

#this is a script which takes text input by default and outputs DocBook XML by default, having guessed at the semantic structure of the text
#At the moment it's arranged in such a way that allows for expansion, including the development of a module based on this as a template
use strict;
use warnings;
use vars qw($articlename $headertest $nextline $lnapply $writestart $w
+ritetitle $writeelement $writeheader $writeend $lineallowance $inform
+at $outformat $val $marker $line $isheader $string $paranumber $artic
# Default configuration. $informat and $outformat select which sets of
# handler closures get installed below; $lineallowance is the threshold
# that $marker is compared against in the header test and the main loop.
$informat = "text";
$outformat = "DocBook";
$lineallowance = 0;

#here should go the code for overriding the defaults
#but in the meantime i'll happily setup blind defaults and do the modularity bit later, as i've separated it all cool like
#good for testing ;-)

# Input handlers installed when the input format is plain text. Each
# closure reads from the bareword SOURCE filehandle.
# NOTE(review): no closing braces for these subs or for this if-block
# appear in this listing -- they look to have been lost; confirm against
# the original script.
if($informat eq "text"){
    # Reads one line from SOURCE and returns it.
    $articlename = sub{
        my $val = <SOURCE>;
        return $val;
    # Returns the result of reading from SOURCE.
    $nextline = sub{
        return <SOURCE>;
    # Sets $isheader to 1 when $string is a bare newline and $marker is
    # greater than $lineallowance.
    $headertest = sub{
        if($string eq "\n" and $marker > $lineallowance){
            $isheader = 1;

# Output handlers installed when the output format is DocBook. Each
# closure prints a fragment of the XML document with print.
# NOTE(review): no closing braces for these subs or for this if-block
# appear in this listing -- they look to have been lost; confirm against
# the original script.
if($outformat eq "DocBook"){
    # Prints the XML declaration, the DOCTYPE line and two newlines.
    # NOTE(review): the DOCTYPE literal below is split across two lines by
    # a '+' wrap marker and its closing quote and semicolon are not visible.
    $writestart = sub{
         print '<?xml version="1.0" encoding="UTF-8"?>';
         print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4
+.1.2//EN" "">
         print "\n\n";
    # Opens the <article> element and prints the value returned by
    # $articlename inside <title> tags.
    $writetitle = sub{
        print "<article>\n <title>";
        print &$articlename;
        print "</title>";
    # Prints the current $line wrapped in <para> tags.
    $writeelement = sub {
        print "\n<para>\n$line\n</para>\n";
    # Prints the current $line wrapped in <title> tags; the <chapter>
    # wrapper lines are commented out.
    $writeheader = sub {
#         print "</chapter>";
#         print "\n<chapter id=\"$element\">\n";
        print "<title>$line</title>\n";
    # Prints the closing </article> tag.
    $writeend = sub{
        print "\n</article>";
# Branches on $isheader and reports through track() which case applied.
# NOTE(review): only the track() calls are visible in each branch, and the
# closing braces are not present in this listing; confirm against the
# original whether each branch also invoked a write handler.
$lnapply = sub {
    if($isheader == 0){
    track("break isn't header");
    elsif($isheader == 1){
    track("break is header");

#sort out all function aliases before here
#and here we have the actual algorithm

# Opens the input file on the bareword SOURCE filehandle.
# On open failure, returns the stringified $! to the caller.
# NOTE(review): the guard tests $_[0] but the open uses $ARGV[0]; confirm
# which one is intended.
# NOTE(review): this is a two-argument open on a bareword handle, so the
# file name is also parsed for a mode prefix.
# NOTE(review): the usage message presumably belongs to an else branch;
# that branch and the end of this sub are not visible in this listing.
sub liberate{
if (defined $_[0]){
    open SOURCE, $ARGV[0] or return("$!");
    print "usage: semget [file] > [outfile]" and return;

# Loop state. In the lines shown here $marker is only ever reset to 0 and
# $paranumber is not changed after this initialisation.
$marker = 0;
$isheader = 0;
$paranumber = 0;


# Read input through $nextline until it returns undef. A string equal to
# "\n" is treated as a break; other strings are accumulated into $line.
# NOTE(review): the else branches and closing braces are not present in
# this listing. As shown, "redundant break" follows the line-allowance
# branch directly and the two assignments to $line run back to back, which
# suggests each pair was originally split by an else; confirm against the
# original script.
while(defined($string = &$nextline)){
    if($string eq "\n") {
        track("found break");
        # A break seen while $marker equals $lineallowance clears the
        # header flag and discards the accumulated $line.
        if($marker == $lineallowance){
            track("hit line allowance");
            $isheader = 0;
            $line = undef;
        track("redundant break");
        #track("found text");
        # Append $string to an existing $line, separated by a space.
        if (defined $line){
            $line = "${line} $string";
            $line = $string;
        $marker = 0;
#print "$string";




# Emit a debug trace through warn.
#   $message - text describing the event being traced
# The trace is tagged with the current value of the global $paranumber.
# The string does not end in a newline, so warn appends the script name
# and line number to it.
sub track{
    my ($message) = @_;
    warn "\ntrack:$message at $paranumber";
}

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://610410]
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others rifling through the Monastery: (5)
As of 2020-10-27 12:51 GMT
Find Nodes?
    Voting Booth?
    My favourite web site is:

    Results (256 votes). Check out past polls.