http://qs321.pair.com?node_id=14932
Category: HTML Utilities
Author/Contact Info Big Joe Big_Joe1008@linuxstart.com
Description: Run this script on an HTML document to remove all of the embedded (nested) tables in it, assuming the table tags in the document are properly nested. By default it removes every embedded table and leaves only the main table, but you can choose how many levels of table nesting are kept by changing the numofTables variable.
#!/usr/bin/perl
# Strip nested tables out of an HTML page.
#
# Reads $inputfile, breaks it into whitespace-separated tokens, and copies
# the tokens to $outputfile while tracking how deeply the current token sits
# inside <TABLE>/<SCRIPT> elements.  Tokens nested deeper than $numofTables
# are dropped.  All runs of whitespace (including newlines) come out as a
# single space.
#
# Assumes every table/script tag is well formed and separated from the next
# such tag by whitespace: at most one opening and one closing tag are
# counted per token.
use strict;
use warnings;

# Page to read and file to write (the output file is overwritten).
my $inputfile  = "test.htm";
my $outputfile = "outfile2.html";

# Deepest nesting level that is kept: 1 keeps only outermost tables,
# 2 also keeps tables nested directly inside those, and so on.
my $numofTables = 1;

# Slurp the whole page; the filter works on tokens, not on lines.
open(my $in, '<', $inputfile) or die("no file $inputfile: $!");
my $thispage = do { local $/; <$in> };
close($in);
$thispage = '' unless defined $thispage;    # empty input file

# Turn line-break tags into a space so the words around them stay apart.
$thispage =~ s/<BR>/ /gi;
$thispage =~ s/<\/BR>/ /gi;

# split ' ' splits on runs of whitespace and skips leading whitespace.
# (The double-quoted string "\s" is just the letter "s", not whitespace.)
my @tokens = split ' ', $thispage;

open(my $out, '>', $outputfile) or die("cannot write $outputfile: $!");

# Current nesting level; a <SCRIPT> block counts as a level just like a table.
my $depth = 0;
foreach my $token (@tokens) {
    # Count the opening tag first so that the tag itself is subject to the
    # depth limit it introduces.
    if ($token =~ m/<TABLE/i || $token =~ m/<SCRIPT/i) {
        $depth++;
    }

    if ($depth <= $numofTables) {
        print {$out} "$token ";
    }

    # Leave the level only after the closing tag has been handled above.
    if ($token =~ m/<\/TABLE>/i) {
        $depth--;
        # Emitted after every closed table, kept or dropped.
        # NOTE(review): presumably starts a fresh row/cell in the enclosing
        # table for the content that follows -- confirm against real input.
        print {$out} "</TR><TR>\n<TD>";
    }
    elsif ($token =~ m/<\/SCRIPT>/i) {
        $depth--;
    }
}

# Buffered write errors only surface at close, so check it.
close($out) or die("error writing $outputfile: $!");