Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/local/bin/perl # # Uses docx2txt project of Sandeep Kumar: # http://docx2txt.sourceforge.net/ # Uses newbie Perl/Tk example code from: # http://www.geocities.com/binnyva/code/perl/perl_tk_tutorial/ # Uses CakeCMD unzipper because other commandline unzippers # not extract corrupt word/xml and word/_rels/document.xml.rels files # use Tk; # # Create the Main Window # my $mw = new MainWindow; # # Hides TK logo with my own logo # my $icon = $mw->Photo(-file => 'ddte.gif'); $mw->iconimage($icon); # # Declare that there is a menu, create text # editor and create a vertical scroll bar # my $mbar = $mw -> Menu(); $mw -> configure(-menu => $mbar); my $textarea = $mw -> Frame(); #Creating Another Frame my $txt = $textarea -> Text(-width=>80, -height=>22); my $srl_y = $textarea -> Scrollbar(-orient=>'v',-command=>[yview => $t +xt]); $txt -> configure(-yscrollcommand=>['set', $srl_y]); $txt -> grid(-row=>1,-column=>1); $srl_y -> grid(-row=>1,-column=>2,-sticky=>"ns"); $textarea -> grid(-row=>5,-column=>1,-columnspan=>2); # # Main Menu Choices Setup section # my $file = $mbar -> cascade(-label=>"File", -underline=>0, -tearoff => + 0); my $help = $mbar -> cascade(-label =>"Help", -underline=>0, -tearoff = +> 0); # # File Menu Choices section # $file -> checkbutton(-label =>"Open", -underline => 0, -command => [\&menuopenClicked, "Open"]); $file -> command(-label =>"Save", -underline => 0, -command => [\&menusavedClicked, "Save"]); $file -> separator(); $file -> command(-label =>"Exit", -underline => 1, -command => sub { exit } ); # # Help Menu Choices section # $help -> command(-label =>"About", -command => sub { $txt->delete('1.0','end'); $txt->insert('end', "About ---------- How to use this program: 1. Click on the File Menu and choose Opem. 2. Choose your docx Word 2007 file from which you wish to extract text. 3. You extracted text will be displayed. 4. Choose the Save menu choice on the File Menu. 5. Save the text file to the name and file location you wish. This program is made by Paul D Pruitt (socrtwo) and uses a modification of the docx2txt Perl script by Sandeep Kumar for it's main logic. See http://docx2txt.sourceforge.net/. It also uses Binny V A's Perl/Tk code for the GUI elements from http://www.geocities.com/binnyva/code. CakeCMD is by Leung Yat Chun Joseph. http://www.quickzip.org/softwares-cakecmd It requires Microsoft .NET Framework Version 2.0 http://tinyurl.com/ms2-0-netframework My software website is http://www.godskingsandheroes.info/software/. Also visit my data recovery software list http://www.s2services.com. My E-Mail : socrtwo\@s2services"); }); # # Open Dialog Box File Extension Declaration section # my $typesopen = [ ['Word 2007 files', '.docx'], ['All files', '*'],]; # # Main loop currently activated by selecting the file # MainLoop; sub menuopenClicked { my $mainfilepath = $mw->getOpenFile(-filetypes => $typesop +en, -defaultextension => '.docx'); return if undefined $mainfilepath; # # Delete old XML data found in the word folder section # use Win32::OLE; $dir = '<word>'; if(-e $dir){ $Win32::OLE::Warn = 3; # ------ SCRIPT CONFIGURATION ------ $strFolderPath = '<word>'; # e.g. "d:\temp" # ------ END CONFIGURATION --------- $objFSO = Win32::OLE->new('Scripting.FileSystemObject'); $objFSO->DeleteFolder($strFolderPath); } else { print "\n"; } # # Docx file rename to zip section necessary for CakeCMD to unzip. # my $zipwordfilepath = $mainfilepath . '.zip'; rename($mainfilepath,$zipwordfilepath); # # Unzip docx/zip file section # my $unzip = "cakecmd.exe"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/document.xml + \"\" " or die "Could not start $unzip: $!"; open my $wfh, "| $unzip extract \"$zipwordfilepath\" word/_rels/docume +nt.xml.rels \"\" " or die "Could not start $unzip: $!"; # # Script sleep section to allow unzipping action to catch up with scri +pt # $num = 1; while($num--){ sleep(1); } close $zipwordfilepath; # # Revert the target file to it's original extension # rename ($zipwordfilepath,$mainfilepath); # # Housekeeping section to make the program # run well in Windows? Check with docx2xml author. # my $nl = "\r\n"; # Alternative is "\n". my $lindent = " "; # Indent nested lists by "\t", " " etc. my $lwidth = 80; # Line width, used for short line justificatio +n. # # ToDo: Better list handling. Currently assumed 8 level nesting. # my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--'); # # Added routine for reading file into $_ variable. # This is necessary because the unzipper, cakecmd, does not write to S +TDIN or STDOUT # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/_rels/document.xml.rels" or die "Couldn't open file +: $!"; binmode FILE; $_= <FILE>; close FILE; } # # Gather information about header, footer, hyperlinks, images, footnot +es etc. # my %docurels; while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( + .*?)?\/>/g) { $docurels{"$2:$1"} = $3; } # # Subroutines for center and right justification of text in a line. # sub cjustify { my $len = length $_[0]; if ($len < ($lwidth - 1)) { my $lsp = ($lwidth - $len) / 2; return ' ' x $lsp . $_[0]; } else { return $_[0]; } } # sub rjustify { my $len = length $_[0]; if ($len < $lwidth) { return ' ' x ($lwidth - $len) . $_[0]; } else { return $_[0]; } } # # Subroutines for dealing with embedded links and images # sub hyperlink { return "{$_[1]}[HYPERLINK: $docurels{\"hyperlink:$_[0]\"}]"; } # # Routine for reading file into $content variable. # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open FILE, "word/document.xml" or die "Couldn't open file: $!"; binmode FILE; $content = <FILE>; close FILE; } # # Text extraction begins section # $content =~ s/<?xml .*?\?>(\r)?\n//; $content =~ s{<w:p [^/>]+?/>|</w:p>}|$nl|og; $content =~ s|<w:br/>|$nl|og; $content =~ s|<w:tab/>|\t|og; # my $hr = '-' x 78 . $nl; $content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og; $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . "$le +vchar[$1] "|oge; # # Uncomment either of below two lines and comment above line, if deali +ng # with more than 8 level nested lists. # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . '* + '|oge; # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' ' +|oge; # $content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge; $content =~ s{<w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>(.*?)</w: +t></w:r>}/cjustify($1)/oge; $content =~ s{<w:pPr><w:jc w:val="right"/></w:pPr><w:r><w:t>(.*?)</w:t +></w:r>}/rjustify($1)/oge; $content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperl +ink($1,$2)/oge; $content =~ s/<.*?>//g; # # Convert non-ASCII characters/character sequences to ASCII characters +. # $content =~ s/\xE2\x82\xAC/\xC8/og; # euro symbol as saved by MSOf +fice # $content =~ s/\xE2\x82\xAC/E/og; # euro symbol expressed as E $content =~ s/\xE2\x80\xA6/.../og; $content =~ s/\xE2\x80\xA2/::/og; # four dot diamond symbol $content =~ s/\xE2\x80\x9C/"/og; $content =~ s/\xE2\x80\x99/'/og; $content =~ s/\xE2\x80\x98/'/og; $content =~ s/\xE2\x80\x93/-/og; $content =~ s/\xC2\xA0//og; $content =~ s/&amp;/&/ogi; $content =~ s/&lt;/</ogi; $content =~ s/&gt;/>/ogi; # # Write the extracted and converted text contents to output. # #Text Area # $txt->delete('1.0','end'); $txt -> insert('end',$content); my $typessaved = [ ['Text files', '.txt'], ['All files', '*'],]; # # Subroutine actived by clicking on the save menu # sub menusavedClicked { my $saved = $mw->getSaveFile(-filetypes => $typessaved, -defaultextension => '.txt', -initialfile => "$mainfilepath" . '.txt' +); return if not defined $saved; # # Opens results text file for writing. # open($saved, "> $saved") || die "Can't create <$docx_name> for output! +\n"; print ($saved $content) if $saved; close $saved; }}

In reply to Corrupt MS Word 2007 Text Extractor by socrtwo

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others imbibing at the Monastery: (5)
As of 2024-04-19 06:50 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found