#!/usr/bin/perl

=pod

=head1 NAME

BulkImport.pl - Import plain HTML files into Joomla database

=head1 SYNOPSIS

This script imports HTML files directly into Joomla's database.
It works with both Joomla 1.0.12 and Joomla 1.5rc1

=head1 DESCRIPTION

This script reads a directory to find all the HTML files and imports them
into a Joomla site. It reads the content of the HTML page and looks at the
C<< <title> >> tag to determine the title of the page. It looks at the
C<< <date> >> tag and, when the page has none, at the file's modification
date to determine the "publication date" for Joomla, and then it makes a
MySQL database connection and executes the query.

I hacked the original script together in about an hour. It probably works,
but it is not for the faint of heart. If you barely understand Joomla, and
you've never looked at perl before, and you're working from a Windows PC,
this isn't ready for you yet. If you're using a Mac or Linux, or you're
comfortable running Perl on Windows, this is pretty straightforward.

=head1 OPTIONS

This program takes the following options. Option letters are case
sensitive.

=over 6

=item B<-n>

Dry run. Don't do anything. Just show what would be done. It won't connect
to the database, it won't insert any files. It will just show you the names
and dates of all the files that it would try to insert.

=item B<-D>

Old Database. Use the 1.0.12 database schema. Otherwise, the 1.5 schema is
assumed. Use this option if you use Joomla 1.0.12.

=item B<-r>

Recurse. Descend subdirectories and find all their files and import them,
too. Otherwise, subdirectories are skipped and only the files in directory
are inserted into the site.

=item B<-f>

Use the file name as the title, not the C<< <title> >> from inside the
file. The current regular expression will strip off
'.(asp|aspx|htm|html|shtm|shtml)' and use the remaining file name.
If both B<-f> and B<-F> are given, B<-f> wins.

=item B<-F>

B<Append> the filename to the title of the page. For example, if the page
contains C<< <title>New Page 1</title> >> and it is named I<menu.html>,
then this will set the title (in Joomla) to be "New Page 1 menu".
Again, as in B<-f>, common file extensions are stripped off.

=back

=head1 OUTPUT

You'll see a line or two for each file processed.

=head1 ERRORS

The script blows up on errors. More details will follow.
A file or directory that cannot be read is reported with a warning and
skipped; failing to connect to the database is fatal.

=head1 DIAGNOSTICS

HTML files that have no C<< <title> >> tags will be named "Article" unless
B<-f> or B<-F> are used.

If you use the wrong database type (e.g., you forget to specify B<-D> when
you should), it will blow up at the SQL level, but keep on truckin. In other
words, you'll get lots and lots of errors.

=head1 EXAMPLES

    perl BulkImport.pl -r html

=over 2

This will go into the B<html> directory and find pages and upload them.
Because of B<-r>, it will descend into any subdirectories that are found
and process HTML files there, too.

=back

=head1 AUTHOR

Paco Hope <paco@paco.to>

=head1 COPYRIGHT

Copyright (C) 2007 Paco Hope <paco@paco.to>

Distributed under the BSD License. (See the bottom of this file)

Original from http://paco.to/?p=191

Now at http://joomlacode.org/gf/project/bulkimport/

=head1 SEE ALSO

DBI(3pm), DBD::MySQL(3pm), HTML::Parser

=head1 ACKNOWLEDGEMENTS

Thanks to David Glah from Cable & Wireless for the SQL update and the
impetus to do recursion.

=head1 INTERNAL DOCUMENTATION

The remainder of this documentation is per-function, internal
documentation. It is only intended for the developers and maintainers of
this code.

=cut

use strict;
use warnings;

use HTML::Parser;
use POSIX qw(strftime);
use DBI;
use DBD::mysql;
use File::Basename qw(basename);

# Option letters must be case sensitive: with Getopt::Long's default
# ignore_case setting, -f and -F (and -d / -D) name the same option.
use Getopt::Long qw(:config no_ignore_case);

# Here's the MySQL database stuff you need to configure
# NOTE(review): credentials are hard-coded in the script; anyone who can
# read this file can read the database password.
$db::user      = "root";
$db::passwd    = "zeroc00l";
$db::database  = "hsrgiglio";
$db::hostname  = "localhost";
$db::port      = "3306";
$db::tablename = "jos_content";
$db::ver       = "1.5";           # default

# state for all articles (1=published)
$j::state = 1;

# numeric Joomla section and category where you want the articles inserted
$j::section  = 1;
$j::category = 1;

# numeric creator ID (62 = admin) for all articles
$j::creator = 62;

# By default, do not recurse. Use -r to enable recursion.
$j::recurse = '';

# Default title for our articles, if one isn't defined in the HTML
$j::defaultTitle = "Article";

###########
### No need to change anything below here
###########

=pod

=head2 sub extractTitleAndDate

Given a file name, parse the file with HTML::Parser and return a two
element list: the text of the first complete C<< <title> >> element and
the text of the first complete C<< <date> >> element. An element that is
missing, or that is never closed, is returned as undef. Leading and
trailing whitespace is removed from both values.

Returns an empty list (after a warning) if the file cannot be opened.

=cut

sub extractTitleAndDate {
    my $file = shift;

    my %text;        # tag name => text collected for that element
    my %finished;    # tag name => true once its end tag has been seen
    my $current;     # tag currently being collected, undef when none

    # HTML::Parser keeps exactly ONE handler per event, so a single start
    # handler has to recognise both tags. Registering two start handlers
    # in a row silently discards the first one.
    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [
            sub {
                my $tag = shift;
                return if defined $current;
                return unless $tag eq 'title' || $tag eq 'date';
                return if $finished{$tag};    # first occurrence wins
                $current = $tag;
                $text{$tag} = '';
            },
            "tagname"
        ],
        text_h => [
            # Text can arrive in several chunks, so append, don't assign.
            sub { $text{$current} .= shift if defined $current },
            "dtext"
        ],
        end_h => [
            sub {
                my ( $tag, $self ) = @_;
                return unless defined $current && $tag eq $current;
                $finished{$tag} = 1;
                undef $current;

                # Nothing else in the document interests us.
                $self->eof if $finished{title} && $finished{date};
            },
            "tagname,self"
        ],
    );

    if ( !defined $parser->parse_file($file) ) {
        warn "can't parse $file: $! (skipping)\n";
        return;
    }

    my @found;
    for my $tag (qw(title date)) {
        my $value = $finished{$tag} ? $text{$tag} : undef;
        if ( defined $value ) {
            $value =~ s/\A\s+//;
            $value =~ s/\s+\z//;
        }
        push @found, $value;
    }
    return @found;
}

=pod

=head2 sub articleTitle

Given the title found in the HTML (possibly undef) and the file name,
return the title to store in Joomla, honouring B<-f> and B<-F>. The file
name used is the base name with a trailing
'.(asp|aspx|htm|html|shtm|shtml)' removed (matched case-insensitively).

=cut

sub articleTitle {
    my ( $htmlTitle, $file ) = @_;

    my $name = basename($file);
    $name =~ s/\.(?:asp|aspx|htm|html|shtm|shtml)\z//i;

    # -f: the file name replaces whatever the page says.
    return $name if $j::useFileName;

    my $title = $j::defaultTitle;
    if ( defined $htmlTitle && $htmlTitle =~ /\S/ ) {
        $title = $htmlTitle;
    }

    # -F: the file name is appended to the page's own title.
    $title .= " $name" if $j::appendFileName;

    return $title;
}

=pod

=head2 sub insertFile

Given a file name: parse it for C<< <title> >> and C<< <date> >>, get its
modification time from the filesystem as a fallback date, and insert its
entire content into the Joomla database through the prepared statement in
C<$db::sth>. With B<-n> it only prints the title and date it would use.

A file that cannot be examined, parsed or read is skipped with a warning.

=cut

sub insertFile {
    my $file = shift;

    # Element 9 of stat() is the modification time (perldoc -f stat).
    my @st = stat($file);
    if ( !@st ) {
        warn "can't stat $file: $! (skipping)\n";
        return;
    }

    # Make a MySQL compatible date. Spelled out rather than "%F %T"
    # because not every platform's strftime knows those two shortcuts.
    my $mysqlDate = strftime( "%Y-%m-%d %H:%M:%S", localtime( $st[9] ) );

    my @parsed = extractTitleAndDate($file);
    return if !@parsed;
    my ( $htmlTitle, $htmlDate ) = @parsed;

    # Title and date are computed afresh for every file, so nothing leaks
    # over from the previously imported file.
    my $title = articleTitle( $htmlTitle, $file );

    # NOTE(review): the content of <date> is passed to MySQL as is; it is
    # assumed to already look like "2007-07-04 21:07:51" - confirm.
    my $date = $mysqlDate;
    if ( defined $htmlDate && length $htmlDate ) {
        $date = $htmlDate;
    }

    # Open the file and stick its entire contents into $htmlBody
    my $fh;
    if ( !open( $fh, '<', $file ) ) {
        warn "can't open $file: $! (skipping)\n";
        return;
    }
    my $htmlBody = do { local $/; <$fh> };
    close($fh);
    $htmlBody = '' if !defined $htmlBody;

    if ($j::dryrun) {
        print " Titolo: \"$title\"\n";
        print " Data: \"$date\"\n";
    }
    else {
        # Placeholders, in order: title, title alias, body, state,
        # section, category, created, creator, publish_up.
        $db::sth->execute(
            $title,       $title,       $htmlBody,
            $j::state,    $j::section,  $j::category,
            $date,        $j::creator,  $date
        );
    }
    return;
}

=pod

=head2 sub processDir

Given a directory, process all the entries in the directory. If we have -r
on the command line, then we will recurse into directories that we find.
Otherwise, we skip them.

=cut

sub processDir {
    my $dir = shift;

    my $dirhandle;
    if ( !opendir( $dirhandle, $dir ) ) {
        warn "can't opendir $dir: $! (continuing)";
        return;
    }

    # Read the whole listing and close the handle right away, so that
    # recursion never keeps a stack of directory handles open. Sorting
    # makes the import order reproducible. '.' and '..' are ignored.
    my @entries = sort grep { $_ ne '.' && $_ ne '..' } readdir($dirhandle);
    closedir($dirhandle);

    for my $entry (@entries) {
        my $path = "$dir/$entry";

        if ( -d $path ) {

            # -d follows symbolic links; never descend through one, or a
            # link pointing back up the tree would recurse forever.
            if ( $j::recurse && !-l $path ) {
                print "Processing directory $path\n";
                processDir($path);
            }
            else {
                print "Skipping directory $path\n";
            }
            next;
        }

        # Only regular files are imported. -f follows symbolic links, so
        # a link to a regular file is imported too; dangling links,
        # sockets, devices and the like are skipped.
        next unless -f $path;

        print " + $entry\n";
        insertFile($path);
    }
    return;
}

=pod

=head2 sub buildInsertQuery

Given a schema version ("1.0.12" or "1.5"), return the INSERT statement
for C<$db::tablename> with nine placeholders. The first column is the
autoincrement field, which is left to MySQL. Dies on any other version.

=cut

sub buildInsertQuery {
    my $ver = shift;

    # The two schemas differ only at the start and the end of the row.
    my %head = (
        "1.0.12" =>
          "(null, ?, ?, ?, '', ?, ?, 0, ?, ?, ?, '', '0000-00-00 00:00:00', ",
        "1.5" =>
"(null, ?, '', ?, ?, '', ?, ?, 0, ?, ?, ?, '', '0000-00-00 00:00:00', ",
    );
    my %tail = (
        "1.0.12" => "1, 0, 1, '', '', 0, 0)",
        "1.5"    => "1, 0, 1, '', '', 0, 0,'')",
    );

    if ( !exists $head{$ver} ) {
        die "unknown Joomla database schema version '$ver'";
    }

    my $q = "INSERT INTO `$db::tablename` VALUES ";
    $q .= $head{$ver};
    $q .= "0, 0, '0000-00-00 00:00:00', ?, '0000-00-00 00:00:00', '', '', ";
    $q .= "'pageclass_sfx=\\nback_button=\\nitem_title=1\\nlink_titles=\\nintrotext=1\\n";
    $q .= "section=0\\nsection_link=0\\ncategory=0\\ncategory_link=0\\nrating=\\nauthor=\\n";
    $q .= "createdate=\\nmodifydate=\\npdf=\\nprint=\\nemail=\\nkeyref=\\ndocbook_type=', ";
    $q .= $tail{$ver};

    return $q;
}

###
### Begin Main
###

# Process command line arguments
GetOptions(
    'r' => \$j::recurse,
    'D' => \$j::dbver,
    'f' => \$j::useFileName,
    'F' => \$j::appendFileName,
    'n' => \$j::dryrun
) or die "usage: $0 [-n] [-D] [-r] [-f] [-F] directory\n";

if ($j::dbver) {

    # -D means use old database (1.0.12)
    $db::ver = "1.0.12";
}

if ( @ARGV != 1 ) {
    die "need a directory name ($#ARGV)";
}
$j::dir = $ARGV[0];
if ( !-r $j::dir ) {
    die "can't open $j::dir";
}
if ( !-d $j::dir ) {
    die "$j::dir is not a directory";
}

# The configured port is part of the DSN; it used to be silently ignored.
$db::dsn =
  "DBI:mysql:database=$db::database;host=$db::hostname;port=$db::port";

if ($j::dryrun) {
    print "Would connect to $db::dsn with $db::user and pass xxxx\n";
}
else {
    $db::dbh = DBI->connect( $db::dsn, $db::user, $db::passwd )
      or die "can't connect to $db::dsn: $DBI::errstr";
}

# Now build up the query
my $q = buildInsertQuery($db::ver);

if ($j::dryrun) {
    print "Using Joomla database schema for version $db::ver\n";
}
else {

    # Prepare the query once. We'll execute it many times.
    $db::sth = $db::dbh->prepare($q)
      or die "can't prepare insert statement: " . $db::dbh->errstr;
}

print "processing '$j::dir'\n";
processDir($j::dir);

if ( !$j::dryrun ) {
    $db::dbh->disconnect;
}

=pod

=head1 LICENSE

License Terms for this file. This is the BSD License.
(http://opensource.org/licenses/bsd-license.php)

Copyright (c) 2007, Paco Hope
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

=over 2

=item -

Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

=item -

Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

=item -

Neither the name of Paco Hope nor the names of its contributors may be used
to endorse or promote products derived from this software without specific
prior written permission.

=back

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

=cut