#!/usr/bin/perl
=pod
=head1 BulkImport.pl - Import plain HTML files into Joomla database
=head1 SYNOPSIS
This script imports HTML files directly into Joomla's database.
It works with both Joomla 1.0.12 and Joomla 1.5rc1
=head1 DESCRIPTION
This script reads a directory to find all the HTML files and imports them into a Joomla site. It reads the content of the HTML page and looks at the
C<< <title> >> tag to determine the title of the page. It looks at the file’s modification date to determine the “publication date” for Joomla, and then it makes a MySQL database connection and executes the query.
I hacked the original script together in about an hour. It probably works, but it is not for the faint of heart. If you barely understand Joomla, and you’ve never looked at perl before, and you’re working from a Windows PC, this isn't ready for you yet. If you’re using a Mac or Linux, or you’re comfortable running Perl on Windows, this is pretty straightforward.
=head1 OPTIONS
This program takes the following options.
=over 6
=item B<-n>
Dry run. Don't do anything. Just show what would be done. It won't connect to
the database, it won't insert any files. It will just show you the names and
dates of all the files that it would try to insert.
=item B<-D>
Old Database. Use the 1.0.12 database schema. Otherwise, the 1.5 schema is
assumed. Use this option if you use Joomla 1.0.12.
=item B<-r>
Recurse. Descend subdirectories and find all their files and import them, too.
Otherwise, subdirectories are skipped and only the files in directory are
inserted into the site.
=item B<-f>
Use the file name as the title, not the C<< <title> >> from inside the file. The
current regular expression will strip off '.(asp|aspx|htm|html|shtm|shtml)'
and use the remaining file name.
=item B<-F>
B<Append> the filename to the title of the page. For example, if the page contains C<< <title>New Page 1</title> >> and it is named F<menu.html>, then this will set the title (in Joomla) to be "New Page 1 menu". Again, as in B<-f>, common file extensions are stripped off.
=back
=head1 OUTPUT
You'll see a line or two for each file processed.
=head1 ERRORS
The script blows up on errors. More details will follow.
=head1 DIAGNOSTICS
HTML files that have no C<< <title> >> tags will be named "Article" unless B<-f> or
B<-F> are used.
If you use the wrong database type (e.g., you forget to specify B<-D> when you
should), it will blow up at the SQL level, but keep on truckin. In other
words, you'll get lots and lots of errors.
=head1 EXAMPLES
perl BulkImport.pl -r html
=over 2
This will go into the B<html> directory and find pages and upload them.
Because of B<-r>, it will descend into any subdirectories that are found and
process HTML files there, too.
=back
=head1 AUTHOR
Paco Hope
=head1 COPYRIGHT
Copyright (C) 2007 Paco Hope Distributed under the BSD License.
(See the bottom of this file)
Original from http://paco.to/?p=191
Now at http://joomlacode.org/gf/project/bulkimport/
=head1 SEE ALSO
DBI(3pm), DBD::MySQL(3pm), HTML::Parser
=head1 ACKNOWLEDGEMENTS
Thanks to David Glah from Cable & Wireless for the SQL update
and the impetus to do recursion.
=head1 INTERNAL DOCUMENTATION
The remainder of this documentation is per-function, internal documentation.
It is only intended for the developers and maintainers of this code.
=cut
use strict;
use HTML::Parser;
use POSIX qw(strftime);
use DBI;
use DBD::mysql;
use Getopt::Long;
# --- MySQL connection settings: adjust these for your site ---
( $db::user,     $db::passwd ) = ( 'root',      'zeroc00l' );
( $db::hostname, $db::port )   = ( 'localhost', '3306' );
$db::database  = 'hsrgiglio';
$db::tablename = 'jos_content';
$db::ver       = '1.5';    # schema version assumed unless -D is given

# --- Values applied to every imported article ---
$j::state = 1;                               # 1 = published
( $j::section, $j::category ) = ( 1, 1 );    # numeric Joomla section / category
$j::creator = 62;                            # numeric creator ID (62 = admin)

# Subdirectories are skipped unless -r switches recursion on.
$j::recurse = '';
###########
### No need to change anything below here
###########
# this first bit is right out of the HTML::Parser perldoc
# Start-tag callback for HTML::Parser, registered with argspec "tagname,self".
#
# When the tag is <title>, installs two further handlers on the parser:
#   * a text handler that stores the decoded text in $j::title, and
#   * an end handler that stops parsing (eof) once </title> is seen.
# Any other tag is ignored.
#
# Arguments: the tag name, then the parser object.
sub title_handler {
    my ( $tagname, $self ) = @_;
    return if $tagname ne "title";

    # HTML::Parser may deliver one run of text as several text events
    # (unless unbroken_text is enabled), so the first chunk replaces the
    # previous value of $j::title and every later chunk is appended.
    # An empty <title></title> produces no text event and therefore leaves
    # $j::title untouched.
    my $chunks_seen = 0;
    $self->handler(
        text => sub {
            my $chunk = shift;
            if ( $chunks_seen++ ) {
                $j::title .= $chunk;
            }
            else {
                $j::title = $chunk;
            }
        },
        "dtext"
    );
    $self->handler(
        end => sub {
            my ( $endtag, $parser ) = @_;
            $parser->eof if $endtag eq "title";
        },
        "tagname,self"
    );
}
# Start-tag callback for HTML::Parser, registered with argspec "tagname,self".
#
# When the tag is <date>, installs two further handlers on the parser:
#   * a text handler that stores the decoded text in $j::date, and
#   * an end handler that stops parsing (eof) once </date> is seen.
# Any other tag is ignored.
#
# Arguments: the tag name, then the parser object.
sub date_handler {
    my ( $tagname, $self ) = @_;
    return if $tagname ne "date";

    # HTML::Parser may deliver one run of text as several text events
    # (unless unbroken_text is enabled), so the first chunk replaces the
    # previous value of $j::date and every later chunk is appended.
    my $chunks_seen = 0;
    $self->handler(
        text => sub {
            my $chunk = shift;
            if ( $chunks_seen++ ) {
                $j::date .= $chunk;
            }
            else {
                $j::date = $chunk;
            }
        },
        "dtext"
    );
    $self->handler(
        end => sub {
            my ( $endtag, $parser ) = @_;
            $parser->eof if $endtag eq "date";
        },
        "tagname,self"
    );
}
#sub date_handler {
# my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
# if ($tagname eq 'date') {
# end => sub { shift->eof if shift eq "date"; },
# "tagname,self"
# }
#}
# Given a file name:
#   1. Parse it for its <title> and <date> tags
#   2. Get its modification date from the filesystem (used as the article
#      date when the page has no <date> tag)
#   3. Read its entire contents
#   4. Insert it into the Joomla database, or, in dry-run mode (-n), just
#      print the title and date that would be used
#
# Reads:  $j::dryrun, $j::useFileName, $j::appendFileName, $j::state,
#         $j::section, $j::category, $j::creator, $db::sth
# Writes: $j::title and $j::date (the values used for this file)
#
# A file that cannot be stat'ed or opened is reported with a warning and
# skipped; processing of the remaining files continues.
sub insertFile {
    my $file = shift;

    # Remember the default title configured before the first file, so that a
    # page without a <title> does not inherit the previous page's title.
    if ( !defined $j::defaultTitle ) {
        $j::defaultTitle = defined $j::title ? $j::title : "Article";
    }

    # Modification time of the file, formatted as a MySQL datetime.
    my $mtime = ( stat($file) )[9];
    if ( !defined $mtime ) {
        warn "can't stat $file: $! (skipping)";
        return;
    }
    my $mysqlDate = strftime( "%Y-%m-%d %H:%M:%S", localtime($mtime) );

    # Slurp the whole file into $htmlBody.
    my $htmlBody;
    {
        my $fh;
        if ( !open( $fh, '<', $file ) ) {
            warn "can't open $file: $! (skipping)";
            return;
        }
        local $/;    # slurp mode, restored when this block ends
        $htmlBody = <$fh>;
        close $fh;
    }
    $htmlBody = '' unless defined $htmlBody;

    my ( $parsedTitle, $parsedDate ) = _parseTitleAndDate($file);

    # Title: the page's <title> if it has a non-blank one, else the default.
    my $title = $j::defaultTitle;
    if ( defined $parsedTitle && $parsedTitle =~ /\S/ ) {
        $title = $parsedTitle;
    }

    # -f replaces the title with the file name, -F appends the file name to
    # it; in both cases common web file extensions are stripped first.
    if ( $j::useFileName || $j::appendFileName ) {
        my $stem = _fileStem($file);
        $title = $j::useFileName ? $stem : "$title $stem";
    }

    # Date: the page's <date> text if present (presumably already in MySQL
    # datetime form -- TODO confirm), else the file's modification time.
    my $date = $mysqlDate;
    if ( defined $parsedDate && $parsedDate =~ /\S/ ) {
        $date = $parsedDate;
    }

    $j::title = $title;
    $j::date  = $date;

    if ($j::dryrun) {
        print " Titolo: \"$j::title\"\n";
        print " Data: \"$j::date\"\n";
    }
    else {
        # Nine bind values, matching the nine placeholders of the prepared
        # INSERT statement.
        $db::sth->execute(
            $j::title, $j::title,    $htmlBody,
            $j::state, $j::section,  $j::category,
            $j::date,  $j::creator,  $j::date
        );
    }
}

# Parse $file with HTML::Parser and return the text of the first <title> and
# the first <date> element as a two-element list; an element that is absent
# yields undef. Parsing stops as soon as both have been closed.
sub _parseTitleAndDate {
    my $file = shift;

    my %wanted = map { $_ => 1 } qw(title date);
    my %text;       # tag name => text collected so far
    my $current;    # wanted tag we are currently inside, if any

    # A parser has only one handler per event, so a single start/text/end
    # trio has to serve both tags.
    my $p = HTML::Parser->new( api_version => 3 );
    $p->handler(
        start => sub {
            my ($tag) = @_;
            return unless $wanted{$tag};
            return if exists $text{$tag};    # first occurrence wins
            $current = $tag;
            $text{$tag} = '';
        },
        "tagname"
    );
    $p->handler(
        # Text can arrive in several chunks, hence the append.
        text => sub { $text{$current} .= shift if defined $current },
        "dtext"
    );
    $p->handler(
        end => sub {
            my ( $tag, $self ) = @_;
            return unless defined $current && $tag eq $current;
            undef $current;
            $self->eof if keys(%text) == keys(%wanted);
        },
        "tagname,self"
    );

    if ( !defined $p->parse_file($file) ) {
        warn "can't parse $file: $!";
    }
    return @text{qw(title date)};
}

# Return the file name of $path without its directory part and without a
# trailing .asp/.aspx/.htm/.html/.shtm/.shtml extension.
sub _fileStem {
    my $name = shift;
    $name =~ s{.*/}{}s;
    $name =~ s/\.(?:asp|aspx|htm|html|shtm|shtml)\z//i;
    return $name;
}
=pod
=head2 sub processDir
Given a directory, process all the entries in the directory. If we have -r on
the command line, then we will recurse into directories that we find.
Otherwise, we skip them.
=cut
# Process every entry of directory $dir.
#
# Regular files are announced on STDOUT and handed to insertFile().
# Subdirectories are announced and, when $j::recurse is true (-r), processed
# recursively; otherwise they are skipped. A directory that cannot be opened
# produces a warning and is skipped.
sub processDir {
    my $dir = shift;

    my $dirhandle;
    if ( !opendir( $dirhandle, $dir ) ) {
        warn "can't opendir $dir: $! (continuing)";
        return;
    }

    # Go through all the dir entries, but ignore '.' and '..'.
    # The explicit defined() keeps an entry named "0" from ending the loop
    # on perls that do not add that test implicitly for readdir.
    while ( defined( my $entry = readdir($dirhandle) ) ) {
        next if $entry eq "." || $entry eq "..";

        my $path = "$dir/$entry";
        if ( -d $path ) {
            print "Processing directory $path\n";

            # Descend only when recursion was requested.
            processDir($path) if $j::recurse;
            next;
        }

        # Skip anything that is not a plain file. Note that -d and -f follow
        # symbolic links, so a link is treated like whatever it points to.
        next unless -f $path;

        print " + $entry\n";
        insertFile($path);
    }

    # Close the handle that was actually opened above.
    closedir($dirhandle);
}
###
### Begin Main
###
# Default title for our articles, if one isn't defined in the HTML
$j::title = "Article";

# Process command line arguments. An unknown option is fatal rather than
# silently ignored, because a non-dry run writes to the database.
GetOptions(
    'r' => \$j::recurse,
    'D' => \$j::dbver,
    'f' => \$j::useFileName,
    'F' => \$j::appendFileName,
    'n' => \$j::dryrun
) or die "usage: $0 [-n] [-D] [-r] [-f] [-F] directory\n";

if ($j::dbver) {

    # -D means use old database (1.0.12)
    $db::ver = "1.0.12";
}

# Exactly one argument must remain: the directory to import.
if ( $#ARGV != 0 ) {
    die "need a directory name ($#ARGV)";
}
$j::dir = $ARGV[0];
if ( !-r $j::dir ) {
    die "can't open $j::dir";
}
if ( !-d $j::dir ) {
    die "$j::dir is not a directory";
}

# The configured port is part of the DSN so that a non-default port works.
$db::dsn =
  "DBI:mysql:database=$db::database;host=$db::hostname;port=$db::port";
if ($j::dryrun) {
    print "Would connect to $db::dsn with $db::user and pass xxxx\n";
}
else {
    # Stop here when the connection fails; otherwise the failure would only
    # surface later as a method call on an undefined handle.
    $db::dbh = DBI->connect( $db::dsn, $db::user, $db::passwd )
      or die "can't connect to " . $db::dsn . ": " . $DBI::errstr . "\n";
}

# Now build up the query.
# The first value is the autoincrement field; we assume MySQL sets it.
# Dates look like: 2007-07-04 21:07:51
# The two schemas differ only in the leading columns and in the final
# column list; the middle part is shared.
my %valuesHead = (
    "1.0.12" =>
      "(null, ?, ?, ?, '', ?, ?, 0, ?, ?, ?, '', '0000-00-00 00:00:00', ",
    "1.5" =>
      "(null, ?, '', ?, ?, '', ?, ?, 0, ?, ?, ?, '', '0000-00-00 00:00:00', ",
);
my %valuesTail = (
    "1.0.12" => "1, 0, 1, '', '', 0, 0)",
    "1.5"    => "1, 0, 1, '', '', 0, 0,'')",
);
if ( !exists $valuesHead{$db::ver} ) {
    die "unknown Joomla database schema version '$db::ver'\n";
}

my $q = "INSERT INTO `$db::tablename` VALUES ";
$q .= $valuesHead{$db::ver};
$q .= "0, 0, '0000-00-00 00:00:00', ?, '0000-00-00 00:00:00', '', '', ";
$q .= "'pageclass_sfx=\\nback_button=\\nitem_title=1\\nlink_titles=\\nintrotext=1\\n";
$q .= "section=0\\nsection_link=0\\ncategory=0\\ncategory_link=0\\nrating=\\nauthor=\\n";
$q .= "createdate=\\nmodifydate=\\npdf=\\nprint=\\nemail=\\nkeyref=\\ndocbook_type=', ";
$q .= $valuesTail{$db::ver};

if ($j::dryrun) {
    print "Using Joomla database schema for version $db::ver\n";
}
else {
    # Prepare the query once. We'll execute it many times.
    $db::sth = $db::dbh->prepare($q)
      or die "can't prepare insert statement: " . $db::dbh->errstr . "\n";
}

print "processing '$j::dir'\n";
processDir($j::dir);

if ( !$j::dryrun ) {
    $db::dbh->disconnect;
}
=pod
=head1 LICENSE
License Terms for this file. This is the BSD License.
(http://opensource.org/licenses/bsd-license.php)
Copyright (c) 2007, Paco Hope
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
=over 2
=item -
Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
=item -
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
=item -
Neither the name of Paco Hope nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
=back
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
=cut