Description: |
We have several internal twikis, and one of the problems we are now dealing with is that users have been uploading word documents and other attachments without describing them on a twiki page, or adding suitable key-words.
The consequence is that we often can't find useful docs since twiki doesn't search attachments... so I wrote this program to allow users to search inside attachments (or just their filenames) for useful content.
A few parameters need setting in the code, but hopefully they are clear enough as to what values they need (basically, $searchpath, $doc_url, $twiki_url, $return_url and @twikis). One final note - if you want the option to search all Twikis, make 'All' the first elemement of @twikis, otherwise leave it out.
I suspect that the code would run against other wikis as well, but I haven't tested.
Runs under warn and taint without problems. |
#!/usr/bin/perl -wT
use strict;
use CGI qw(:standard);
use File::Find;
# base paths
my $searchpath = '/var/www/twiki/pub/';
my $doc_url = '/twiki/pub/';
my $twiki_url = '/twiki/bin/view/';
my $return_url = '/'; # link back to main twiki
# twikis that can be searched, incl. All
my @twikis = qw(
All CV Customers Dev Implementation Info Product Proton
RENLStar Sales Sandbox Support
);
# for popup_menu values=array element
# (stops user adding another directory)
my %twikis;
foreach(0..$#twikis){
$twikis{$_} = $twikis[$_];
}
# will hold final list of twikis to search (1 twiki unless All)
my @twikipaths;
# params from form
my $mode = param('mode'); # cgi app mode
my $isregex = param('isregex'); # regex search
my $cs = param('cs'); # case-sensitive
my $fname_only = param('fname_only'); # filenames only
# searchterm - allow anything, but eval regex ($msg for regex errs)
my ($msg, $searchterm) = ('','');
$searchterm = $1 if (defined param('searchterm') &&
param('searchterm') =~ /(.+)/);
if($isregex){
eval {'foobar' =~ /$searchterm/};
if($@){
$isregex = 0;
$@ =~ /([^;]*)/;
$msg = "<em>Bad Regex - non-Regex search performed: $1
(if you entered '*', you probably meant '.*')</em><br>";
}
}
# search_ext - allow anything, but eval (always regex)
my $search_ext = 'doc';
$search_ext = $1 if (defined param('search_ext') &&
param('search_ext') =~ /(.+)/);
eval{ 'foobar' =~ /$search_ext/};
if($@){
$search_ext = 'doc';
$@ =~ /([^;]*)/;
$msg .= "<em>Bad Extension Regex - searching for .doc: $1</em>";
}
# for display
my $nice_ext = '.' . (lc $search_ext);
my $title;
# translate array element from param to twiki name
my $twiki = $twikis[0];
$twiki = $twikis[$1] if (defined param('twiki') &&
param('twiki') =~ /^(\d+)$/) and defined $twikis[$1];
# for list of files found in search
my @matched_files;
# no search, just form
if(!$mode or $searchterm =~ /^\s*$/ or !$twiki){
$mode = 0;
$title = 'Twiki Attachment Search';
&SearchTop($title);
&SearchEnd();
}
# form + search + search results
elsif($mode){
# build list of twikis to search
if($twiki eq 'All'){
foreach(@twikis){
push @twikipaths, $searchpath . $_ . '/' if $_ ne 'All';
}
}
else{
push @twikipaths, $searchpath . $twiki . '/';
}
# handle regex/non-regex search, run search and print results
$searchterm = quotemeta $searchterm if !$isregex;
&DoSearch();
$title = 'Twiki Attachment Search Results';
&SearchTop($title)
&SearchResults();
&SearchEnd();
}
# page top
sub SearchTop(){
my $title = $_[0];
my $title2;
$title2 = "(Search for: $searchterm)" if $mode;
$title2 .= "(Extension: $nice_ext)";
$title2 .= "<br>(Twiki: $twiki)";
$title2 .= $isregex ? '(regex)' : '(non-regex)';
$title2 .= $cs ? '(case-sensitive)' : '(case-insensitive)';
$title2 .= $fname_only ? '(filenames only)' : '(filenames and conten
+t)';
print header();
print start_html($title);
&SearchForm($title, $title2);
}
# page end
sub SearchEnd(){
print end_html();
}
# search form, use numeric vals for twikis
sub SearchForm(){
print h2($_[0]);
print h4($_[1]);
print p(
a({-href=>$return_url},'Back to TWiki')
);
print p("<em>WARNING - Searching may take some time,
especially if you search all Twikis (although \"Filenames Only\" s
+hould
be okay).</em><br>",
"To return a list of all docs, use \".*\"
as a Regex with the \"Filenames Only\" and \"All\" options.<br>
The extension regex is not case-sensitive.");
print start_form('GET');
print p("Search For: ", textfield('searchterm', '',35), '.',
textfield('search_ext', 'doc',7),'(extension regex)'
);
print p(
a({-href=>'http://tinyurl.com/pcpbq',-target=>'_new'},"Regex: "),
checkbox('isregex',0,1, ''),
"Case Sensitive: ", checkbox('cs',0,1,''),
"Filenames Only: ", checkbox('fname_only',0,1,'')
);
print p("In TWikis: ",popup_menu('twiki',[0..$#twikis],0,\%twikis),
hidden('mode', 1));
print p(submit('Search'), ' ', CGI::reset());
print endform;
}
# handle search results (mainly building urls)
sub SearchResults(){
print h4('Warning'), '<p>', $msg, '</p>' if $msg;
my @filesout;
foreach(@matched_files){
my $file = $_;
if($file =~ m#^$searchpath([^/]*)(.*)/(.*)#){
my $shorttwiki = $1 . $2;
my $twikilink = $twiki_url . $shorttwiki;
my $filelink = $doc_url . $shorttwiki . '/' . $3;
my $filename = $3;
push @filesout, [$shorttwiki, $twikilink, $filelink, $filename];
}
}
print table({-border=>1},
Tr([
th(['Document', 'On TWiki']),
map{
td([
a({-href=>$$_[2]},$$_[3]), a({-href=>$$_[1]},$$_[0])
])
} @filesout
])
);
}
# perform search, checking if fname_only, cs, etc.,
# stripping non-printable ascii
sub DoSearch{
my $path = $searchpath;
$path .= $twiki;
find({wanted=>\&wanted,
untaint=>1,untaint_pattern=>'^([\040-\176]*)$',untaint_skip=>1},
@twikipaths);
sub wanted{
# ,v files are twiki attachment versions -
# we only check current version
if($_ !~ /,v$/i and /.+\.$search_ext$/i){
if($cs && /$searchterm/){
push @matched_files, $File::Find::name;
}
elsif(!$cs && /$searchterm/i){
push @matched_files, $File::Find::name;
}
elsif(!$fname_only){
open(DOC, $File::Find::name)||
die "Couldn't open $File::Find::name:$!\n";
THISFILE: while(my $line = <DOC>){
$line =~ s/[^\011\012\015\040-\176]//g;
if($cs && $line =~ /$searchterm/){
close DOC;
push @matched_files, $File::Find::name;
last THISFILE;
}
elsif(!$cs && $line =~ /$searchterm/i){
close DOC;
push @matched_files, $File::Find::name;
last THISFILE;
}
}
}
}
}
}
|