Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW
 
PerlMonks  

Attachment Search for TWiki

by Melly (Hermit)
on Oct 11, 2006 at 15:43 UTC ( #577627=sourcecode: print w/replies, xml ) Need Help??
Category: CGI Programming
Author/Contact Info Tom Melly tom@tomandlu.co.uk
Description:

We have several internal twikis, and one of the problems we are now dealing with is that users have been uploading word documents and other attachments without describing them on a twiki page, or adding suitable key-words.

The consequence is that we often can't find useful docs since twiki doesn't search attachments... so I wrote this program to allow users to search inside attachments (or just their filenames) for useful content.

A few parameters need setting in the code, but hopefully they are clear enough as to what values they need (basically, $searchpath, $doc_url, $twiki_url, $return_url and @twikis). One final note - if you want the option to search all Twikis, make 'All' the first elemement of @twikis, otherwise leave it out.

I suspect that the code would run against other wikis as well, but I haven't tested.

Runs under warn and taint without problems.

#!/usr/bin/perl -wT

use strict;
use CGI qw(:standard);
use File::Find;

# base paths
my $searchpath = '/var/www/twiki/pub/';
my $doc_url = '/twiki/pub/';
my $twiki_url = '/twiki/bin/view/';
my $return_url = '/'; # link back to main twiki

# twikis that can be searched, incl. All
my @twikis = qw(
  All CV Customers Dev Implementation Info Product Proton
  RENLStar Sales Sandbox Support
);

# for popup_menu values=array element
# (stops user adding another directory)
my %twikis;
foreach(0..$#twikis){
  $twikis{$_} = $twikis[$_];
}

# will hold final list of twikis to search (1 twiki unless All)
my @twikipaths;

# params from form
my $mode = param('mode'); # cgi app mode
my $isregex = param('isregex'); # regex search
my $cs = param('cs'); # case-sensitive
my $fname_only = param('fname_only'); # filenames only

# searchterm - allow anything, but eval regex ($msg for regex errs)
my ($msg, $searchterm) = ('','');
$searchterm = $1 if (defined param('searchterm') &&
  param('searchterm') =~ /(.+)/);

if($isregex){
  eval {'foobar' =~ /$searchterm/};
  if($@){
    $isregex = 0;
    $@ =~ /([^;]*)/;
    $msg = "<em>Bad Regex - non-Regex search performed: $1
      (if you entered '*', you probably meant '.*')</em><br>";
  }
}

# search_ext - allow anything, but eval (always regex)
my $search_ext = 'doc';
$search_ext = $1 if (defined param('search_ext') &&
  param('search_ext') =~ /(.+)/);
eval{ 'foobar' =~ /$search_ext/};
if($@){
  $search_ext = 'doc';
  $@ =~ /([^;]*)/;
  $msg .= "<em>Bad Extension Regex - searching for .doc: $1</em>";
}

# for display
my $nice_ext = '.' . (lc $search_ext);
my $title;

# translate array element from param to twiki name
my $twiki = $twikis[0];
$twiki = $twikis[$1] if (defined param('twiki') &&
  param('twiki') =~ /^(\d+)$/) and defined $twikis[$1];

# for list of files found in search
my @matched_files;

# no search, just form
if(!$mode or $searchterm =~ /^\s*$/ or !$twiki){
  $mode = 0;
  $title = 'Twiki Attachment Search';
  &SearchTop($title);
  &SearchEnd();
}
# form + search + search results
elsif($mode){
  # build list of twikis to search
  if($twiki eq 'All'){
    foreach(@twikis){
      push @twikipaths, $searchpath . $_ . '/' if $_ ne 'All';
    }
  }
  else{
    push @twikipaths, $searchpath . $twiki . '/';
  }

  # handle regex/non-regex search, run search and print results
  $searchterm = quotemeta $searchterm if !$isregex;
  &DoSearch();
  $title = 'Twiki Attachment Search Results';
  &SearchTop($title)
  &SearchResults();
  &SearchEnd();
}

# page top
sub SearchTop(){
  my $title = $_[0];
  my $title2;
  $title2 = "(Search for: $searchterm)" if $mode;
  $title2 .= "(Extension: $nice_ext)";
  $title2 .= "<br>(Twiki: $twiki)";
  $title2 .= $isregex ? '(regex)' : '(non-regex)';
  $title2 .= $cs ? '(case-sensitive)' : '(case-insensitive)';
  $title2 .= $fname_only ? '(filenames only)' : '(filenames and conten
+t)';
  print header();
  print start_html($title);
  &SearchForm($title, $title2);
}

# page end
sub SearchEnd(){
  print end_html();
}

# search form, use numeric vals for twikis
sub SearchForm(){
  print h2($_[0]);
  print h4($_[1]);
  print p(
    a({-href=>$return_url},'Back to TWiki')
  );
  print p("<em>WARNING - Searching may take some time,
    especially if you search all Twikis (although \"Filenames Only\" s
+hould
    be okay).</em><br>",
    "To return a list of all docs, use \".*\"
    as a Regex  with the \"Filenames Only\" and \"All\" options.<br>
    The extension regex is not case-sensitive.");
  print start_form('GET');
  print p("Search For: ", textfield('searchterm', '',35), '.',
    textfield('search_ext', 'doc',7),'(extension regex)'
  );
  print p(
    a({-href=>'http://tinyurl.com/pcpbq',-target=>'_new'},"Regex: "),
    checkbox('isregex',0,1, ''),
    "Case Sensitive: ", checkbox('cs',0,1,''),
    "Filenames Only: ", checkbox('fname_only',0,1,'')
  );
  print p("In TWikis: ",popup_menu('twiki',[0..$#twikis],0,\%twikis),
    hidden('mode', 1));
  print p(submit('Search'), ' ', CGI::reset());
  print endform;
}

# handle search results (mainly building urls)
sub SearchResults(){
  print h4('Warning'), '<p>', $msg, '</p>' if $msg;
  my @filesout;
  foreach(@matched_files){
    my $file = $_;
    if($file =~ m#^$searchpath([^/]*)(.*)/(.*)#){
      my $shorttwiki = $1 . $2;
      my $twikilink =  $twiki_url . $shorttwiki;
      my $filelink = $doc_url . $shorttwiki . '/' . $3;
      my $filename = $3;
      push @filesout, [$shorttwiki, $twikilink, $filelink, $filename];
    }
  }
  print table({-border=>1},
    Tr([
      th(['Document', 'On TWiki']),
      map{
        td([
            a({-href=>$$_[2]},$$_[3]), a({-href=>$$_[1]},$$_[0])
        ])
      } @filesout
    ])
  );
}

# perform search, checking if fname_only, cs, etc.,
# stripping non-printable ascii
sub DoSearch{
  my $path = $searchpath;
  $path .= $twiki;
  find({wanted=>\&wanted,
    untaint=>1,untaint_pattern=>'^([\040-\176]*)$',untaint_skip=>1},
    @twikipaths);

  sub wanted{
    # ,v files are twiki attachment versions -
    # we only check current version
    if($_ !~ /,v$/i and /.+\.$search_ext$/i){
      if($cs && /$searchterm/){
        push @matched_files, $File::Find::name;
      }
      elsif(!$cs && /$searchterm/i){
        push @matched_files, $File::Find::name;
      }
      elsif(!$fname_only){
        open(DOC, $File::Find::name)||
          die "Couldn't open $File::Find::name:$!\n";
        THISFILE: while(my $line = <DOC>){
          $line =~ s/[^\011\012\015\040-\176]//g;
          if($cs && $line =~ /$searchterm/){
            close DOC;
            push @matched_files, $File::Find::name;
            last THISFILE;
          }
          elsif(!$cs && $line =~ /$searchterm/i){
            close DOC;
            push @matched_files, $File::Find::name;
            last THISFILE;
          }
        }
      }
    }
  }
}

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://577627]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others exploiting the Monastery: (6)
As of 2021-01-20 10:44 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Notices?