Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW

Attachment Search for TWiki

by Melly (Hermit)
on Oct 11, 2006 at 15:43 UTC ( #577627=sourcecode: print w/replies, xml ) Need Help??
Category: CGI Programming
Author/Contact Info Tom Melly

We have several internal twikis, and one of the problems we are now dealing with is that users have been uploading word documents and other attachments without describing them on a twiki page, or adding suitable key-words.

The consequence is that we often can't find useful docs since twiki doesn't search attachments... so I wrote this program to allow users to search inside attachments (or just their filenames) for useful content.

A few parameters need setting in the code, but hopefully they are clear enough as to what values they need (basically, $searchpath, $doc_url, $twiki_url, $return_url and @twikis). One final note - if you want the option to search all Twikis, make 'All' the first elemement of @twikis, otherwise leave it out.

I suspect that the code would run against other wikis as well, but I haven't tested.

Runs under warn and taint without problems.

#!/usr/bin/perl -wT

use strict;
use CGI qw(:standard);
use File::Find;

# base paths
my $searchpath = '/var/www/twiki/pub/';
my $doc_url = '/twiki/pub/';
my $twiki_url = '/twiki/bin/view/';
my $return_url = '/'; # link back to main twiki

# twikis that can be searched, incl. All
my @twikis = qw(
  All CV Customers Dev Implementation Info Product Proton
  RENLStar Sales Sandbox Support

# for popup_menu values=array element
# (stops user adding another directory)
my %twikis;
  $twikis{$_} = $twikis[$_];

# will hold final list of twikis to search (1 twiki unless All)
my @twikipaths;

# params from form
my $mode = param('mode'); # cgi app mode
my $isregex = param('isregex'); # regex search
my $cs = param('cs'); # case-sensitive
my $fname_only = param('fname_only'); # filenames only

# searchterm - allow anything, but eval regex ($msg for regex errs)
my ($msg, $searchterm) = ('','');
$searchterm = $1 if (defined param('searchterm') &&
  param('searchterm') =~ /(.+)/);

  eval {'foobar' =~ /$searchterm/};
    $isregex = 0;
    $@ =~ /([^;]*)/;
    $msg = "<em>Bad Regex - non-Regex search performed: $1
      (if you entered '*', you probably meant '.*')</em><br>";

# search_ext - allow anything, but eval (always regex)
my $search_ext = 'doc';
$search_ext = $1 if (defined param('search_ext') &&
  param('search_ext') =~ /(.+)/);
eval{ 'foobar' =~ /$search_ext/};
  $search_ext = 'doc';
  $@ =~ /([^;]*)/;
  $msg .= "<em>Bad Extension Regex - searching for .doc: $1</em>";

# for display
my $nice_ext = '.' . (lc $search_ext);
my $title;

# translate array element from param to twiki name
my $twiki = $twikis[0];
$twiki = $twikis[$1] if (defined param('twiki') &&
  param('twiki') =~ /^(\d+)$/) and defined $twikis[$1];

# for list of files found in search
my @matched_files;

# no search, just form
if(!$mode or $searchterm =~ /^\s*$/ or !$twiki){
  $mode = 0;
  $title = 'Twiki Attachment Search';
# form + search + search results
  # build list of twikis to search
  if($twiki eq 'All'){
      push @twikipaths, $searchpath . $_ . '/' if $_ ne 'All';
    push @twikipaths, $searchpath . $twiki . '/';

  # handle regex/non-regex search, run search and print results
  $searchterm = quotemeta $searchterm if !$isregex;
  $title = 'Twiki Attachment Search Results';

# page top
sub SearchTop(){
  my $title = $_[0];
  my $title2;
  $title2 = "(Search for: $searchterm)" if $mode;
  $title2 .= "(Extension: $nice_ext)";
  $title2 .= "<br>(Twiki: $twiki)";
  $title2 .= $isregex ? '(regex)' : '(non-regex)';
  $title2 .= $cs ? '(case-sensitive)' : '(case-insensitive)';
  $title2 .= $fname_only ? '(filenames only)' : '(filenames and conten
  print header();
  print start_html($title);
  &SearchForm($title, $title2);

# page end
sub SearchEnd(){
  print end_html();

# search form, use numeric vals for twikis
sub SearchForm(){
  print h2($_[0]);
  print h4($_[1]);
  print p(
    a({-href=>$return_url},'Back to TWiki')
  print p("<em>WARNING - Searching may take some time,
    especially if you search all Twikis (although \"Filenames Only\" s
    be okay).</em><br>",
    "To return a list of all docs, use \".*\"
    as a Regex  with the \"Filenames Only\" and \"All\" options.<br>
    The extension regex is not case-sensitive.");
  print start_form('GET');
  print p("Search For: ", textfield('searchterm', '',35), '.',
    textfield('search_ext', 'doc',7),'(extension regex)'
  print p(
    a({-href=>'',-target=>'_new'},"Regex: "),
    checkbox('isregex',0,1, ''),
    "Case Sensitive: ", checkbox('cs',0,1,''),
    "Filenames Only: ", checkbox('fname_only',0,1,'')
  print p("In TWikis: ",popup_menu('twiki',[0..$#twikis],0,\%twikis),
    hidden('mode', 1));
  print p(submit('Search'), ' ', CGI::reset());
  print endform;

# handle search results (mainly building urls)
sub SearchResults(){
  print h4('Warning'), '<p>', $msg, '</p>' if $msg;
  my @filesout;
    my $file = $_;
    if($file =~ m#^$searchpath([^/]*)(.*)/(.*)#){
      my $shorttwiki = $1 . $2;
      my $twikilink =  $twiki_url . $shorttwiki;
      my $filelink = $doc_url . $shorttwiki . '/' . $3;
      my $filename = $3;
      push @filesout, [$shorttwiki, $twikilink, $filelink, $filename];
  print table({-border=>1},
      th(['Document', 'On TWiki']),
            a({-href=>$$_[2]},$$_[3]), a({-href=>$$_[1]},$$_[0])
      } @filesout

# perform search, checking if fname_only, cs, etc.,
# stripping non-printable ascii
sub DoSearch{
  my $path = $searchpath;
  $path .= $twiki;

  sub wanted{
    # ,v files are twiki attachment versions -
    # we only check current version
    if($_ !~ /,v$/i and /.+\.$search_ext$/i){
      if($cs && /$searchterm/){
        push @matched_files, $File::Find::name;
      elsif(!$cs && /$searchterm/i){
        push @matched_files, $File::Find::name;
        open(DOC, $File::Find::name)||
          die "Couldn't open $File::Find::name:$!\n";
        THISFILE: while(my $line = <DOC>){
          $line =~ s/[^\011\012\015\040-\176]//g;
          if($cs && $line =~ /$searchterm/){
            close DOC;
            push @matched_files, $File::Find::name;
            last THISFILE;
          elsif(!$cs && $line =~ /$searchterm/i){
            close DOC;
            push @matched_files, $File::Find::name;
            last THISFILE;

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://577627]
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others exploiting the Monastery: (6)
As of 2021-01-20 10:44 GMT
Find Nodes?
    Voting Booth?