comment on

I have created a little perl script that searches documents for a kind of binary string and lists the files that contain that exact string. Can you make suggestions on how I could improve this program or what I could have done to make this better? Also, if you see some obvious bugs in my program, would you please tell me? Thank you! Here is my script:

Edit: Okay, as requested, I am inserting the code here instead of just a link. And thank you for your ideas and advice. I will read through them more carefully tomorrow and try to implement them! Thank you!


#!/usr/bin/perl

#
# This Perl program looks for a certain binary string
# in a file and lists the files found and the total number
# of files found. I tested this script in TinyPerl 5.8.
#
#################################################################

use strict;
use warnings;

my $FIND            = ".\nThis is";
my $CASE_SENSITIVE  = 0;
my $FOLDER          = 'H:\\HOME';
my $RECURSIVE       = 1;
my @INCLUDE         = qw( *.txt *.doc *.xls *.ppt *.log *.htm *.html *
+.xml *. *.eml *.pdf *.rtf *.bas *.c *.h *.pl *.cgi *.bak *.reg *.ini 
+*.inf *.ion *.1st *.lst *.pas *.py *.cpp *.php *.xfm *.js *.asm *.rt 
+*.old );
my $VERBOSE         = 0;
my $PAGE_BREAK      = 0;

################################################################# Main
###
###    PROGRAM STARTS HERE:
##
#

my $TOTAL = 0;
my $BUFF_SIZE = 4000000;
my $LINUX = (index(uc($^O), 'WIN') < 0) ? 1 : 0;
$CASE_SENSITIVE or $FIND = uc($FIND);
CheckDIR($FOLDER);
print "\n", '-' x 72;
print "\n $TOTAL file(s) found with matching string\n";
exit;

################################################################# Sear
+chFile
#
# This function is automatically called by CheckDIR() every time
# a file is found. This function gets the path & name of the file.
#
#   Usage: SearchFile(PATH, NAME) <-- Called by CheckDIR()
#
sub SearchFile
{
  my $PATH = $_[0];
  my $NAME = $_[1];
  my $FULLNAME = "$PATH$NAME";

  (-f $FULLNAME) or return;  # not a plain file?
  my $L = -s $FULLNAME;      # Get file size
  $L or return;              # file size == 0?

  # Is the search string longer than
  # the file we're searching?
  $L >= length($FIND) or return;

  my $F;
  my $START = 0;
  my $FOUND = 0;
  my $BUFFER;
  open $F, "<$FULLNAME" or return;
  binmode $F;
  while ($START < $L)
  {
    if ($START > $L) { last; }
    read $F, $BUFFER, $BUFF_SIZE, $START;
    if (defined $BUFFER)
    {
      if (!$CASE_SENSITIVE) { $BUFFER = uc($BUFFER); }
      if (index($BUFFER, $FIND) >= 0)
      { $FOUND++; last; }
    }
    else
    { last; }
    if (length($BUFFER) < $BUFF_SIZE) { last; }
    $START += $BUFF_SIZE - length($FIND);
  }
  close $F;
  if ($FOUND)
  {
    $TOTAL++;
    if ($PAGE_BREAK)
    {
      if ($TOTAL % $PAGE_BREAK == 0)
      { print "\nPress ENTER to continue..."; <STDIN>; }
    }
    PRINT("FOUND: $FULLNAME");
  }
}
################################################################# Chec
+kDIR
#
# This function reads the contents of a folder and calls
# SearchFile() for each file that was found.
#
#   Usage: CheckDIR(PATH)
#
sub CheckDIR
{
  @_ or return;
  my $FULLNAME;
  my $PATH = shift;
  defined $PATH or return;
  length($PATH) or return;

  if ($VERBOSE) { PRINT("SEARCHING: ", FormatPath($PATH)); }
  $PATH = AddSeparator($PATH);
  opendir(my $DIR, $PATH) or return;
  while (my $NAME = readdir $DIR)
  {
    $FULLNAME = "$PATH$NAME";
    if (-d($FULLNAME))
    {
      # Check into subdirectory if RECURSIVE == 1
      # Skip directory if its name starts with "."
      if ($RECURSIVE)
      { CheckDIR($FULLNAME) unless (vec($NAME, 0, 8) == 46); }
      next;
    }

    # Scan file contents only if the file name
    # matches a certain pattern.
    foreach my $W (@INCLUDE)
    {
      if (isMatch($NAME, $W))
      { SearchFile($PATH, $NAME); }
    }
  }
  closedir $DIR;
}
################################################################# isMa
+tch
#
# This function returns 1 if a filename matches a certain
# wildcard pattern. There may be several question marks in
# the search pattern, but only one asterisk is allowed!
# The asterisk may be at the beginning, the end, or in the
# middle of the pattern. A question mark matches any
# character. An asterisk matches zero or more characters.
# The matching is NOT case sensitive!
#
#   Usage: INTEGER = isMatch(FILENAME, WILDCARD)
#
# Example: isMatch("New_Document.txt", "n*.txt") ---> 1
#
sub isMatch
{
  @_ > 1 or return 0;
  my $F = shift; defined $F or return 0; length($F) or return 0;
  my $W = shift; defined $W or return 0; length($W) or return 0;
  $F = uc($F);
  $W = uc($W);

  # If there aren't any wildcards at all...
  if (CountChars($W, '*?') == 0) { return ($F eq $W) ? 1 : 0; }

  # Match what's before the asterisk...
  return 0 unless (_isMatch($F, $W, 1));

  # Match what comes after the asterisk...
  return _isMatch($F, $W, -1);
}
################################################################# _isM
+atch
#
# This private function is called by isMatch()
# This function compares two strings and returns 1 if
# both strings match until the first asterisk.
# This function can start comparing strings starting
# from the beginning or starting from the end!
# DIRECTION must be either 1 or -1.
#
#   Usage: INTEGER = _isMatch(FILENAME, WILDCARD, DIRECTION)
#
sub _isMatch
{
  my $F = shift; my $f; my $LF = length($F)-1;
  my $W = shift; my $w; my $LW = length($W)-1;
  my $DIR = shift;
  my $STOP = $LW; my $START = 0; my $FSTART = 0;
  if ($DIR < 0) { $STOP = 0; $START = $LW; $FSTART = $LF; }
  while ($START != $STOP)
  {
    $w = vec($W, $START, 8);  # Grab byte from wildcard pattern
    $f = vec($F, $FSTART, 8); # Grab byte from filename
    $START += $DIR;
    $FSTART += $DIR;
    if ($w == 42)   # ASTERISK?
    { return 1; }
    else
    {
      # If the character is "?" then skip, but if
      # it's not "?", then the characters must match.
      if ($w != 63) { ($f == $w) or return 0; }
    }
  }
  return 1;
}
################################################################# PRIN
+T
#
# This function prints the total number of matches found
# so far followed by some text and a new line character.
#
#   Usage: PRINT(STRING)
#
sub PRINT { print ' ', RSPACE(8, $TOTAL), @_, "\n"; }
################################################################# Form
+atPath
#
# This function replaces the \\ or / characters in a path
# to match the current OS's separator character.
#
#   Usage: PATH = FormatPath(PATH)
#
sub FormatPath
{
  @_ or return '';
  my $PATH = $_[0];
  length($PATH) or return '';
  if ($LINUX)
  { $PATH =~ tr|\\|/|; }
  else
  { $PATH =~ tr|/|\\|; }
  return $PATH;
}
################################################################# AddS
+eparator
#
# This function makes sure that the given path ends with
# a / or \ depending on the current OS.
#
#   Usage: PATH = AddSeparator(PATH)
#
sub AddSeparator
{
  @_ or return '';
  my $PATH = $_[0];
  length($PATH) or return '';
  my $c = vec($PATH, length($PATH)-1, 8);
  return FormatPath($PATH . (($c == 47 || $c == 92) ? '' : '/') );
}
################################################################# RSPA
+CE
#
# This function adds spaces to the end of a string list to
# make sure it's exactly N characters long. If the string
# is longer than N, then just returns the string itself.
#
#   Usage: STRING = RSPACE(N, STRING, [STRINGS...])
#
sub RSPACE
{
  @_ > 1 or return '';
  my $LEN = shift;
  my $S = join('', @_);
  my $L = length($S);
  $L < $LEN or return $S;
  return $S . (' ' x ($LEN - $L));
}
################################################################# Coun
+tChars
#
# This function counts how many times STRING contains
# any of the characters of SUBSTR.
#
#   Usage: INTEGER = CountChars(STRING, SUBSTR)
#
sub CountChars
{
  @_ > 1 or return 0;
  my $S = shift; defined $S or return 0; length($S) or return 0;
  my $L = shift; defined $L or return 0; length($L) or return 0;
  my $P;
  my $c;
  my $i = length($L);
  my $N = 0;
  while ($i-- > 0)
  {
    $P = 0;
    $c = substr($L, $i, 1);
    while (($P = 1+index($S, $c, $P)) > 0)
    { $N++; }
  }
  return $N;
}
#################################################################
[download]

Btw I tested the program, and it seems to work fine.

In reply to Perl program to search files by harangzsolt33

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


more useful options
	PerlMonks