Beefy Boxes and Bandwidth Generously Provided by pair Networks
Don't ask to ask, just ask
 
PerlMonks  

Directory Tree Comparison Module (File::DiffTree)

by bikeNomad (Priest)
on Jun 23, 2001 at 00:23 UTC ( [id://90858]=sourcecode: print w/replies, xml ) Need Help??
Category: Utility Scripts
Author/Contact Info bikeNomad Ned Konz, perl@bike-nomad.com
Description: This is a package that I wrote after seeing some other scripts here that did similar things. This package allows the behavior on same/different files as well as comparison to be pluggable using CODE references. It may become a CPAN module if the response here is positive enough. An example program that uses it is at the end.
use strict;
package File::DiffTree;
# File::DiffTree -- Compare two directory hierarchies
# By Ned Konz, perl@bike-nomad.com
# $Revision$

use vars '$VERSION';
use Algorithm::Diff 1.01 ();
use File::Find ();

BEGIN {
    $File::DiffTree::VERSION = "0.1";
}

sub _findFiles
{
    my $topDir = shift;
    my $statFields = shift;
    my $reject = shift;

    my $topDirLength = length($topDir);
    my @files;
    File::Find::find( sub {
        my @stat = stat($_);
        if (!@stat) { warn "can't stat $File::Find::name : $!\n"; retu
+rn }
        return if -d _;
        my $fileName = substr($File::Find::name, $topDirLength);
        return if defined($reject) && &$reject($File::Find::name, @sta
+t);
        push(@files, [ $fileName, @stat[ @$statFields ] ]);
    }, $topDir);
    return \@files;
}

sub diffTree
{
    my $dirA = shift;
    my $dirB = shift;
    my $userOptions = shift || { };
    my $numberOfFields;
    my $numberOfSignificantFields;
    my $foldCase = sub { $_[0] };       # default no fold
    my %options = (
        onlya           => sub { },
        onlyb           => sub { },
        match           => sub { },
        statfields      => [ 7, 9 ],    # size, mtime
        hash            => sub {        # default=stringize them
            my $arr = shift;
            join($;, $foldCase->($arr->[0]),
                @{$arr}[ 1 .. $numberOfSignificantFields ])
        },
        reject          => undef,
        significantfields => undef,
        foldcase        => 0,
        sort        => sub { $foldCase->($a->[0]) cmp $foldCase->($b->
+[0]) },
        # normalize the user's keys (DWIM)
        map { my $key = $_;
            $key =~ tr/A-Z_/a-z/d;
            $key, $userOptions->{$_}
        } keys(%$userOptions)
    );

    $numberOfFields = scalar(@{$options{statfields}});
    $numberOfSignificantFields = defined($options{significantfields})
        ? $options{significantfields}
        : $numberOfFields;
    $numberOfSignificantFields = $numberOfFields
        if $numberOfSignificantFields > $numberOfFields;
    $foldCase = sub { lc($_[0]) } if $options{foldcase};

    my $filesA = _findFiles($dirA, $options{statfields}, $options{reje
+ct});
    my $filesB = _findFiles($dirB, $options{statfields}, $options{reje
+ct});

    # sort by name
    @$filesA = sort { $options{sort}->() } @$filesA;
    @$filesB = sort { $options{sort}->() } @$filesB;

    Algorithm::Diff::traverse_sequences(
        $filesA, $filesB, {
            MATCH       => sub {
                $options{match}->($filesA->[$_[0]], $filesB->[$_[1]]) 
+},
            DISCARD_A   => sub { $options{onlya}->($filesA->[$_[0]]) }
+,
            DISCARD_B   => sub { $options{onlyb}->($filesB->[$_[1]]) }
+,
        }, $options{hash} );
}  

1;
__END__

=head1 NAME

File::DiffTree - Compare two directory hierarchies

=head1 SYNOPSIS

  use File::DiffTree;

  File::DiffTree::diffTree($dirA, $dirB, {
    Match => sub { print $_[0]->[0], " matches\n" },
    # Fold_Case => 1,  # if on OS that doesn't care like windoze
  });

=head1 DESCRIPTION

C<File::DiffTree> compares the files in two directory hierarchies, cal
+ling
optional user-supplied callbacks for files in just one or the other di
+rectory,
as well as for files that match.

Matching is determined by matching the name (with optional case foldin
+g), as
well as zero or more of the numbers output by the C<stat()> call. You 
+can
specify how many fields from C<stat> will be looked at for a match.

You can also specify how many fields from stat will be provided to you
+r
callback routines. See OPTIONS below for the options to
C<File::DiffTree::diffTree>.

=head1 OPTIONS

The third argument to File::DiffTree::diffTree is a hash reference
that can contain the following options. Option names may have undersco
+res or
capital letters as desired (that is, OnlyA, O_n_L_ya, Only_A, onlya, a
+nd only_a
are equivalent).

Since nothing by default is done for B<only_a>, B<only_b>, or B<match>
+, you
must provide at least one of these for any interesting behavior.

=over 4

=item B<only_a>

=item B<only_b>

The B<only_a> and B<only_b> options supply CODE references to user cal
+lback
routines that are called when a file appears in only one of the two di
+rectory
trees, or exists in both but has different significant stat fields.

By default, nothing is done for these files.

The argument to these routines is an array that contains the filename 
+relative
to the starting directory, as well as whatever fields from stat were d
+efaulted
or specified with the B<stat_fields> option.

  File::DiffTree::diffTree( $dir1, $dir2, {
      only_a => sub { print "only in $dir1: ", $_[0]->[0], "\n" },
      only_b => sub { print "only in $dir2: ", $_[0]->[0], "\n" },
  });

Of course, you can also specify a reference to a separate subroutine t
+hat you've
written:

  File::DiffTree::diffTree( $dir1, $dir2, {
      only_a => \&onlyA,
      only_b => \&onlyB,
  });

=item B<match>

The B<match> option supplies a CODE reference to a user callback routi
+ne that
is called when a file appears to match (based on name and significant 
+fields
from the stat call).

By default, nothing is done for these files.

The arguments to the B<match> routine are two arrays (one for each dir
+ectory)
that contain the filename relative to the starting directory, as well 
+as
whatever fields from stat were defaulted or specified with the B<stat_
+fields>
option.

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ", $_[0]->[0], "\
+n" },
  });

=item B<stat_fields>

The B<stat_fields> option specifies which fields from C<stat> will be
passed to the B<only_a>, B<only_b>, or B<match> user callbacks. This i
+s
an ARRAY reference consisting of numbers from 0 through 12. By default
+,
it is:

  stat_fields => [ 7, 9 ],

That is, the size and mtime (last modified time) of the files are pass
+ed.
The possible field numbers are:

   0 dev      device number of filesystem
   1 ino      inode number
   2 mode     file mode  (type and permissions)
   3 nlink    number of (hard) links to the file
   4 uid      numeric user ID of file's owner
   5 gid      numeric group ID of file's owner
   6 rdev     the device identifier (special files only)
   7 size     total size of file, in bytes
   8 atime    last access time in seconds since the epoch
   9 mtime    last modify time in seconds since the epoch
  10 ctime    inode change time (NOT creation time!) in seconds since 
+the epoch
  11 blksize  preferred block size for file system I/O
  12 blocks   actual number of blocks allocated

If you want to compare only the name and size, but still have access t
+o the
modification time and inode, you can specify this using:

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ",
          $_[0]->[0], "\n" },
      stat_fields => [ 7, 9, 1 ], # size, mtime, inode
      significant_fields => 1,    # just size
  });

Unless the B<significant_fields> option below is specified, all of the
B<stat_fields> will be considered when looking for a match. So by defa
+ult,
file comparisons compare name, size, and modification time.

=item B<significant_fields>

The B<significant_fields> option is a number that specifies how many o
+f
the fields from C<stat> will be considered when comparing files. By de
+fault,
all of the fields will be compared. If you supply a 0 for B<significan
+t_fields>,
only the name will be compared.

This option is provided so that you can have separate control over how
+ many
fields from C<stat> you are passed and how many of those fields are co
+mpared
by C<File::DiffTree>.

=item B<reject>

The B<reject> option is a CODE reference that can be provided to filte
+r files
that are unwanted. It is called from inside C<File::Find::find> with t
+he
full filename and all the fields from C<stat> (this is 13 arguments).

Also, the C<$_> variable is set to the last component of the filename,
the current directory is the directory of the file, and the C<_> pseud
+o-
file handle can be tested.

If it returns true, the file will not be considered.

For instance, to ignore files that are unreadable or end in C<.bak>,
you can do this:

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ",
          $_[0]->[0], "\n" },
      reject => sub { /\.bak$/ || ! -r _ } 
  });

=item B<fold_case>

If the B<fold_case> option is provided and is true, filenames will be
compared ignoring case differences. The filenames passed to the user
callbacks will have the actual case preserved.

This is probably what is wanted under Windows. You can do this for
portability:

    Fold_Case => ($^O eq 'Win32'),

=item B<sort>

The B<sort> option is a CODE reference that supplies an optional subro
+utine
that will be called when sorting the lists of files. It will have the 
+two arrays
to be compared passed in via the package variables C<$File::DiffTree::
+a> and
C<$File::DiffTree::b>. By default, sorting is by filename, with case f
+olding
if the B<fold_case> option is set.

You probably won't need this option. If you do, you may have to supply
+ the
B<hash> option as well.

=item B<hash>

The B<hash> option is a CODE reference that supplies an optional subro
+utine
that will be called to generate a key to determine uniqueness of the f
+iles.
By default, this key will consist of the file name, and all the stat f
+ields
specified by the B<significant_fields> option, turned into strings and
separated by the C<$;> character (by default C<\034>). Specify the B<h
+ash>
option if you need to do something different. The argument to this sub
+routine
is an array reference like those passed to the B<only_a> and B<match> 
+subroutines.

You shouldn't need this option. If you do, you'll probably have to sup
+ply
the B<sort> option as well.

=back

=head2 EXPORT

File::DiffTree doesn't export anything. Typing is good for you. Call d
+iffTree
as File::DiffTree::diffTree .

=head1 AUTHOR

By Ned Konz, perl@bike-nomad.com.

=head1 LICENSE

This module is licensed under the same license as Perl itself.

=head1 SEE ALSO

perl(1).

L<Algorithm::Diff>

=cut

# vim: ts=4 sw=4

Here's a little demo program:

#!/usr/bin/perl -w
# This is a demo program to show the use of File::DiffTree
# by Ned Konz
use strict;

use File::DiffTree;
use File::Compare;

if (@ARGV != 2) {
    print STDERR "usage: $0 dir1 dir2\n";
    exit(1);
}

my $dirA = shift;
my $dirB = shift;

sub onlyA { print "Only in $dirA: ", $_[0]->[0], "\n" }

sub onlyB { print "Only in $dirB: ", $_[0]->[0], "\n" }

# This will be called if names match.
# The file size or contents (or mtimes or inodes) could still be diffe
+rent.
sub match {
    my $arr1 = shift;
    my $arr2 = shift;
    my $fn1 = $dirA . $arr1->[0];
    my $fn2 = $dirB . $arr2->[0];
    my $compare = $arr1->[1] <=> $arr2->[1];    # different if sizes d
+iffer
    if (! $compare)
    {
        my $retval = File::Compare::compare($fn1, $fn2);
        if ($retval == -1) {
            print STDERR "Problems opening $fn1 or $fn2: $!\n";
            return;
        }
        $compare = $retval;
    }
    if (! $compare) {
        print "Match: ", join('|', @$arr1), "\t", join('|', @$arr2), "
+\n"
    }
    else {
        print "Different: ", join('|', @$arr1), "\t", join('|', @$arr2
+), "\n"
    }
}

File::DiffTree::diffTree($dirA, $dirB, {
    Only_A              => \&onlyA,
    Only_B              => \&onlyB,
    Match               => \&match,
    Significant_Fields  => 0,   # just name (not size, mtime or inode)
    Reject              => sub { /(?:~|\.bak|\.tmp)$/ || ! -r _ },
    Fold_Case => ($^O eq 'Win32'),      # if on OS that doesn't care l
+ike windoze
});

# vim: ts=4 sw=4

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://90858]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others admiring the Monastery: (5)
As of 2024-04-18 02:03 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found