Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

Duplicate file bounty hunter

by djw (Vicar)
on Apr 24, 2002 at 00:42 UTC ( [id://161492]=sourcecode: print w/replies, xml ) Need Help??
Category: Utility Scripts
Author/Contact Info djw - djw@perldev.org
Description: This recursively searches a directory for duplicate files larger than a specified minimum size (default: 100 MB). It logs everything and makes you dinner.

Enjoy, djw

*update* 04.24.2002
After some suggestions by fellow monks, I have changed this to use Digest::MD5. Thanks everyone++.

djw
#!/usr/bin/perl -w
use strict;

use File::stat;
use Digest::MD5;
use File::Find  qw(finddepth);
use Time::HiRes qw(gettimeofday);

# -----------
# Size thresholds, in bytes.
#   1024 x 1024 = 1048576 bytes = 1 MB

use constant ONEMEGABYTE => 1024 * 1024;         # 1 MB
use constant MINFILESIZE => 100 * ONEMEGABYTE;   # 100 MB

# Package globals shared by search(), write() and the LOG format at the
# bottom of the file:
#   %fileInfo   - per-digest record: [0]{filename}, [1]{size}, [2]{count}
#   $number     - size in MB of the entry currently being logged
#   $totalSpace - running total of duplicated space, in MB
our (%fileInfo, $number, $totalSpace);

# $totalFiles counts every regular file visited by search();
# $duplicateFiles counts the digests that write() reports as duplicated.
# Both start at zero (a scalar 0 on the right-hand side would only
# initialize the first variable of the list).
my ($totalFiles, $duplicateFiles) = (0, 0);

# The directory to scan is the first command-line argument; without one,
# usage() prints help and exits.
my $dir = shift || usage();

print STDOUT "\nRunning.  This could take a few minutes....";

# -----------
# Enable autoflush on the currently selected handle (STDOUT at this
# point), so the progress message above is pushed out before the long
# search starts.

$| = 1;

# -----------
# Redirecting standard error output to 'error.log' - can
# get large if there are permission issues during the
# recursive search.  OLDERR keeps a duplicate of the original
# STDERR open until the end of the run.

open(OLDERR, ">&STDERR")       || die "Can't dup STDERR: ($!)\n";
open(STDERR, '>', "error.log") || die "Can't redirect STDERR: ($!)\n";

# From here on, a print without an explicit handle goes to error.log.
select(STDERR);

# -----------
# I wanted to see how long it would take for this
# to search through large volumes.
#
# 89.2 minutes to search through a mounted drive
# (130 GB of data) over a 100mbit switched network.
# Found 4 duplicates that were over 100 MB in size.
#
# 812.5 MB of total duplicated space.

# In scalar context gettimeofday returns seconds as a floating-point number.
my $beginRun = gettimeofday;

finddepth \&search, $dir;

my $endRun  = gettimeofday;
my $runTime = $endRun - $beginRun;

# -----------
# translate seconds into appropriate time for display
# later. precise? nah...
#
# The largest unit has to be tested first: anything above 3600 seconds
# is also above 60, so testing for minutes first would make the hours
# branch unreachable.

if ($runTime > 3600) {
    $runTime = sprintf("%.2f hours", $runTime / 3600);
} elsif ($runTime > 60) {
    $runTime = sprintf("%.2f minutes", $runTime / 60);
} else {
    $runTime = sprintf("%.2f seconds", $runTime);
}

print STDOUT "Complete.\n";

# -----------
# This writes file info to our 'duplicates.log' file.
# [filename], [size], [quantity] (greater than 1)
#
# The leading '&' is required: a plain write() would call Perl's
# built-in format writer instead of the sub defined in this file.

&write();

close(STDERR);
close(OLDERR);

sub search {
    # -----------
    # The Meat (tm).
    #
    # File::Find callback, invoked once per directory entry with the
    # entry's name in $_.  Every regular file is counted in $totalFiles.
    # Files larger than MINFILESIZE are additionally fingerprinted with
    # MD5: the first file seen for a digest creates a new %fileInfo
    # record, later files with the same digest only bump its counter.
    #
    # A file that cannot be stat'ed, opened or read is reported with
    # warn() and skipped, so one bad file does not abort the whole scan
    # or get hashed through an unopened handle.
    #
    # Takes no arguments and returns nothing useful.

    return unless -f $_;

    # Counted regardless of size, so the final report can show
    # "duplicates / all files seen".
    $totalFiles++;

    my $info = stat($_);
    unless ($info) {
        warn "Can't stat file ($_): ($!)\n";
        return;
    }

    my $fsize = $info->size;
    return unless $fsize > MINFILESIZE;

    # Three-argument open with a lexical handle: the name is taken
    # literally, so leading/trailing whitespace or characters such as
    # '>' and '|' in a filename cannot change how the file is opened.
    my $fh;
    unless (open($fh, '<', $_)) {
        warn "Can't open file ($_): ($!)\n";
        return;
    }
    binmode($fh);

    # addfile() croaks when reading fails; trap that per file.
    my $md5hash = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    my $readError = $@;
    close($fh);

    unless (defined $md5hash) {
        warn "Can't read file ($_): ($readError)\n";
        return;
    }

    if (exists($fileInfo{$md5hash})) {
        $fileInfo{$md5hash}[2]{count} += 1;
    } else {
        # NOTE(review): $_ is whatever name File::Find hands to the
        # callback; with the default chdir behaviour that appears to be
        # the bare file name without its directory - confirm whether the
        # log should carry $File::Find::name instead.
        $fileInfo{$md5hash}[0]{filename} = $_;
        $fileInfo{$md5hash}[1]{size}     = $fsize;
        $fileInfo{$md5hash}[2]{count}    = 1;
    }

    return;
}


sub write {
    # -----------
    # Report the duplicates collected in %fileInfo.
    #
    # Drops every digest that was seen only once, then writes one line
    # per remaining digest to 'duplicates.log' through the LOG / LOG_TOP
    # formats, and prints a summary (duplicate count, runtime, duplicated
    # space in MB) to STDOUT.  Dies if the log file cannot be created.
    #
    # Takes no arguments.  Must be called as &write() - a plain write()
    # is Perl's built-in format writer.

    # Prune digests that belong to a single file only.
    foreach my $digest (keys %fileInfo) {
        delete $fileInfo{$digest} if $fileInfo{$digest}[2]{count} < 2;
    }

    if (%fileInfo) {
        # The handle must be the bareword LOG: the formats LOG and
        # LOG_TOP are associated with it by name.
        open(LOG, '>', 'duplicates.log')
            || die "Can't create logfile: ($!)\n";

        # The loop variable must stay the global $_ because the LOG
        # format looks its fields up via $fileInfo{$_}.
        foreach (keys %fileInfo) {
            $duplicateFiles++;

            # Size of one copy, in MB with one decimal; also read by the
            # LOG format.
            $number = sprintf("%.1f", $fileInfo{$_}[1]{size} / ONEMEGABYTE);

            # Only the extra copies count as wasted space.
            $totalSpace += $number * ($fileInfo{$_}[2]{count} - 1);

            # Built-in format output, spelled CORE:: to make clear it is
            # not a recursive call to this sub.
            CORE::write(LOG);
        }

        # Buffered write errors only surface when the handle is closed.
        close(LOG) || warn "Can't close logfile: ($!)\n";

        print STDOUT "\nFound $duplicateFiles/$totalFiles duplicate files.\n";
        print STDOUT "Runtime: $runTime.\n";
        print STDOUT "Duplicated Space: $totalSpace MB\n";
    } else {
        print STDOUT "\nNo duplicates found - 0/$totalFiles files.\n";
        print STDOUT "Runtime: $runTime.\n";
    }

    return;
}

sub usage {
    # -----------
    # Print a short help text to the currently selected output handle
    # and terminate the program.  Called when no directory argument was
    # given; never returns.
    #
    # Exits with a non-zero status so that calling shells and scripts
    # can tell the run did not do any work.

    print "Usage: ./duplicates.pl [dirname]\n";
    print "\n";
    print "BAD MR. KITTY!\n\nMake sure you supply a directory to search through!\n";
    print "Example: ./duplicates.pl /home/foo/\n";

    exit 1;
}

# Top-of-page header for the LOG filehandle: column titles and a rule.
# Perl picks it up by name (handle name plus "_TOP") when write(LOG)
# starts a page.
format LOG_TOP =
                              FILENAME  SIZE       QTY
-----------------------------------------------------------
.

# Detail line for the LOG filehandle, emitted once per write(LOG) call.
# The three right-justified fields are filled from the argument line:
# the stored file name, the size in MB held in $number, and the number
# of files sharing the digest.  The record is selected by the current
# value of $_, so the caller must have $_ set to a key of %fileInfo.
format LOG =
@>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  @>>>> MB   (@>)
$fileInfo{$_}[0]{filename}, $number, $fileInfo{$_}[2]{count}
.
Replies are listed 'Best First'.
Re: Duplicate file bounty hunter
by belg4mit (Prior) on Apr 24, 2002 at 01:02 UTC
    One word: MD5.
    PS> I have a utility which does this and much more at snafu. I've been meaning to freshen it up and submit here. It might meet your needs.

    --
    perl -pew "s/\b;([mnst])/'$1/g"

      A previous discussion on this subject lies here: Find duplicate files.. Much the same conclusion: use Digest::MD5.


      print@_{sort keys %_},$/if%_=split//,'= & *a?b:e\f/h^h!j+n,o@o;r$s-t%t#u'
      MD5 - never thought of using that, thanks for the tip. I downloaded your utility but have to take a look at it a bit later. Thanks belg4mit++.

      djw
Re: Duplicate file bounty hunter
by rob_au (Abbot) on Apr 24, 2002 at 02:27 UTC
    I must agree wholeheartedly with the recommendation of belg4mit to explore the usage of Digest::MD5 for the comparison of files. The following is a small script that I wrote previously based upon a node by demerphq here which may be of use for comparative purposes.

    #!/usr/bin/perl -wT
    #
    # Walk the directory given as the first argument (default: the
    # current directory), compute an MD5 digest for every readable
    # regular file that is not a symbolic link, and report how many
    # files duplicate the content of an earlier one.

    use Digest::MD5;
    use File::Find;
    use IO::File;
    use strict;

    $| = 1;
    $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';

    my $ctx = Digest::MD5->new;

    # digest => { file => first file seen, dupes => [ later files ] }
    my %digest;
    my $path = $ARGV[0] || '.';

    find({
        'wanted' => sub {
            if (-f $_) {
                # lstat (not stat) so that the -l test below can see
                # whether the entry itself is a symbolic link.
                lstat;
                if ((-r _) && (!-l _)) {
                    $ctx->reset;
                    my $fh = IO::File->new($_, 'r');
                    unless ($fh) {
                        warn "Can't open $_: $!\n";
                        return;
                    }
                    binmode($fh);
                    $ctx->addfile($fh);
                    my $md5 = $ctx->hexdigest;
                    if (exists $digest{$md5}) {
                        push @{$digest{$md5}->{'dupes'}}, $_;
                    } else {
                        $digest{$md5} = { 'file' => $_, 'dupes' => [] }
                    }
                }
            } else {
                print "Searching $_\n";
            }
        },
        'no_chdir' => 1
    }, $path);

    # Total the duplicates over every digest; $_ is not set to any
    # particular digest at this point, so a single $digest{$_} lookup
    # cannot give the overall count.
    my $duplicates = 0;
    $duplicates += scalar @{$_->{'dupes'}} for values %digest;

    print "There are ", $duplicates, " duplicate files.\n";

    exit 0;

     

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://161492]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others admiring the Monastery: (5)
As of 2024-03-28 20:30 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found