package FileDups;

use Digest::MD5;
use Moose;
use namespace::autoclean;

has 'name'     => (is => 'ro', isa => 'Str', required => 1);
has 'pathname' => (is => 'ro', isa => 'Str', required => 1);
has 'max_size' => (is => 'ro', isa => 'Num', required => 1);
has 'big'      => (is => 'rw', isa => 'Bool', default => 0);
has 'unread'   => (is => 'rw', isa => 'Bool', default => 0);
has 'dupe'     => (is => 'rw', isa => 'Bool', default => 0);
has 'md5'      => (is => 'ro', isa => 'Str', lazy => 1, builder => '_calculate_md5');
has 'size'     => (is => 'ro', isa => 'Num', lazy => 1, builder => '_calculate_size');

# Lazy builder for 'size'. Files larger than max_size are clamped to
# max_size and flagged as big so their md5 is never calculated.
sub _calculate_size {
    my $self = shift;
    my $size = -s $self->name;
    if ($size > $self->max_size) {
        $size = $self->max_size;
        $self->big(1);
    }
    return $size;
}

# Lazy builder for 'md5'. Big files get a dummy digest instead of being
# read; unreadable files are flagged and yield -1.
sub _calculate_md5 {
    my $self   = shift;
    my $file   = $self->pathname;
    my $size   = $self->size;
    my $chksum = 0;
    if ($size == $self->max_size) {
        $chksum = 'a' x 32;                 # dummy digest for big files
    }
    else {
        my $fh;
        unless (open $fh, '<', $file) {
            $self->unread(1);
            return -1;                      # return -1 and exit from subroutine if file cannot be opened
        }
        binmode($fh);
        $chksum = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);
    }
    return $chksum;
}

1;

####

#!/usr/bin/env perl
# References:
# http://drdobbs.com/web-development/184416070

use strict;
use warnings;
use File::Find;
use lib qw(lib);
use FileDups;

# %dup    : md5 => [ [size, name, pathname], ... ]
# %sizes  : size => 1 if more than one file has that size, 0 otherwise
# @object : one FileDups object per file seen
my (%dup, %sizes, @object);
my ($number_files, $number_size_dups) = (0, 0);
my $max_size = 99999999;    # size above which md5 will not be calculated
my $return   = "Press return to continue \n\n";
my $line     = "-" x 70 . "\n";

# Find and classify files in every directory given on the command line
while (my $dir = shift @ARGV) {
    die "\"$dir\" is not a directory. Give me a directory to search\n"
        unless -d $dir;
    File::Find::find(\&wanted, $dir);
}
print "\n";

# Calculate md5 only for files that share their size with another file
foreach (@object) {
    if ($sizes{$_->size} == 1) {
        $number_size_dups += 1;
        print "$number_size_dups Files with the same size \r";
        $_->dupe(1);    # the object has another object with the same size
        $_->md5;        # trigger the lazy md5 calculation now
    }
}

# Build a hash keyed by md5 for the files that shared a size;
# push autovivifies the array, so no exists() check is needed
foreach (@object) {
    if ($_->dupe) {
        push @{ $dup{$_->md5} }, [$_->size, $_->name, $_->pathname];
    }
}

print "\n\nDuplicated files\n" . $line . $return;
my $pausa4 = <>;

# Report every md5 group that holds more than one file, sorted by md5
foreach (sort keys %dup) {
    if ($#{ $dup{$_} } > 0) {    # more than one array within the same hash entry
        printf("\n%8s %10.10s %s\n", "Size", "Name", "Pathname");
        foreach my $rec (@{ $dup{$_} }) {       # iterate through the group
            printf("%8d %10.10s %s\n", @$rec);  # dereference the [size, name, pathname] triple
        }
    }
}

list_files("Big files",    "big",    @object);    # list big files
list_files("Unread files", "unread", @object);    # list unreadable files

# Called by File::Find for every path found; classifies files by size.
# Note that find() chdirs into each directory, so the bare name in $_ is
# valid here, while $File::Find::name remains valid after find() returns.
sub wanted {
    return unless -f $_;
    my $file = FileDups->new(
        name     => $_,
        pathname => $File::Find::name,
        max_size => $max_size,
    );
    $number_files += 1;
    print "$number_files Files seen\r";
    if ($file->size == $max_size) {            # big file (size was clamped)
        $sizes{$file->size} = 0;               # we do not check md5 for big files
    }
    elsif (exists $sizes{$file->size}) {       # more than one file with this size
        $sizes{$file->size} = 1;
    }
    else {
        $sizes{$file->size} = 0;               # a new size value, not duplicated so far
    }
    push @object, $file;                       # keep the object for the later passes
}

# List objects matching a boolean attribute: (a) big files; (b) unread files
sub list_files {
    my ($title, $criteria, @objects) = @_;
    print "\n \n $title \n" . $line;
    my $pausa = <>;
    foreach (@objects) {
        if ($_->$criteria) {
            printf(" %10.10s %s\n", $_->name, $_->pathname);
        }
    }
    print $line;
}
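####

# A minimal usage sketch, not part of the original listing: it exercises the
# FileDups class on its own, assuming the module above is saved as
# lib/FileDups.pm and that this (hypothetical) example.pl sits next to lib/.
#!/usr/bin/env perl
use strict;
use warnings;
use lib qw(lib);
use FileDups;

# Build a FileDups object for this very script; reading the accessors
# triggers the lazy 'size' and 'md5' builders.
my $f = FileDups->new(
    name     => $0,            # path checked with -s in _calculate_size
    pathname => $0,            # path opened by _calculate_md5
    max_size => 99999999,
);
printf "size:   %d bytes\n", $f->size;
printf "md5:    %s\n",       $f->md5;
printf "big:    %d\n",       $f->big;
printf "unread: %d\n",       $f->unread;

# The duplicate finder itself takes the directories to scan on the
# command line, e.g. (script name assumed):
#   perl dupfinder.pl ~/photos ~/backup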