package FileDups;

use Digest::MD5;
use Moose;
use namespace::autoclean;

has 'name'     => (is => 'ro', isa => 'Str', required => 1);
has 'pathname' => (is => 'ro', isa => 'Str', required => 1);
has 'max_size' => (is => 'ro', isa => 'Num', required => 1);
has 'big'      => (is => 'rw', isa => 'Bool', default => 0);
has 'unread'   => (is => 'rw', isa => 'Bool', default => 0);
has 'dupe'     => (is => 'rw', isa => 'Bool', default => 0);
has 'md5'      => (is => 'ro', isa => 'Str', lazy => 1, builder => '_calculate_md5');
has 'size'     => (is => 'ro', isa => 'Num', lazy => 1, builder => '_calculate_size');

# Lazy builder for 'size'. Files larger than max_size are clamped to
# max_size and flagged as big so their md5 is never calculated.
sub _calculate_size {
    my $self = shift;
    my $size = -s $self->name;
    if ($size > $self->max_size) {
        $size = $self->max_size;
        $self->big(1);
    }
    return $size;
}

# Lazy builder for 'md5'. Big files get a dummy digest instead of being
# read; unreadable files are flagged and yield -1.
sub _calculate_md5 {
    my $self   = shift;
    my $file   = $self->pathname;
    my $size   = $self->size;
    my $chksum = 0;
    if ($size == $self->max_size) {
        $chksum = 'a' x 32;                 # dummy digest for big files
    }
    else {
        my $fh;
        unless (open $fh, '<', $file) {
            $self->unread(1);
            return -1;                      # return -1 and exit from subroutine if file cannot be opened
        }
        binmode($fh);
        $chksum = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);
    }
    return $chksum;
}

1;

####

#!/usr/bin/env perl
# References:
# http://drdobbs.com/web-development/184416070

use strict;
use warnings;
use File::Find;
use lib qw(lib);
use FileDups;

# %dup    : md5 => [ [size, name, pathname], ... ]
# %sizes  : size => 1 if more than one file has that size, 0 otherwise
# @object : one FileDups object per file seen
my (%dup, %sizes, @object);
my ($number_files, $number_size_dups) = (0, 0);
my $max_size = 99999999;    # size above which md5 will not be calculated
my $return   = "Press return to continue \n\n";
my $line     = "-" x 70 . "\n";

# Find and classify files in every directory given on the command line
while (my $dir = shift @ARGV) {
    die "\"$dir\" is not a directory. Give me a directory to search\n"
        unless -d $dir;
    File::Find::find(\&wanted, $dir);
}
print "\n";

# Calculate md5 only for files that share their size with another file
foreach (@object) {
    if ($sizes{$_->size} == 1) {
        $number_size_dups += 1;
        print "$number_size_dups Files with the same size \r";
        $_->dupe(1);    # the object has another object with the same size
        $_->md5;        # trigger the lazy md5 calculation now
    }
}

# Build a hash keyed by md5 for the files that shared a size;
# push autovivifies the array, so no exists() check is needed
foreach (@object) {
    if ($_->dupe) {
        push @{ $dup{$_->md5} }, [$_->size, $_->name, $_->pathname];
    }
}

print "\n\nDuplicated files\n" . $line . $return;
my $pausa4 = <>;

# Report every md5 group that holds more than one file, sorted by md5
foreach (sort keys %dup) {
    if ($#{ $dup{$_} } > 0) {    # more than one array within the same hash entry
        printf("\n%8s %10.10s %s\n", "Size", "Name", "Pathname");
        foreach my $rec (@{ $dup{$_} }) {       # iterate through the group
            printf("%8d %10.10s %s\n", @$rec);  # dereference the [size, name, pathname] triple
        }
    }
}

list_files("Big files",    "big",    @object);    # list big files
list_files("Unread files", "unread", @object);    # list unreadable files

# Called by File::Find for every path found; classifies files by size.
# Note that find() chdirs into each directory, so the bare name in $_ is
# valid here, while $File::Find::name remains valid after find() returns.
sub wanted {
    return unless -f $_;
    my $file = FileDups->new(
        name     => $_,
        pathname => $File::Find::name,
        max_size => $max_size,
    );
    $number_files += 1;
    print "$number_files Files seen\r";
    if ($file->size == $max_size) {            # big file (size was clamped)
        $sizes{$file->size} = 0;               # we do not check md5 for big files
    }
    elsif (exists $sizes{$file->size}) {       # more than one file with this size
        $sizes{$file->size} = 1;
    }
    else {
        $sizes{$file->size} = 0;               # a new size value, not duplicated so far
    }
    push @object, $file;                       # keep the object for the later passes
}

# List objects matching a boolean attribute: (a) big files; (b) unread files
sub list_files {
    my ($title, $criteria, @objects) = @_;
    print "\n \n $title \n" . $line;
    my $pausa = <>;
    foreach (@objects) {
        if ($_->$criteria) {
            printf(" %10.10s %s\n", $_->name, $_->pathname);
        }
    }
    print $line;
}
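####

# A minimal usage sketch, not part of the original listing: it exercises the
# FileDups class on its own, assuming the module above is saved as
# lib/FileDups.pm and that this (hypothetical) example.pl sits next to lib/.
#!/usr/bin/env perl
use strict;
use warnings;
use lib qw(lib);
use FileDups;

# Build a FileDups object for this very script; reading the accessors
# triggers the lazy 'size' and 'md5' builders.
my $f = FileDups->new(
    name     => $0,            # path checked with -s in _calculate_size
    pathname => $0,            # path opened by _calculate_md5
    max_size => 99999999,
);
printf "size:   %d bytes\n", $f->size;
printf "md5:    %s\n",       $f->md5;
printf "big:    %d\n",       $f->big;
printf "unread: %d\n",       $f->unread;

# The duplicate finder itself takes the directories to scan on the
# command line, e.g. (script name assumed):
#   perl dupfinder.pl ~/photos ~/backup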