http://qs321.pair.com?node_id=671503

I have seen some threads around PerlMonks regarding finding duplicate files, and I noticed that my idea is similar to a post superfrink made back in 2006. So I used DBI, YAML, File::Find, SQLite and Digest::SHA1 to write a small project that does this. I have run tests and it works fine for 100,000 files and 2 GB of total file size, but because of how it is built I am sure it can scale much further. Any suggestions/opinions are welcome.


setup.pl
#this script will create the database which will be used
#CAREFUL! IF THERE EXISTS A PREVIOUS DATABASE IT WILL DELETE IT !!!
use strict;
use warnings;
use DBI;

`rm checksum_db.sqlite`;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
$dbh->do(
    "CREATE TABLE checksums (
        id INTEGER PRIMARY KEY,
        checksum VARCHAR(42),
        size INTEGER,
        last_date_modified DATE,
        name VARCHAR(200) UNIQUE,
        is_dir VARCHAR(1),
        is_file VARCHAR(1),
        is_link VARCHAR(1),
        UNIQUE (checksum,name)
    );"
);
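If the CREATE TABLE fails (say, the database file is not writable), the script above continues silently. A minimal sketch of the same connect call with DBI's standard RaiseError attribute switched on, so any failed statement dies with a diagnostic instead:

use strict;
use warnings;
use DBI;

#RaiseError makes every failed DBI call die with a message,
#so a broken CREATE TABLE cannot pass unnoticed
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=checksum_db.sqlite", "", "",
    { RaiseError => 1, AutoCommit => 1 },
) or die $DBI::errstr;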

config.yml
minsize: 64
directories:
  - path: /usr
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /home/spx2/perlhobby
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /lib
    dir: 1
    file: 1
    link: 0
    regex: .*
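For reference, LoadFile turns the file above into a single hashref: a minsize key plus an arrayref of per-directory hashes. A small sketch of reading it back, using nothing beyond what YAML.pm documents:

use strict;
use warnings;
use YAML qw/LoadFile/;

my $config = LoadFile('config.yml');
print $config->{minsize}, "\n";    #prints 64
for my $dir (@{ $config->{directories} }) {
    printf "%s (dir=%s file=%s link=%s regex=%s)\n",
        @{$dir}{qw/path dir file link regex/};
}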

build_database.pl

#!/usr/bin/perl
#this will be used just for the first run
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);

#the is_* columns record what the name column stands for (link, dir or file)
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    $dbh->do(
        sprintf 'INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link) VALUES ("%s","%s","%s","%s","%s","%s","%s");',
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}

sub delete_from_db { #remains to be completed
    my ($name)=@_;
}

sub file2sha1 {
    my $file = $_[0];
    #directories get an empty checksum; have to find out whether to prune
    #when a directory that doesn't match the regex is found
    return '' if -d $file;
    open my $f, '<', $file or do { warn "cannot open $file: $!"; return '' };
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    return $sha1->hexdigest;
}

sub process_file {
    my $dir_configs = $_[0];
    #optimisation idea: use -d -l -f -s just once, for both the return tests and add_to_db
    #if the current "file" (unix terminology) is a directory and the yaml
    #configuration tells us to eliminate directories from the search,
    #we do so by returning from the callback
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if (-s $File::Find::name || 0) < $config->{minsize};
    unless ($File::Find::name =~ /$dir_configs->{regex}/) {
        if (-d $File::Find::name) {
            $File::Find::prune = 1;
        }
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks) = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    add_to_db(file2sha1($File::Find::name), $last_modif_time,
              -s $File::Find::name, $File::Find::name);
}

for my $searched_dir_hash (@{ $config->{directories} }) {
    #we skip the entry if it does not exist or is not a directory
    next unless -e $searched_dir_hash->{path} && -d $searched_dir_hash->{path};
    #pass the yml configuration for the directory being searched to process_file
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
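A note on the INSERTs: building SQL with sprintf breaks as soon as a file name contains a double quote. A sketch of add_to_db rewritten with DBI placeholders (standard DBI, same table and columns as above; the handle name $insert_sth is mine), which quote the values for you and let SQLite reuse one prepared statement across the whole scan:

#prepared once, outside the File::Find callback
my $insert_sth = $dbh->prepare(
    "INSERT INTO checksums
        (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
     VALUES (?,?,?,?,?,?,?)"
);

sub add_to_db {
    my ($checksum, $last_modif_time, $size, $name) = @_;
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #execute binds each value safely; quotes in $name are no longer a problem
    $insert_sth->execute(
        $checksum, $size, $last_modif_time->ymd,
        $name, $is_dir, $is_file, $is_link,
    );
}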


update_database.pl

#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);

sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    $dbh->do(
        sprintf 'INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link) VALUES ("%s","%s","%s","%s","%s","%s","%s");',
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}

sub update {
    my ($name,$checksum,$last_modif_time)=@_;
    #the values must follow the order of the placeholders:
    #checksum, last_date_modified, name
    $dbh->do(
        sprintf 'UPDATE checksums SET checksum="%s",last_date_modified="%s" WHERE name="%s";',
        $checksum, $last_modif_time->ymd, $name
    );
}

sub find_or_update {
    my ($name,$last_modif_time)=@_;
    my $s = $dbh->prepare(sprintf 'SELECT last_date_modified FROM checksums WHERE name="%s";', $name);
    $s->execute;
    my $results = $s->fetchall_arrayref;
    #fetchall_arrayref always returns a reference, so test for rows, not for truth
    if (@$results) {
        #found it in the db
        return 2 if $last_modif_time->ymd eq $results->[0]->[0]; #the entry is up to date
        update($name, file2sha1($name), $last_modif_time);
        return 1; #the entry was stale and has been refreshed
    }
    return 0; #the entry has not been found - the caller should add it
}

sub file2sha1 {
    my $file = $_[0];
    return '' if -d $file; #directories get an empty checksum
    open my $f, '<', $file or do { warn "cannot open $file: $!"; return '' };
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    return $sha1->hexdigest;
}

sub process_file {
    my $dir_configs = $_[0];
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if (-s $File::Find::name || 0) < $config->{minsize};
    unless ($File::Find::name =~ /$dir_configs->{regex}/) {
        if (-d $File::Find::name) {
            $File::Find::prune = 1;
        }
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks) = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    #find out if the entry needs an update and refresh it if necessary;
    #find_or_update returns 0 only if it hasn't found the file in the checksum db
    unless (find_or_update($File::Find::name, $last_modif_time)) {
        add_to_db(file2sha1($File::Find::name), $last_modif_time,
                  -s $File::Find::name, $File::Find::name); #add it to db
    }
}

for my $searched_dir_hash (@{ $config->{directories} }) {
    next unless -e $searched_dir_hash->{path} && -d $searched_dir_hash->{path};
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
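The prepare/execute/fetchall_arrayref dance in find_or_update can be shortened with DBI's selectrow_array, which also accepts bind values directly. A sketch keeping the same return convention as above (2 = up to date, 1 = refreshed, 0 = not in the db):

sub find_or_update {
    my ($name, $last_modif_time) = @_;
    #selectrow_array returns the first row's columns, or an empty list
    my ($stored_date) = $dbh->selectrow_array(
        "SELECT last_date_modified FROM checksums WHERE name = ?",
        undef, $name,
    );
    return 0 unless defined $stored_date;               #not in the db yet
    return 2 if $last_modif_time->ymd eq $stored_date;  #up to date
    update($name, file2sha1($name), $last_modif_time);
    return 1;                                           #stale entry refreshed
}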


build_duplicate_script.pl
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
open my $script, '>', 'duplicate_erase_script.sh' or die "cannot open output script: $!";

sub get_unique_checksums {
    #groups of size 1 cannot have duplicates, so keep only the checksums
    #shared by more than one name
    my $sql = "SELECT checksum FROM checksums GROUP BY checksum HAVING COUNT(*) > 1;";
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}

sub checksum2names {
    my ($checksum)=@_;
    my $sql = sprintf 'SELECT name FROM checksums WHERE checksum="%s";', $checksum;
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}

for my $checksum (get_unique_checksums()) {
    my @same_checksum = checksum2names($checksum);
    my $leader = shift @same_checksum; #take one element of the group aside, making it the leader
    print $script "# duplicates of $leader follow:\n";
    for my $name (@same_checksum) { #write (commented-out) commands that delete all the others
        print $script "# rm $name\n";
    }
}
close $script;
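Instead of one SELECT per checksum, the whole report can be pulled in a single statement with SQLite's GROUP_CONCAT aggregate. A sketch that prints the same report to stdout, assuming "|" never appears in a file name:

#one row per duplicate group: the checksum plus all names sharing it,
#joined with "|" (assumes "|" never occurs in a file name)
my $groups = $dbh->selectall_arrayref(
    "SELECT checksum, GROUP_CONCAT(name,'|')
       FROM checksums
      GROUP BY checksum
     HAVING COUNT(*) > 1"
);
for my $group (@$groups) {
    my ($checksum, $joined) = @$group;
    my ($leader, @duplicates) = split /\|/, $joined;
    print "# duplicates of $leader follow:\n";
    print "# rm $_\n" for @duplicates;
}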

EDIT: The latest version of this has mutated heavily due to the suggestions of the monks.