I have seen some threads around perlmonks regarding finding duplicate files.
I noticed that my idea was similar with a post by superfrink
a while ago in 2006.
You can find out more about how to use it here
So I used DBI,YAML,File::Find and SQLite and Digest::SHA1 to write a project that does this.
You can find more about it .
I have done tests and it works ok for 100.000 files and 2gb of total file size.But because of how
it was built I am sure it can scale to much more.
Any suggestions/oppinions are welcomed.
setup.pl
#this script will create the database which will be used
#CAREFUL! IF THERE EXISTS A PREVIOUS DATABASE IT WILL DELETE IT !!!
use strict;
use warnings;
use DBI;
`rm checksum_db.sqlite`;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
$dbh->do("CREATE TABLE checksums (id INTEGER PRIMARY KEY,checksum VARC
+HAR(42),size INTEGER,last_date_modified DATE,name VARCHAR(200) UNIQUE
+,is_dir VARCHAR(1),is_file VARCHAR(1),is_link VARCHAR(1),UNIQUE (chec
+ksum,name));");
config.yml
minsize: 64
directories:
- path: /usr
dir: 1
file: 1
link: 0
regex: .*
- path: /home/spx2/perlhobby
dir: 1
file: 1
link: 0
regex: .*
- path: /lib
dir: 1
file: 1
link: 0
regex: .*
build_database.pl
#this will be used just for the first run
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
#to add columns in db for link,dir,file to know what the name column s
+tands for...
sub add_to_db {
my ($checksum,$last_modif_time,$size,$name)=@_;
#maybe calculating is_* should be done in process_file
my $is_dir = (-d $name)?'Y':'N';
my $is_file = (-f $name)?'Y':'N';
my $is_link = (-l $name)?'Y':'N';
$dbh->do(
sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
$checksum,
$size,
$last_modif_time->ymd,
$name,
$is_dir,
$is_file,
$is_link
);
};
sub delete_from_db {#remains to be completed
my ($name)=@_;
};
sub file2sha1 {
my $file=$_[0];
return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
open my $f,"<$file";
my $sha1 = Digest::SHA1->new;
$sha1->addfile(*$f);
return $sha1->hexdigest;
}
sub process_file {
my $dir_configs=$_[0];
##optimisation using -d -l -f -s just once for return and also for
+ adding
#if current "file"(unix terminology) is a directory and the yaml c
+onfiguration
#tells us to eliminate directories from the search we do so by ret
+urning from the
#callback
return if -d $File::Find::name && ! $dir_configs->{dir};
return if -l $File::Find::name && ! $dir_configs->{link};
return if -f $File::Find::name && ! $dir_configs->{file};
return if -s $File::Find::name < $config->{minsize};
unless($File::Find::name =~ /$dir_configs->{regex}/) {
if(-d $File::Find::name) {
$File::Find::prune=1;
}
return;
}
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks)
= stat($File::Find::name);
my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
# printf "%s %s %s %s\n",
# $File::Find::name,
# file2sha1($File::Find::name),
# -s $File::Find::name,
# $last_modif_time;
add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $File::
+Find::name,$File::Find::name);
#print Dumper $dir_configs;
};
for my $searched_dir_hash (@{ $config->{directories} }) {
# we skip the entry if it does not exist or it is not a directory
next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path});
#we pass to the process_file function the yml configuration for th
+e current directory that is searched
find(
{ wanted=> sub { process_file($searched_dir_hash);} },
$searched_dir_hash->{path}
);
}
update_database.pl
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
sub add_to_db {
my ($checksum,$last_modif_time,$size,$name)=@_;
#maybe calculating is_* should be done in process_file
my $is_dir = (-d $name)?'Y':'N';
my $is_file = (-f $name)?'Y':'N';
my $is_link = (-l $name)?'Y':'N';
$dbh->do(
sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
$checksum,
$size,
$last_modif_time->ymd,
$name,
$is_dir,
$is_file,
$is_link
);
}
sub update {
my ($name,$checksum,$last_modif_time)=@_;
$dbh->do(sprintf "UPDATE checksums SET checksum=\"%s\",last_date_m
+odified=\"%s\" WHERE name=\"%s\";",$checksum,$name,$last_modif_time->
+ymd);
}
sub find_or_update {
my ($name,$last_modif_time)=@_;
my $s=$dbh->prepare(sprintf "SELECT last_date_modified FROM checks
+ums WHERE name=\"%s\" ;",$name);
$s->execute;
my $results = $s->fetchall_arrayref;
if($results) {
#found it in the db;
return 2 if $last_modif_time->ymd eq $results->[0]->[0] ; #ret
+urn 2 if the entry is up to date
update($name,file2sha1($name),$last_modif_time);
return 1;# the entry is not up to date
}
return 0; #the entry has not be found- should be updated
};
sub file2sha1 {
my $file=$_[0];
return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
open my $f,"<$file";
my $sha1 = Digest::SHA1->new;
$sha1->addfile(*$f);
return $sha1->hexdigest;
}
sub process_file {
my $dir_configs=$_[0];
return if -d $File::Find::name && ! $dir_configs->{dir};
return if -l $File::Find::name && ! $dir_configs->{link};
return if -f $File::Find::name && ! $dir_configs->{file};
return if -s $File::Find::name < $config->{minsize};
unless($File::Find::name =~ /$dir_configs->{regex}/) {
if(-d $File::Find::name) {
$File::Find::prune=1;
}
return;
}
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks)
= stat($File::Find::name);
my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
#find out if entry needs update and update it if necesary
#find_or_update returns 0 only if it hasnt found the file in the c
+hecksum db
unless(find_or_update($File::Find::name,$last_modif_time)) {
add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $Fi
+le::Find::name,$File::Find::name);
#add it to db
};
};
for my $searched_dir_hash (@{ $config->{directories} }) {
next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path});
find(
{ wanted=> sub { process_file($searched_dir_hash);} },
$searched_dir_hash->{path}
);
}
build_duplicate_script.pl
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
open my $script,">duplicate_erase_script.sh";
sub get_unique_checksums {
my $sql="SELECT checksum as groupsize FROM checksums GROUP BY size
+ HAVING groupsize > 1;";
#because groups of size 1 cannot have duplicates
my $sth=$dbh->prepare($sql);
$sth->execute;
my $results=$sth->fetchall_arrayref;
return map { $_->[0] } @{$results};
};
sub checksum2names {
my ($checksum)=@_;
my $sql=sprintf "SELECT name FROM checksums WHERE checksum=\"%s\";
+",$checksum;
my $sth=$dbh->prepare($sql);
$sth->execute;
my $results=$sth->fetchall_arrayref;
return map { $_->[0] } @{$results};
};
for my $checksum (get_unique_checksums()) {
my @same_checksum=checksum2names($checksum);
my $leader = shift @same_checksum;#take aside on element of the gr
+oup making it the leader
print $script "# duplicates of $leader follow:\n";
for my $name (@same_checksum) {#get all the others and write comma
+nds to delete them
print $script "# rm $name\n";
}
};
close $script;
EDIT: The latest version of this can be found (it has heavily mutated due to suggestions of the monks).
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.
|
|