comment on

I have seen some threads around perlmonks regarding finding duplicate files. I noticed that my idea was similar with a post by superfrink a while ago in 2006. You can find out more about how to use it here So I used DBI,YAML,File::Find and SQLite and Digest::SHA1 to write a project that does this. You can find more about it . I have done tests and it works ok for 100.000 files and 2gb of total file size.But because of how it was built I am sure it can scale to much more. Any suggestions/oppinions are welcomed.

setup.pl

#this script will create the database which will be used
#CAREFUL! IF THERE EXISTS A PREVIOUS DATABASE IT WILL DELETE IT !!!
use strict;
use warnings;
use DBI;
`rm checksum_db.sqlite`;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
$dbh->do("CREATE TABLE checksums (id INTEGER PRIMARY KEY,checksum VARC
+HAR(42),size INTEGER,last_date_modified DATE,name VARCHAR(200) UNIQUE
+,is_dir VARCHAR(1),is_file VARCHAR(1),is_link VARCHAR(1),UNIQUE (chec
+ksum,name));");
[download]

config.yml

minsize: 64
directories:
  - path: /usr
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /home/spx2/perlhobby
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /lib
    dir: 1
    file: 1
    link: 0
    regex: .*
[download]

build_database.pl

#this will be used just for the first run
#!/usr/bin/perl 
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");

my $config_path = 'config.yml';
my $config = LoadFile($config_path);

#to add columns in db for link,dir,file to know what the name column s
+tands for...
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name)?'Y':'N';
    my $is_file = (-f $name)?'Y':'N';
    my $is_link = (-l $name)?'Y':'N';
    $dbh->do(
        sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
};

sub delete_from_db {#remains to be completed
    my ($name)=@_;
};


sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
    open my $f,"<$file";
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile(*$f);
    return $sha1->hexdigest;
}


sub process_file {
    my $dir_configs=$_[0];
    ##optimisation using -d -l -f -s just once for return and also for
+ adding

    #if current "file"(unix terminology) is a directory and the yaml c
+onfiguration
    #tells us to eliminate directories from the search we do so by ret
+urning from the
    #callback
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if -s $File::Find::name < $config->{minsize};
    unless($File::Find::name =~  /$dir_configs->{regex}/) {
        if(-d $File::Find::name) {
            $File::Find::prune=1;
        }
        return;
    }

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
    = stat($File::Find::name);
    my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
#    printf "%s %s %s %s\n",
#    $File::Find::name,
#    file2sha1($File::Find::name),
#    -s $File::Find::name,
#    $last_modif_time;

    add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $File::
+Find::name,$File::Find::name);
    #print Dumper $dir_configs;
};


for my $searched_dir_hash (@{ $config->{directories} }) {
    # we skip the entry if it does not exist or it is not a directory
    next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path}); 
    #we pass to the process_file function the yml configuration for th
+e current directory that is searched
    find(
        {  wanted=> sub { process_file($searched_dir_hash);}  }, 
        $searched_dir_hash->{path} 
    );
}
[download]

update_database.pl

#!/usr/bin/perl 
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");

my $config_path = 'config.yml';
my $config = LoadFile($config_path);

sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name)?'Y':'N';
    my $is_file = (-f $name)?'Y':'N';
    my $is_link = (-l $name)?'Y':'N';
    $dbh->do(
        sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}

sub update {
    my ($name,$checksum,$last_modif_time)=@_;
    $dbh->do(sprintf "UPDATE checksums SET checksum=\"%s\",last_date_m
+odified=\"%s\" WHERE name=\"%s\";",$checksum,$name,$last_modif_time->
+ymd);
}

sub find_or_update {
    my ($name,$last_modif_time)=@_;
    my $s=$dbh->prepare(sprintf "SELECT last_date_modified FROM checks
+ums WHERE name=\"%s\" ;",$name);
    $s->execute;
    my $results = $s->fetchall_arrayref;
    if($results) {
        #found it in the db;
        return 2 if $last_modif_time->ymd eq $results->[0]->[0] ; #ret
+urn 2 if the entry is up to date
        update($name,file2sha1($name),$last_modif_time);
        return 1;# the entry is not up to date
    }
    return 0; #the entry has not be found- should be updated
};


sub file2sha1 {
    my $file=$_[0];
    return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
    open my $f,"<$file";
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile(*$f);
    return $sha1->hexdigest;
}


sub process_file {
    my $dir_configs=$_[0];
    return if -d  $File::Find::name && ! $dir_configs->{dir};
    return if -l  $File::Find::name && ! $dir_configs->{link};
    return if -f  $File::Find::name && ! $dir_configs->{file};
    return if -s  $File::Find::name < $config->{minsize};
    unless($File::Find::name =~  /$dir_configs->{regex}/) {
        if(-d $File::Find::name) {
            $File::Find::prune=1;
        }
        return;
    }

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
    = stat($File::Find::name);
    my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);

    #find out if entry needs update and update it if necesary
    #find_or_update returns 0 only if it hasnt found the file in the c
+hecksum db
    unless(find_or_update($File::Find::name,$last_modif_time)) {
        add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $Fi
+le::Find::name,$File::Find::name);
        #add it to db
    };
};


for my $searched_dir_hash (@{ $config->{directories} }) {
    next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path}); 
    find(
        {  wanted=> sub { process_file($searched_dir_hash);}  }, 
        $searched_dir_hash->{path} 
    );
}
[download]

build_duplicate_script.pl

#!/usr/bin/perl 
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");

open my $script,">duplicate_erase_script.sh";

sub get_unique_checksums {
    my $sql="SELECT checksum as groupsize FROM checksums GROUP BY size
+ HAVING groupsize > 1;";
        #because groups of size 1 cannot have duplicates
    my $sth=$dbh->prepare($sql);
    $sth->execute;
    my $results=$sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
};

sub checksum2names {
    my ($checksum)=@_;
    my $sql=sprintf "SELECT name FROM checksums WHERE checksum=\"%s\";
+",$checksum;
    my $sth=$dbh->prepare($sql);
    $sth->execute;
    my $results=$sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
};


for my $checksum (get_unique_checksums()) {
    my @same_checksum=checksum2names($checksum);
    my $leader = shift @same_checksum;#take aside on element of the gr
+oup making it the leader
    print $script "# duplicates of $leader follow:\n";
    for my $name (@same_checksum) {#get all the others and write comma
+nds to delete them
        print $script "# rm $name\n";
    }
};

close $script;
[download]

EDIT: The latest version of this can be found (it has heavily mutated due to suggestions of the monks).

In reply to scalable duplicate file remover by spx2

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


Your skill will accomplish what the force of many cannot
	PerlMonks