http://qs321.pair.com?node_id=671503

I have seen some threads around PerlMonks regarding finding duplicate files, and I noticed that my idea is similar to a post superfrink made back in 2006. So I used DBI, YAML, File::Find, SQLite and Digest::SHA1 to write a small project that does this. I have run tests and it works fine for 100,000 files and 2 GB of total file size, but because of how it is built I am sure it can scale much further. Any suggestions/opinions are welcome.


setup.pl
#this script will create the database which will be used
#CAREFUL! IF THERE EXISTS A PREVIOUS DATABASE IT WILL DELETE IT !!!
use strict;
use warnings;
use DBI;

`rm checksum_db.sqlite`;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
$dbh->do(
    "CREATE TABLE checksums (
        id INTEGER PRIMARY KEY,
        checksum VARCHAR(42),
        size INTEGER,
        last_date_modified DATE,
        name VARCHAR(200) UNIQUE,
        is_dir VARCHAR(1),
        is_file VARCHAR(1),
        is_link VARCHAR(1),
        UNIQUE (checksum,name)
    );"
);
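If the CREATE TABLE fails (say, the database file is not writable), the script above continues silently. A minimal sketch of the same connect call with DBI's standard RaiseError attribute switched on, so any failed statement dies with a diagnostic instead:

use strict;
use warnings;
use DBI;

#RaiseError makes every failed DBI call die with a message,
#so a broken CREATE TABLE cannot pass unnoticed
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=checksum_db.sqlite", "", "",
    { RaiseError => 1, AutoCommit => 1 },
) or die $DBI::errstr;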

config.yml
minsize: 64
directories:
  - path: /usr
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /home/spx2/perlhobby
    dir: 1
    file: 1
    link: 0
    regex: .*
  - path: /lib
    dir: 1
    file: 1
    link: 0
    regex: .*
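For reference, LoadFile turns the file above into a single hashref: a minsize key plus an arrayref of per-directory hashes. A small sketch of reading it back, using nothing beyond what YAML.pm documents:

use strict;
use warnings;
use YAML qw/LoadFile/;

my $config = LoadFile('config.yml');
print $config->{minsize}, "\n";    #prints 64
for my $dir (@{ $config->{directories} }) {
    printf "%s (dir=%s file=%s link=%s regex=%s)\n",
        @{$dir}{qw/path dir file link regex/};
}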

build_database.pl

#!/usr/bin/perl
#this will be used just for the first run
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);

#the is_* columns record what the name column stands for (link, dir or file)
sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    $dbh->do(
        sprintf 'INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link) VALUES ("%s","%s","%s","%s","%s","%s","%s");',
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}

sub delete_from_db { #remains to be completed
    my ($name)=@_;
}

sub file2sha1 {
    my $file = $_[0];
    #directories get an empty checksum; have to find out whether to prune
    #when a directory that doesn't match the regex is found
    return '' if -d $file;
    open my $f, '<', $file or do { warn "cannot open $file: $!"; return '' };
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    return $sha1->hexdigest;
}

sub process_file {
    my $dir_configs = $_[0];
    #optimisation idea: use -d -l -f -s just once, for both the return tests and add_to_db
    #if the current "file" (unix terminology) is a directory and the yaml
    #configuration tells us to eliminate directories from the search,
    #we do so by returning from the callback
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if (-s $File::Find::name || 0) < $config->{minsize};
    unless ($File::Find::name =~ /$dir_configs->{regex}/) {
        if (-d $File::Find::name) {
            $File::Find::prune = 1;
        }
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks) = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    add_to_db(file2sha1($File::Find::name), $last_modif_time,
              -s $File::Find::name, $File::Find::name);
}

for my $searched_dir_hash (@{ $config->{directories} }) {
    #we skip the entry if it does not exist or is not a directory
    next unless -e $searched_dir_hash->{path} && -d $searched_dir_hash->{path};
    #pass the yml configuration for the directory being searched to process_file
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
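A note on the INSERTs: building SQL with sprintf breaks as soon as a file name contains a double quote. A sketch of add_to_db rewritten with DBI placeholders (standard DBI, same table and columns as above; the handle name $insert_sth is mine), which quote the values for you and let SQLite reuse one prepared statement across the whole scan:

#prepared once, outside the File::Find callback
my $insert_sth = $dbh->prepare(
    "INSERT INTO checksums
        (checksum,size,last_date_modified,name,is_dir,is_file,is_link)
     VALUES (?,?,?,?,?,?,?)"
);

sub add_to_db {
    my ($checksum, $last_modif_time, $size, $name) = @_;
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    #execute binds each value safely; quotes in $name are no longer a problem
    $insert_sth->execute(
        $checksum, $size, $last_modif_time->ymd,
        $name, $is_dir, $is_file, $is_link,
    );
}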


update_database.pl

#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);

sub add_to_db {
    my ($checksum,$last_modif_time,$size,$name)=@_;
    #maybe calculating is_* should be done in process_file
    my $is_dir  = (-d $name) ? 'Y' : 'N';
    my $is_file = (-f $name) ? 'Y' : 'N';
    my $is_link = (-l $name) ? 'Y' : 'N';
    $dbh->do(
        sprintf 'INSERT INTO checksums (checksum,size,last_date_modified,name,is_dir,is_file,is_link) VALUES ("%s","%s","%s","%s","%s","%s","%s");',
        $checksum,
        $size,
        $last_modif_time->ymd,
        $name,
        $is_dir,
        $is_file,
        $is_link
    );
}

sub update {
    my ($name,$checksum,$last_modif_time)=@_;
    #the values must follow the order of the placeholders:
    #checksum, last_date_modified, name
    $dbh->do(
        sprintf 'UPDATE checksums SET checksum="%s",last_date_modified="%s" WHERE name="%s";',
        $checksum, $last_modif_time->ymd, $name
    );
}

sub find_or_update {
    my ($name,$last_modif_time)=@_;
    my $s = $dbh->prepare(sprintf 'SELECT last_date_modified FROM checksums WHERE name="%s";', $name);
    $s->execute;
    my $results = $s->fetchall_arrayref;
    #fetchall_arrayref always returns a reference, so test for rows, not for truth
    if (@$results) {
        #found it in the db
        return 2 if $last_modif_time->ymd eq $results->[0]->[0]; #the entry is up to date
        update($name, file2sha1($name), $last_modif_time);
        return 1; #the entry was stale and has been refreshed
    }
    return 0; #the entry has not been found - the caller should add it
}

sub file2sha1 {
    my $file = $_[0];
    return '' if -d $file; #directories get an empty checksum
    open my $f, '<', $file or do { warn "cannot open $file: $!"; return '' };
    my $sha1 = Digest::SHA1->new;
    $sha1->addfile($f);
    return $sha1->hexdigest;
}

sub process_file {
    my $dir_configs = $_[0];
    return if -d $File::Find::name && ! $dir_configs->{dir};
    return if -l $File::Find::name && ! $dir_configs->{link};
    return if -f $File::Find::name && ! $dir_configs->{file};
    return if (-s $File::Find::name || 0) < $config->{minsize};
    unless ($File::Find::name =~ /$dir_configs->{regex}/) {
        if (-d $File::Find::name) {
            $File::Find::prune = 1;
        }
        return;
    }
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks) = stat($File::Find::name);
    my $last_modif_time = DateTime->from_epoch(epoch => $mtime);
    #find out if the entry needs an update and refresh it if necessary;
    #find_or_update returns 0 only if it hasn't found the file in the checksum db
    unless (find_or_update($File::Find::name, $last_modif_time)) {
        add_to_db(file2sha1($File::Find::name), $last_modif_time,
                  -s $File::Find::name, $File::Find::name); #add it to db
    }
}

for my $searched_dir_hash (@{ $config->{directories} }) {
    next unless -e $searched_dir_hash->{path} && -d $searched_dir_hash->{path};
    find(
        { wanted => sub { process_file($searched_dir_hash) } },
        $searched_dir_hash->{path}
    );
}
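The prepare/execute/fetchall_arrayref dance in find_or_update can be shortened with DBI's selectrow_array, which also accepts bind values directly. A sketch keeping the same return convention as above (2 = up to date, 1 = refreshed, 0 = not in the db):

sub find_or_update {
    my ($name, $last_modif_time) = @_;
    #selectrow_array returns the first row's columns, or an empty list
    my ($stored_date) = $dbh->selectrow_array(
        "SELECT last_date_modified FROM checksums WHERE name = ?",
        undef, $name,
    );
    return 0 unless defined $stored_date;               #not in the db yet
    return 2 if $last_modif_time->ymd eq $stored_date;  #up to date
    update($name, file2sha1($name), $last_modif_time);
    return 1;                                           #stale entry refreshed
}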


build_duplicate_script.pl
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;

my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
open my $script, '>', 'duplicate_erase_script.sh' or die "cannot open output script: $!";

sub get_unique_checksums {
    #groups of size 1 cannot have duplicates, so keep only the checksums
    #shared by more than one name
    my $sql = "SELECT checksum FROM checksums GROUP BY checksum HAVING COUNT(*) > 1;";
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}

sub checksum2names {
    my ($checksum)=@_;
    my $sql = sprintf 'SELECT name FROM checksums WHERE checksum="%s";', $checksum;
    my $sth = $dbh->prepare($sql);
    $sth->execute;
    my $results = $sth->fetchall_arrayref;
    return map { $_->[0] } @{$results};
}

for my $checksum (get_unique_checksums()) {
    my @same_checksum = checksum2names($checksum);
    my $leader = shift @same_checksum; #take one element of the group aside, making it the leader
    print $script "# duplicates of $leader follow:\n";
    for my $name (@same_checksum) { #write (commented-out) commands that delete all the others
        print $script "# rm $name\n";
    }
}
close $script;
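Instead of one SELECT per checksum, the whole report can be pulled in a single statement with SQLite's GROUP_CONCAT aggregate. A sketch that prints the same report to stdout, assuming "|" never appears in a file name:

#one row per duplicate group: the checksum plus all names sharing it,
#joined with "|" (assumes "|" never occurs in a file name)
my $groups = $dbh->selectall_arrayref(
    "SELECT checksum, GROUP_CONCAT(name,'|')
       FROM checksums
      GROUP BY checksum
     HAVING COUNT(*) > 1"
);
for my $group (@$groups) {
    my ($checksum, $joined) = @$group;
    my ($leader, @duplicates) = split /\|/, $joined;
    print "# duplicates of $leader follow:\n";
    print "# rm $_\n" for @duplicates;
}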

EDIT: The latest version of this has mutated heavily due to the suggestions of the monks.