perlquestion
bojinlund
<h4>Background</h4>
<p>I want to store, in a SQLite database, information about all files in a number of directory trees. Example of information are file path, size, creation date and checksum (fingerprint). To do this I need to sweep the directory trees by accessing each file in each nested subdirectory under a specified root.</p>
<p>Below follows a script using my current function to sweep a number of file trees. I am using Windows 10.
Sweeping Windows C:/ my FS_sweep visits about 2700 node/s and using File::Find visits 2400. File::Find also generates many “Can't opendir( … ): Invalid argument” warnings.</p>
<h4>Questions</h4>
<p>- Which type of files and directories is the globe function returning?</p>
<p>- How can I improve my sweep function?</p>
<code>
use strict;
use warnings;
use 5.010;
use Benchmark qw(:all);
my @dir_skip = ( '$RECYCLE.BIN', 'System Volume Information', 'Config.Msi' );
my $dir_skip = join '|', map { quotemeta } @dir_skip;
my $dir_skip_regexp = qr {$dir_skip\$};
sub FS_sweep {
my $start_nod_arr_ref = shift;
my $sub_ref = shift; # callback
my @to_do = @$start_nod_arr_ref;
my $cnt = 0;
my $t0 = Benchmark->new;
my $sub_log = sub {
warn '!! ', shift, "\n", ' ' x 12, "cnt: $cnt\n";
};
while ( my $nod = shift @to_do ) {
if (1) { $sub_log->($nod) if not $cnt % 10000; }
$cnt++;
my $f_rv = -f $nod;
if ( !defined $f_rv ) {
$sub_log->("ERROR <$nod> No such file or directory $!");
next;
}
if ($f_rv) {
my @rv;
# using the special filehandle consisting of a solitary underline
if ( not( @rv = stat( (_) ) ) ) {
$sub_log->("ERROR Can't stat <$nod> $!");
}
else {
my $stat_ref = \@rv;
$sub_ref->( $nod, $stat_ref ); # call callback
}
}
elsif ( -d (_) ) {
if ( $nod =~ m{$dir_skip_regexp} ) {
$sub_log->("SKIPING DIR $nod");
next;
}
my @nod = (
grep { !m{[/][.][.]?$} } glob( '"' . $nod . '/.*"' ),
glob( '"' . $nod . '/*"' )
);
unshift @to_do, @nod;
}
else { $sub_log->("ERROR? $nod"); }
}
if (1) {
my $td = timediff( Benchmark->new, $t0 );
my $node_per_second = $td->cpu_p > 0 ? $cnt / $td->cpu_p : -1;
warn sprintf "\n!! FS_sweep DONE nodes: %d 1/s: %d\n",
$cnt, $node_per_second;
}
}
my @output;
sub FS_file_path {
my $file_path = shift;
my $stat_arr_ref = shift;
push @output, "$file_path";
}
sub FS_file_big {
my $file_path = shift;
my $stat_arr_ref = shift;
my $size = $stat_arr_ref->[7];
push @output, "BIG $file_path size: $size\n" if $size > 100000000;
}
warn "\n!! FS_file_path START";
FS_sweep( ['C:\Program Files\WindowsPowerShell'], \&FS_file_path );
warn "\n!! FS_file_path:\n", join "\n", @output[ 0 .. 10 ];
@output = ();
warn "\n!! FS_file_big START";
foreach my $dev (qw{ C D P Q R S }) {
my $t0 = Benchmark->new;
FS_sweep( ["$dev:"], \&FS_file_big );
my $td = timediff( Benchmark->new, $t0 );
warn sprintf "!! Device %s: %s \n\n", $dev, timestr($td);
}
warn "\n!! FS_file_big:\n", join "\n", @output;
@output = ();
</code>