http://qs321.pair.com?node_id=11145626

sroux has asked for the wisdom of the Perl Monks concerning the following question:

Dear monks,

I am working on a script for data migration purpose, it is about mapping accounts and others fields through 2 mapping tables then logging unmapped items for audit/tracing purposes. It takes ~1h30 for ~1 million rows. I am looking for some advice in order to optimize the script, quick wins or blatant code optimization advice, I am not looking for complete code rewriting (even if that would be very cool).

Thank you for any input and advice!

my $begin_time = time(); #Specify files to map #my @files = ("dataset1.dat", "dataset2.dat"); foreach $files (@ARGV) { #Load file into array (start) my $begin_time = time(); print "Processing file: ".$files."\n"; open my $handle, '<', $files; chomp( my @lines = <$handle> ); #Exclude @lines2 = grep( !/(Final|ConvP&L|ConvBsheet)/, @lines ); @lines = (); @lines = @lines2; close $handle; #Load file into array (end) my $end_time = time(); printf( "Load file into array: "."%.2f\n", $end_time - $begin_time + ); #Mapping process (start) my $begin_time = time(); # Mapping 1 open(MAPPINGFILE1, "mapping3.txt") or die print "Error: not found +in specified path\n"; # Mapping 2 open(MAPPINGFILE2, "mapping4.txt") or die print "Error: not found +in specified path\n"; #$count++ while <MAPPINGFILE1>; foreach $line (@lines) { #Mapping des charges directes seek MAPPINGFILE1, 0, 0; while (<MAPPINGFILE1>) { chomp(); # Skip blank lines and comments next if /^(\s*(#.*)?)?$/; # Split source columns @source = split /\t/, $line; # Split mapping columns @mapping = split /\t/, $_; # Account is matching with source if ($line =~ m/$mapping[0]/) { #Account substitution if($mapping[2] eq "") { $line =~ s/$mapping[0]/"Compte cible non défini !" +/; } else { $line =~ s/$mapping[0]/$mapping[2]/; } #Mapping = target Unit, Alloc_ + Unit source if($mapping[3] ne "Unit source") { #Unit substitution if($mapping[3] eq "") { $line =~ s/$source[2]/"Unit cible non définie +!"\tALLOC_$source[2]/; } else { $line =~ s/$source[2]/$mapping[3]\tALLOC_$sour +ce[2]/; } push @lines2, $line; last; } else { #Mapping : source Unit, Alloc_ + source Unit $line =~ s/$source[2]/$source[2]\tALLOC_$source[2] +/; push @lines2, $line; last; } } elsif (eof(MAPPINGFILE1)) { #Mapping des charges indirectes seek MAPPINGFILE2, 0, 0; while (<MAPPINGFILE2>) { chomp(); # Skip blank lines and comments next if /^(\s*(#.*)?)?$/; # Split mapping columns (tab) @mapping = split /\t/, $_; # Mapping is matching if ($line =~ m/$mapping[0]/) { $line =~ s/$mapping[0]/$mapping[4]/; @source = split /\t/, $line; $line =~ s/$source[2]/$mapping[2]\tALLOC_$sour +ce[2]/; push @lines2, $line; last; } elsif (eof(MAPPINGFILE2)) { push @rejects, "Lignes non mappées (Account): +"."\t".$line; } } } } } close MAPPINGFILE1; close MAPPINGFILE2; #Suppress double quote for (@lines2) { s/"//g } for (@rejects) { s/"//g } #Replace empty by Missing for (@lines2) { s/\t(?=\t)/\t#MI/g } #Generic mapping for (@lines2) { s/A2022Local/ACTUAL;FY22;Working_Central;Input;Loc +al_YTD/g } for (@lines2) { s/A2022AjConsoLocal/ACTUAL;FY22;Working_Central;Ad +j_Conso;Local_YTD/g } for (@lines2) { s/A2022InEur/ACTUAL;FY22;Working_Central;Input;Eur +_ACT2022_Rate_YTD/g } for (@lines2) { s/A2022AjConso/ACTUAL;FY22;Working_Central;Adj_Con +so;Eur_ACT2022_Rate_YTD/g } for (@lines2) { s/A2022TxB22/ACTUAL;FY22;Working_Central;Input;Eur +_BUD2022_Rate_YTD/g } for (@lines2) { s/A2022TxB22AjConso/ACTUAL;FY22;Working_Central;Ad +j_Conso;Eur_BUD2022_Rate_YTD/g } for (@lines2) { s/A2021Local/ACTUAL;FY21;Working_Central;Input;Loc +al_YTD/g } for (@lines2) { s/A2021AjConsoLocal/ACTUAL;FY21;Working_Central;Ad +j_Conso;Local_YTD/g } for (@lines2) { s/A2021InEur/ACTUAL;FY21;Working_Central;Input;Eur +_ACT2021_Rate_YTD/g } for (@lines2) { s/A2021AjConso/ACTUAL;FY21;Working_Central;Adj_Con +so;Eur_ACT2021_Rate_YTD/g } for (@lines2) { s/A2021TxB22/ACTUAL;FY21;Working_Central;Input;Eur +_BUD2021_Rate_YTD/g } for (@lines2) { s/A2021TxB22AjConso/ACTUAL;FY21;Working_Central;Ad +j_Conso;Eur_BUD2021_Rate_YTD/g } for (@lines2) { s/A2020Local/ACTUAL;FY20;Working_Central;Input;Loc +al_YTD/g } for (@lines2) { s/A2020AjConsoLocal/ACTUAL;FY20;Working_Central;Ad +j_Conso;Local_YTD/g } for (@lines2) { s/A2020InEur/ACTUAL;FY20;Working_Central;Input;Eur +_ACT2020_Rate_YTD/g } for (@lines2) { s/A2020AjConso/ACTUAL;FY20;Working_Central;Adj_Con +so;Eur_ACT2020_Rate_YTD/g } for (@lines2) { s/A2020TxB22/ACTUAL;FY20;Working_Central;Input;Eur +_BUD2020_Rate_YTD/g } for (@lines2) { s/A2020TxB22AjConso/ACTUAL;FY20;Working_Central;Ad +j_Conso;Eur_BUD2020_Rate_YTD/g } for (@lines2) { s/B2022Local/BUDGET;FY22;Working_Central;Input;Loc +al_YTD/g } for (@lines2) { s/B2022AjConsoLoc/BUDGET;FY22;Working_Central;Adj_ +Conso;Local_YTD/g } for (@lines2) { s/B2022AjTBLocal/BUDGET;FY22;Working_Central;Adj_C +onso;Local_YTD/g } for (@lines2) { s/B2022InEur/BUDGET;FY22;Working_Central;Input;Eur +_BUD2022_Rate_YTD/g } for (@lines2) { s/B2022AjConso/BUDGET;FY22;Working_Central;Adj_Con +so;Eur_BUD2022_Rate_YTD/g } for (@lines2) { s/B2022AjTB/BUDGET;FY22;Working_Central;Adj_Conso; +Eur_BUD2022_Rate_YTD/g } #Add header unshift @lines2, "Scenario;Year;Audit;Version;Vision;Entity;Unit;U +nit_Alloc;Account;Jan;Feb;Mar;Apr;May;Jun;Jul;Aug;Sep;Oct;Nov;Dec"; #Set common delimiter for (@lines2) { s/(\t|;)/\t/g } for (@rejects) { s/(\t|;)/\t/g } #Mapping process (end) my $end_time = time(); printf( "Mapping process: "."%.2f\n", $end_time - $begin_time ); #Output (start) my $begin_time = time(); open my $handle2, ">", $files."_output.txt"; print $handle2 join( "\n", @lines2 ); close $handle2; open my $handle3, ">", $files."_output.err"; print $handle3 join( "\n", @rejects ); close $handle3; #Output (end) my $end_time = time(); printf( "Output: "."%.2f\n", $end_time - $begin_time ); }