use strict;
use warnings;
use feature 'say';

use Path::Tiny;
use Test::More;

# Purpose: read ">sp"-delimited records, keep only the first record seen for
# each gene name (the value of the "GN=" field), and write the kept records
# to a log file under /tmp. The sample input lives in the __DATA__ section
# and the script verifies its own output with Test::More.
#
# When not testing, something like this:
# my $in_file = path("/path/to/file");
# my $fh      = $in_file->openr;

my $script   = path(__FILE__);
my $out_file = path(
    sprintf( '/tmp/%s-%s.log', $script->basename('.pl'), time )
);

my %seen;             # gene name => true once a record for it has been kept
my $count_in  = 0;    # records read
my $count_out = 0;    # records written

{
    # Set the line separator locally to a string that matches
    # the break between your "paragraphs".
    local $/ = "\n>sp";

    while ( my $record = <DATA> ) {    # <$fh>
        $count_in++;

        # The separator is read as the tail of each record, so strip it
        # (and the final record's trailing newline) ...
        chomp $record;
        $record =~ s/\n+\z//;

        # ... and give the ">sp" back to every record after the first,
        # whose prefix was consumed as part of the previous separator.
        $record = ">sp$record" if $count_in > 1;

        # Records without a GN= field are not written, so they also count
        # toward the "removed" figure reported below.
        next unless $record =~ /GN=(?<gene>\S*)/;
        next if $seen{ $+{gene} }++;

        $count_out++;
        $out_file->append("$record\n");
    }
}

say ' Records processed: ' . $count_in;
say 'Duplicates removed: ' . ( $count_in - $count_out );

#----------------------------------------------------#
## Test. Always test.

is( $count_in,  5, 'Input read OK' );
is( $count_out, 3, 'Correct number of records kept' );

my @written = $out_file->lines;

is( scalar( grep { /GN/ } @written ), 3, 'File has correct count for "GN"' );

for my $value (qw/ TUFA Blorgle Blargle /) {
    is(
        scalar( grep { /\Q$value\E/ } @written ),
        1,
        sprintf( 'File has correct count for "%s"', $value )
    );
}

done_testing;

__DATA__
>sp|O24310|EFTU_PEA Elongation factor Tu, chloroplastic OS=Pisum sativum OX=3888 GN=TUFA PE=2 SV=1
MALSSTAATTSSKLKLSNPPSLSHTFTASASASVSNSTSFR
>sp|O24310|EFTU_PEA Elongation factor Tu, chloroplastic OS=Pisum sativum OX=3888 GN=Blorgle PE=2 SV=1
MALSSTAATTSSKLKLSNPPSLSHTFTASASASVSNSTSFR
>sp|Q43467|EFTU1_SOYBN Elongation factor Tu, chloroplastic OS=Glycine max OX=3847 GN=TUFA PE=3 SV=1
MAVSSATASSKLILLPHASSSSSLNSTPFRSSTTNTHKLTP
This is a dupe but has extra content
>sp|O24310|EFTU_PEA Elongation factor Tu, chloroplastic OS=Pisum sativum OX=3888 GN=Blargle PE=2 SV=1
MALSSTAATTSSKLKLSNPPSLSHTFTASASASVSNSTSFR
>sp|O24310|EFTU_PEA Elongation factor Tu, chloroplastic OS=Pisum sativum OX=3888 GN=Blorgle PE=2 SV=1
MALSSTAATTSSKLKLSNPPSLSHTFTASASASVSNSTSFR