A tidied up version. Not perfect and not enough subs. Dealing with multi input and multi output subs is something I haven't mastered yet but I've learnt a lot tonight.
###This script takes lines in the following format
#Report Type:Defined Patient Summary
#Ordered By: Numeric
#Report Name:CKD all details report
#Divisions: None
#Creation Date: 27/09/2010 09:26:16
#Search Name: CKD audit population
#Search Population: 2238
#
#Mr Mickey Mouse - Patient No: 5
#
#Patient ID :5
#Date Of Birth :07/02/1960
#Sex :M
#NHS Number :1234567890
#Postcode :LL53 6EA
#AllValues
#Assigned Date,Term,Description,Value
#15/09/2010,Serum calcium,,2.43 mmol/L
#15/09/2010,Serum inorganic phosphate,,0.89 mmol/L
#15/09/2010,Serum total protein,,63 g/L
#15/09/2010,Serum albumin,,43 g/L
#First Outfile Individual test results
#5,Serum calcium,15/09/2010,2.43
#5,Serum inorganic phosphate,15/09/2010,0.89
#5,Serum total protein,15/09/2010,63
#5,Serum albumin,15/09/2010,43
#Second outfile - a register of all tests ( no duplicates)
#Serum calcium
#5,Serum inorganic phosphate
#5,Serum total protein
#5,Serum albumin
#It creates a lookup file called lookup.csv in the format
#5,1234567890,LL53 6EA,07/02/1960,M
#Usage is the input file name, output file name
use strict;
my $ID;
my $nhs;
my $postcode;
my $dob;
my $sex;
my %hash = ();
my @keys;
my @sorted;
my @lines;
if ( $#ARGV != 2 ) { die "usage $0 InputFileName OutputFileNameForData
+ OutputFileNameForTestNames \n"; }
open( INFILE, "< $ARGV[0]" ) || die "Cannot open $ARGV[0] $!\n";
open( OUTFILE, "> $ARGV[1]" ) || die "Cannot open $ARGV[1] $!\n";
open( OUTFILE2, "> $ARGV[2]" ) || die "Cannot open $ARGV[2] $!\n";
my $path = "lookup.CSV";
open( OUTFILE3, "> $path" ) || die "Cannot open LOOKUP.CSV $!\n";
my $lookup = "";
#my $n=0; # just a debug variable
while (<INFILE>) {
#$n=$n+1;
my $line = $_;
if ( $line =~ m/^Patient ID.*?(\d+)/ ) {
$ID = $1;
my $blank = "";
print $ID. "\n";
%hash = (); #Find a new patient so blank everything out
}
elsif ( $line =~ m/^NHS.*?(\d+)/ ) {
chomp;
$nhs = $1;
}
elsif ( $line =~ m/^Date Of Birth/ ) {
my $dob = $line;
$dob =~ s/Date Of Birth ://;
chomp $dob;
}
elsif ( $line =~ m/^Sex/ ) {
my $sex = $line;
$sex =~ s/Sex ://;
chomp $sex;
}
elsif ( $line =~ m/^Post/ ) {
$postcode = $line;
$postcode =~ s/Postcode :(\w+)/$1/;
chomp $postcode;
$lookup =
$lookup
. $ID . ","
. $nhs . ","
. $postcode . ","
. $dob . ","
. $sex . "\n"; #Postcode is the last value in the identifier
+ initial array, so at this point we can print out the lookup
#$age = <STDIN>; # pause
#print $lookup;
}
elsif ( $line =~ m/^\d\d\/\d\d.*?/ ) { #if a line is a value line
# print "$line";
my @names = split( ',', $line );
my $date = $names[0]; #date of test
my $index = $names[1]; # is test name
my $nonsense = $names[2]; # not used
my $val = $names[3]; # value
my $test = $names[3];
chomp $val;
chomp $test;
$test = tidyup_raw_data_test($test);
$index = tidyup_raw_data_index($index);
my $testother = "$test" . "0";
$index =~ s/\s$testother\s.*//;
$index =~ s/\s$test\s.*//;
$index =~ s/$testother//;
$index =~ s/$test//;
$index = delete_superfluous_testname_data ( $index );
$val = delete_units ( $val );
# print "-$test- substituted -$index- \n";
if ( $val != "" ) { #if the value is not blank
$hash{$index} = $hash{$index} . $val . ",";
#print $n."line no".$test."....test....".$val."\n"; a debug
#my $age = <STDIN>; # a pause for debug
push @lines,
$ID . ","
. $index . ","
. $date . ","
. $val
. "\n"; #HanDY TO TEST
push @keys, $index . "\n";
}
}
}
close(INFILE);
#print "Sorting and collating records.....takes some time.....\n Wait
+for this window to close before continuing \n ";
my @unique = ();
my %seen = ();
@unique = grep { !$seen{$_}++ } @keys;
@sorted = sort { $a cmp $b } @unique;
#print @sorted;
print OUTFILE @lines;
print "\n$. lines of data Processed. ";
close(OUTFILE);
print OUTFILE2 @sorted;
close(OUTFILE2);
print OUTFILE3 $lookup;
close(OUTFILE3);
sub tidyup_raw_data_test {
my $name=shift;
$name =~ s/\s$//;
$name =~ s/\<|(\)|\()|\/|\^|\.|\*|//g; #substitute all ope
+rators
$name =~ s/\s0//;
return $name;
}
sub delete_units { # Deletes units from the end of a value
my $name=shift;
$name =~s/10\^.*\/L//; #Why on earth has the ^ replaced * in t
+he dataset. ah well
$name =~ s/mL\/min\/1.73m2//;
$name =~ s/10\*.*//;
$name =~ s/109L//;
$name =~ tr/[0-9][.]//cd;
return $name;
}
sub delete_superfluous_testname_data {
my $name= shift;
$name =~ s/\s+$//;
$name =~ s/\.$//;
$name =~ s/\s$//;
$name =~ s/(OE -\w*)\d/$1/;
$name =~ s/\s\d+\scm$//;
$name =~ s/\s+$//;
return $name;
}
sub tidyup_raw_data_index {
my $name= shift;
$name =~ s/(\d)(\.00)(\s)/$1.$3/; #delet .00s
$name =~ s/(\d\.\d)(0)(\s)/$1.$3/; #delet .x0s
$name =~ s/(\d)(\.0)(\s)/$1.$3/; #delet e.0s
$name =~ s/\s(\.\d)/0.$1/; #add a 0 in front of a period This
+ is so that the units match the old units and that replace works
$name =~ s/\<|(\)|\()|\/|\^|\.|\*|//g;
return $name;
}