http://qs321.pair.com?node_id=360903

#This is an excerpt of the input text file:

CHROMOSOME: 1 BAC: F7G19 MODEL_NAME: 68275.m00846 BAC_LOCUS: F7G19_2 CHROMO_LOCUS: At1g09100 COMMON_NAME: putative 26S protease regulatory subunit 6A COMMENT: Similar to probable Mg-dependent ATPase (pir S56671). +ESTs gb|T46782,gb|AA04798 come from this gene PSEUDOGENE: NO MODEL_BEGIN: 2939304 MODEL_END: 2936520 ORIENTATION: R_0 NUM_EXONS: 10 .....EXON 1 (COMB_EXON): 68275.e04959 pos: 2939304 - 2939 +146 EXON 1 : LEFT_UTR : pos: 2939304 - 2939263 EXON 1 : CDS : pos: 2939262 - 2939146 .....EXON 2 (CDS) : 68275.e04958 pos: 2938801 - 2938 +685 .....EXON 3 (CDS) : 68275.e04957 pos: 2938445 - 2938 +338 .....EXON 4 (CDS) : 68275.e04956 pos: 2938219 - 2938 +028 .....EXON 5 (CDS) : 68275.e04955 pos: 2937929 - 2937 +738 .....EXON 6 (CDS) : 68275.e04954 pos: 2937641 - 2937 +523 .....EXON 7 (CDS) : 68275.e04953 pos: 2937445 - 2937 +358 .....EXON 8 (CDS) : 68275.e04952 pos: 2937276 - 2937 +157 .....EXON 9 (CDS) : 68275.e04951 pos: 2936976 - 2936 +836 .....EXON 10 (COMB_EXON): 68275.e04950 pos: 2936756 - 2936 +520 EXON 10 : CDS : pos: 2936756 - 2936679 EXON 10 : RIGHT_UTR : pos: 2936678 - 2936520 ----- CHROMOSOME: 1 BAC: F7G19 MODEL_NAME: 68275.m00847 BAC_LOCUS: F7G19_1 CHROMO_LOCUS: At1g09130 COMMON_NAME: unknown protein COMMENT: Similar to ATP-dependent Clp protease (gb D90915). 
EST + gb|N6546$ PSEUDOGENE: NO MODEL_BEGIN: 2942221 MODEL_END: 2940342 ORIENTATION: R_1 NUM_EXONS: 8 .....EXON 1 (CDS) : 68275.e04967 pos: 2942221 - 2941 +913 .....EXON 2 (CDS) : 68275.e04966 pos: 2941652 - 2941 +551 .....EXON 3 (CDS) : 68275.e04965 pos: 2941477 - 2941 +355 .....EXON 4 (CDS) : 68275.e04964 pos: 2941276 - 2941 +217 .....EXON 5 (CDS) : 68275.e04963 pos: 2941058 - 2940 +962 .....EXON 6 (CDS) : 68275.e04962 pos: 2940874 - 2940 +801 .....EXON 7 (CDS) : 68275.e04961 pos: 2940580 - 2940 +527 .....EXON 8 (CDS) : 68275.e04960 pos: 2940452 - 2940 +342 ----- CHROMOSOME: 1 BAC: T12M4 MODEL_NAME: 68275.m00848 BAC_LOCUS: T12M4_19 CHROMO_LOCUS: At1g09140 COMMON_NAME: putative SF2 ASF splicing modulator, Srp30 COMMENT: similar to GB:CAB42558 PSEUDOGENE: NO MODEL_BEGIN: 2945944 MODEL_END: 2943169 ORIENTATION: R_2 NUM_EXONS: 11 .....EXON 1 (COMB_EXON): 68275.e04978 pos: 2945944 - 2945 +736 EXON 1 : LEFT_UTR : pos: 2945944 - 2945823 EXON 1 : CDS : pos: 2945822 - 2945736 .....EXON 2 (CDS) : 68275.e04977 pos: 2945612 - 2945 +547 .....EXON 3 (CDS) : 68275.e04976 pos: 2945460 - 2945 +386 .....EXON 4 (CDS) : 68275.e04975 pos: 2945306 - 2945 +204 .....EXON 5 (CDS) : 68275.e04974 pos: 2945118 - 2945 +072 .....EXON 6 (CDS) : 68275.e04973 pos: 2944964 - 2944 +907 .....EXON 7 (CDS) : 68275.e04972 pos: 2944783 - 2944 +734 .....EXON 8 (CDS) : 68275.e04971 pos: 2944590 - 2944 +534 .....EXON 9 (CDS) : 68275.e04970 pos: 2944460 - 2944 +382 .....EXON 10 (CDS) : 68275.e04969 pos: 2944250 - 2944 +173 .....EXON 11 (CDS) : 68275.e04968 pos: 2943230 - 2943 +169 -----

Below is the loading program. The database has separate tables for genes, artificial chromosome data ("achroms" -- this part is OK, so I didn't include its input format), exons, subexons, and a relateGAC table that holds the possible many-to-many relationships between genes and artificial chromosomes. There are over 26,000 gene records in the gene text file, and there are more in the next update that TIGR put out recently. How can this script be optimized for speed? It took a colossal 17 hrs to run this weekend, but it works. It just gets slower with each gene/exon/subexon record. A lot of the documentation is meant for my non-programmer-type committee members.

#!/usr/bin/perl -w
################################################################
# Programmer:  biograd
# Program:     TIGRFillTables.plx
# Date begun:  8/17/01
# Description: This is a perl script made to automatically take
#              information from previously generated files
#              containing _Arabidopsis thaliana_ genomic data
#              from the TIGR database and place it into a local
#              mysql database.  Since it is intended for automation,
#              a user name and password are hard-coded.
# Input:       -Files formatted by Dr. Johns, such as
#              /arabidopsis/bac_info_08102001.txt and
#              /arabidopsis/gene_info_08102001.txt
# Effect:      -Successful: information will be parsed from the
#              files and loaded into correct database areas.
#
# Outline:     I.   CONNECT
#              II.  ACHROMS
#                   A. Prepare statements to be used
#                   B. Loop through each bacfile record
#                      1. CleanData()
#                      2. ParseAChroms()
#                      3. Check for matching record in db
#                         a. If match, then update record
#                         b. If no match, insert record
#              III. GENES, RELATEGAC, EXONS
#                   A. Prepare statements for filling:
#                      1. Genes
#                      2. RelateGAC (genes to artificial chroms)
#                      3. Exons
#                   B. Loop through each genefile record
#                      1. CleanData()
#                      2. ParseGenes()
#                      3. Check for matching gene record in db
#                         a. If match, then update gene
#                         b. If no match, insert gene info
#                      4. Check if an AC is listed in the gene info
#                         a. If yes, check relateGAC table for an
#                            exact match of gene with this AC.
#                            1) If match, increment count by 1
#                            2) If no match, insert relation into
#                               relateGAC table.
#                      5. Loop through the record's exon lines
#                         a. ParseExon(): check for matching exon
#                            1) If match, update exon
#                            2) If no match, insert exon info
#                         b. ParseSubExon(): check for matching sub-exon
#                            1) If match, update sub-exon
#                            2) If no match, insert sub-exon
#              IV.  DISCONNECT
#              V.   SUBROUTINE DEFINITIONS
#                   A. CleanData()    - drops empty lines and the
#                                       record separator, returns lines
#                   B. ParseAChroms() - returns artificial chromosome fields
#                   C. ParseGenes()   - returns gene fields
#                   D. ParseExon()    - returns one exon's fields
#                   E. ParseSubExon() - returns one sub-exon's fields
################################################################

use strict;
use DBI;

# Records in both input files are separated by five dashes instead of
# a newline character.
my $RECORD_SEP = "-----";

# These are the test files.  "bac.ht" is a smaller file that holds the
# head and tail of "bac_info_08102001.txt"; "genes.ht" is the matching
# sample of "gene_info_08102001.txt".
my $bac_file  = "bac.ht";
my $gene_file = "genes.ht";

### CONNECT ######################################################
# RaiseError makes every failing DBI call die, so none of the
# execute() calls below need an explicit error check.
my $dbh = DBI->connect(
    "DBI:mysql:arabidopsis", "", "",
    { PrintError => 0, RaiseError => 1 }
) or die "\nCouldn't connect to database: $DBI::errstr\n";

### PREPARE SQL FOR ACHROMS TABLE ####################################
# These statements are sent to the database to be parsed and planned
# before any data is touched.  Question marks are where data will be
# "filled in" (bound) during each execution inside the loop.
my $sth_achroms = $dbh->prepare("
    INSERT INTO achroms (type, ac_id, length, chr_num, chr_strt,
                         chr_end, bac_strt, bac_end, orient)
    VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ? ) ");

my $sth_match_achrom = $dbh->prepare("
    SELECT ac_id FROM achroms WHERE ac_id = ? ");

# Run in case an artificial chromosome is already listed in the
# database but could have new data to be added or changed.
my $sth_update_achroms = $dbh->prepare("
    UPDATE achroms
    SET type = ?, length = ?, chr_num = ?, chr_strt = ?, chr_end = ?,
        bac_strt = ?, bac_end = ?, orient = ?
    WHERE ac_id = ? ");

### FILL ACHROMS TABLE ###########################################
{
    # Read whole records; the change is confined to this block.
    local $/ = $RECORD_SEP;

    open(my $bac_fh, '<', $bac_file)
        or die "Couldn't open bacfile: $!";

    while (my $record = <$bac_fh>) {
        # Blank lines are removed; an empty result means there is no
        # record here, which avoids null rows in the database.
        my @newrecord = CleanData($record);
        next unless @newrecord;

        # Elements are ( type, ac_id, length, chr_num, chr_strt,
        #                chr_end, bac_strt, bac_end, orient )
        my @achrom = ParseAChroms(\@newrecord);

        if ( _first_row($sth_match_achrom, $achrom[1]) ) {
            # Already listed: refresh its data.
            $sth_update_achroms->execute(
                $achrom[0], $achrom[2], $achrom[3], $achrom[4],
                $achrom[5], $achrom[6], $achrom[7], $achrom[8],
                $achrom[1]
            );
        }
        else {
            $sth_achroms->execute(
                $achrom[0], $achrom[1], $achrom[2], $achrom[3],
                $achrom[4], $achrom[5], $achrom[6], $achrom[7],
                $achrom[8]
            );
        }
    }
    close $bac_fh;
}
### END ACHROMS TABLE FILL #########################################

### PREPARE SQL FOR GENES, RELATEGAC, AND EXONS TABLES ################
# NOTE(review): every gene, exon and sub-exon costs at least one SELECT
# plus one INSERT/UPDATE.  If genes.tigr_id, genes.chrom_locus,
# relateGAC(gene_id, ac_id), exons(gene_id, exon_num) and
# subexons(gene_id, exon_num, sub_type) are not indexed, each lookup
# scans a table that grows with every record, which would explain a run
# that gets slower as it goes -- confirm against the schema.

# GENES  Find the gene already stored for this TIGR model, if any.
my $sth_match_gene = $dbh->prepare("
    SELECT gene_id FROM genes WHERE tigr_id = ? ");

# GENES  Put new data into a new record.
my $sth_genes = $dbh->prepare("
    INSERT INTO genes (chrom_num, tigr_id, bac_locus, pseudogene,
                       chrom_locus, locus_start, locus_end, orientation,
                       read_frame, exon_count, product, notes, ac_id)
    VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? ) ");

# GENES  Used when a gene is already listed but new data about it
# might be provided.
my $sth_update_genes = $dbh->prepare("
    UPDATE IGNORE genes
    SET chrom_num = ?, bac_locus = ?, pseudogene = ?, chrom_locus = ?,
        locus_start = ?, locus_end = ?, orientation = ?, read_frame = ?,
        exon_count = ?, product = ?, notes = ?, ac_id = ?
    WHERE tigr_id = ? ");

# GENES  Find the unique gene number that goes with a chromosome locus.
my $sth_select_gene_id = $dbh->prepare("
    SELECT gene_id FROM genes WHERE chrom_locus = ? ");

# RELATEGAC  Is this gene / artificial chromosome pair already related?
my $sth_find_match = $dbh->prepare("
    SELECT gene_id, ac_id FROM relateGAC
    WHERE gene_id = ? AND ac_id = ? ");

# RELATEGAC  Insert a new relation.  The "count" field starts at 1 and
# is increased any time a duplicate relationship is found.
my $sth_rgac = $dbh->prepare("
    INSERT INTO relateGAC (gene_id, ac_id, count) VALUES ( ?, ?, 1 ) ");

# RELATEGAC  Read and write the count of ONE relation.  Both statements
# are keyed on the (gene_id, ac_id) pair so that a gene related to
# several artificial chromosomes keeps a separate count for each.
my $sth_getcount = $dbh->prepare("
    SELECT count FROM relateGAC WHERE gene_id = ? AND ac_id = ? ");

my $sth_updatecount = $dbh->prepare("
    UPDATE relateGAC SET count = ? WHERE gene_id = ? AND ac_id = ? ");

# EXONS
my $sth_find_exon = $dbh->prepare("
    SELECT gene_id, exon_num FROM exons
    WHERE gene_id = ? AND exon_num = ? ");

my $sth_find_subexon = $dbh->prepare("
    SELECT sub_type, sub_locus_start FROM subexons
    WHERE gene_id = ? AND exon_num = ? AND sub_type = ? ");

my $sth_update_exon = $dbh->prepare("
    UPDATE IGNORE exons
    SET type = ?, tigr_id = ?, locus_start = ?, locus_end = ?
    WHERE gene_id = ? AND exon_num = ? ");

my $sth_update_subexon = $dbh->prepare("
    UPDATE IGNORE subexons
    SET sub_locus_start = ?, sub_locus_end = ?
    WHERE gene_id = ? AND exon_num = ? AND sub_type = ? ");

my $sth_insert_exon = $dbh->prepare("
    INSERT INTO exons (gene_id, exon_num, type, tigr_id,
                       locus_start, locus_end)
    VALUES ( ?, ?, ?, ?, ?, ? ) ");

my $sth_insert_subexon = $dbh->prepare("
    INSERT INTO subexons (gene_id, exon_num, sub_type,
                          sub_locus_start, sub_locus_end)
    VALUES ( ?, ?, ?, ?, ? ) ");

### FILL GENES, RELATEGAC AND EXONS TABLES ########################
{
    local $/ = $RECORD_SEP;

    open(my $gene_fh, '<', $gene_file)
        or die "Couldn't open infile: $!";

    # Counter printed so we can see how many gene records are processed.
    my $n = 0;

    RECORD:
    while (my $record = <$gene_fh>) {
        print "\nz$n\n";
        $n++;

        my @newrecord = CleanData($record);
        next RECORD unless @newrecord;

        ### GENES ##################################################
        # Elements are ( chrom_num, tigr_id, bac_locus, pseudogene,
        #   chrom_locus, locus_start, locus_end, orientation,
        #   read_frame, exon_count, product, notes, ac_id )
        my @gene = ParseGenes(\@newrecord);

        my ($known_id) = _first_row($sth_match_gene, $gene[1]);
        my $fallback_id;
        if ( defined $known_id ) {
            # This TIGR reference is listed already, so refresh it.
            # The reference number itself is not updated.
            $sth_update_genes->execute(
                $gene[0], $gene[2],  $gene[3],  $gene[4], $gene[5],
                $gene[6], $gene[7],  $gene[8],  $gene[9], $gene[10],
                $gene[11], $gene[12], $gene[1]
            );
            $fallback_id = $known_id;
        }
        else {
            $sth_genes->execute(
                $gene[0], $gene[1], $gene[2],  $gene[3],  $gene[4],
                $gene[5], $gene[6], $gene[7],  $gene[8],  $gene[9],
                $gene[10], $gene[11], $gene[12]
            );
            $fallback_id = $sth_genes->{mysql_insertid};
        }

        # Find the unique gene id by matching the locus name.  Usually
        # this is the gene just written, but there may be duplicates;
        # looking it up avoids duplicating relations and exons.  It is
        # resolved for EVERY record, so a gene without a BAC no longer
        # has its exons filed under the previous record's gene id.
        my ($geneid) = _first_row($sth_select_gene_id, $gene[4]);
        $geneid = $fallback_id unless defined $geneid;

        unless ( defined $geneid ) {
            my $label = defined $gene[1] ? $gene[1] : '(unknown model)';
            warn "No gene id could be determined for $label; "
               . "its BAC relation and exons were not processed\n";
            next RECORD;
        }

        ### RELATEGAC ##############################################
        # $gene[12] is the bac name from the data, when there is one.
        if ( $gene[12] ) {
            my $current_bac = $gene[12];

            if ( _first_row($sth_find_match, $geneid, $current_bac) ) {
                # Relation already stored: bump its count instead of
                # inserting another row.
                my ($count) =
                    _first_row($sth_getcount, $geneid, $current_bac);
                $count++;
                print "Gene id number is $geneid.\nCount is : $count\n";
                $sth_updatecount->execute($count, $geneid, $current_bac);
            }
            else {
                # Use the same id the existence check used; the last
                # insert id is stale whenever the gene was only updated.
                $sth_rgac->execute($geneid, $current_bac);
            }
        }

        ### EXONS AND SUBEXONS #####################################
        LINE:
        foreach my $line (@newrecord) {
            next LINE unless $line =~ m/EXON\b/;

            if ( $line =~ m/^\.{5}/ ) {
                # Five leading dots mark a whole-exon line.  Elements
                # are ( exon_num, type, tigr_id, locus_start, locus_end )
                my @exondata = ParseExon($line);
                unless ( defined $exondata[0] ) {
                    warn "Skipping unparsable exon line: $line\n";
                    next LINE;
                }

                if ( _first_row($sth_find_exon, $geneid, $exondata[0]) ) {
                    $sth_update_exon->execute(
                        $exondata[1], $exondata[2], $exondata[3],
                        $exondata[4], $geneid,      $exondata[0]
                    );
                }
                else {
                    $sth_insert_exon->execute(
                        $geneid,      $exondata[0], $exondata[1],
                        $exondata[2], $exondata[3], $exondata[4]
                    );
                }
            }
            else {
                # Elements are ( exon_num, type, locus_start, locus_end )
                my @exondata = ParseSubExon($line);
                unless ( defined $exondata[0] ) {
                    warn "Skipping unparsable sub-exon line: $line\n";
                    next LINE;
                }

                if ( _first_row($sth_find_subexon,
                                $geneid, $exondata[0], $exondata[1]) ) {
                    $sth_update_subexon->execute(
                        $exondata[2], $exondata[3],
                        $geneid, $exondata[0], $exondata[1]
                    );
                }
                else {
                    $sth_insert_subexon->execute(
                        $geneid,      $exondata[0], $exondata[1],
                        $exondata[2], $exondata[3]
                    );
                }
            }
        }    # End foreach over this record's lines
    }    # End this record's entire while loop
    close $gene_fh;
}
### END GENES / RELATEGAC / EXONS FILL ############################

### DISCONNECT NICELY ############################################
# Every SELECT handle is finished right after its row is fetched (see
# _first_row), so no statement is still active at this point.
$dbh->disconnect
    or die "Couldn't disconnect from database: $DBI::errstr\n";
exit;

### SUBROUTINES START HERE #######################################

# Execute a prepared SELECT with the given bind values and return its
# first row as a list (an empty list when nothing matched).  The fetch
# is done in list context because DBI does not define which column a
# scalar-context fetchrow_array returns for a multi-column row.
sub _first_row {
    my ($sth, @bind) = @_;
    $sth->execute(@bind);
    my @row = $sth->fetchrow_array();
    $sth->finish();
    return @row;
}

### CLEAN DATA ###################################################
# Takes one raw record (a string) and returns the list of its
# non-blank lines, with the trailing record separator removed.
sub CleanData {
    my ($record) = @_;
    my @cleanrecord;

    ### Separate the record by newlines.
    foreach my $line ( split /\n/, $record ) {
        # Strip the five-dash separator explicitly rather than relying
        # on chomp() and whatever $/ happens to hold in the caller.
        $line =~ s/\Q$RECORD_SEP\E\z//;

        ### Remove empty lines if they exist.
        next if $line =~ m/^\s*$/;
        push @cleanrecord, $line;
    }
    return @cleanrecord;
}

### PARSE ARTIFICIAL CHROMOSOMES ############################
# Takes a reference to the cleaned lines of one bac record.
# Returns ( type, ac_id, length, chr_num, chr_strt, chr_end,
#           bac_strt, bac_end, orient ); fields not found are undef.
# Captures are non-greedy so trailing whitespace is not stored.
sub ParseAChroms {
    my ($achromref) = @_;
    my ($type, $ac_id, $length, $chr_num, $chr_strt,
        $chr_end, $bac_strt, $bac_end, $orient);

    foreach my $line (@$achromref) {
        # Get type.  Right now, all are BAC's.
        $type     = $1 if $line =~ m/\b((?:B|Y)AC)\b/;
        # Get reference number.
        $ac_id    = $1 if $line =~ m/BAC:\s*(.*?)\s*$/;
        # Get artificial chromosome length.
        $length   = $1 if $line =~ m/LENGTH:\s*(.*?)\s*$/;
        # Get associated chromosome number.
        $chr_num  = $1 if $line =~ m/OSOME:\s*(.*?)\s*$/;
        # Get associated chromosome start and ending base.
        $chr_strt = $1 if $line =~ m/CHR_START:\s*(.*?)\s*$/;
        $chr_end  = $1 if $line =~ m/CHR_END:\s*(.*?)\s*$/;
        # Get artificial chromosome start and ending base.
        $bac_strt = $1 if $line =~ m/BAC_START:\s*(.*?)\s*$/;
        $bac_end  = $1 if $line =~ m/BAC_END:\s*(.*?)\s*$/;
        # Get associated chromosome's orientation on the ac (first
        # letter only).
        $orient   = $1 if $line =~ m/ORIENTATION:\s*(\w)\w*\s*$/;
    }

    return ( $type, $ac_id, $length, $chr_num, $chr_strt,
             $chr_end, $bac_strt, $bac_end, $orient );
}

########## PARSE GENES ##########################################
# Takes a reference to the cleaned lines of one gene record.
# Returns ( chrom_num, tigr_id, bac_locus, pseudogene, chromo_locus,
#           model_begin, model_end, orientation, read_frame,
#           num_exons, product, notes, bac ); fields not found are undef.
sub ParseGenes {
    my ($generef) = @_;
    my ($chrome_num, $bac, $tigr_id, $bac_locus, $chromo_locus,
        $model_begin, $model_end, $orient, $read_frame, $num_exons,
        $prod, $notes, $pseudo);

    foreach my $line (@$generef) {
        # Get chromosome number.
        $chrome_num   = $1 if $line =~ m/CHROMOSOME:\s*(\d*)\s*$/;
        # Get TIGR model's reference id.
        $tigr_id      = $1 if $line =~ m/MODEL_NAME:\s*(.*?)\s*$/;
        # Get the locus on the bac where the gene was cloned.
        $bac_locus    = $1 if $line =~ m/BAC_LOCUS:\s*(.*?)\s*$/;
        # Get full name.
        $prod         = $1 if $line =~ m/COMMON_NAME:\s*(.*?)\s*$/;
        $pseudo       = $1 if $line =~ m/PSEUDOGENE:\s*(\w*)\s*$/;
        # Get chromosome locus identifier.
        $chromo_locus = $1 if $line =~ m/CHROMO_LOCUS:\s*(.*?)\s*$/;
        # Get starting and ending place.
        $model_begin  = $1 if $line =~ m/MODEL_BEGIN:\s*(\d+)\s*$/;
        $model_end    = $1 if $line =~ m/MODEL_END:\s*(\d+)\s*$/;
        # Get whether forward or reverse, and the reading frame.
        if ( $line =~ m/ORIENTATION:\s*(\w)_(\d)\s*$/ ) {
            $orient     = $1;
            $read_frame = $2;
        }
        # Get number of exons in the gene.
        $num_exons    = $1 if $line =~ m/NUM_EXONS:\s*(\d+)\s*$/;
        # Get any notes attached to the gene.
        $notes        = $1 if $line =~ m/COMMENT:\s*(.*?)\s*$/;
        # Get the bac where the gene was cloned.
        $bac          = $1 if $line =~ m/BAC:\s*(.*?)\s*$/;
    }

    return ( $chrome_num, $tigr_id, $bac_locus, $pseudo, $chromo_locus,
             $model_begin, $model_end, $orient, $read_frame,
             $num_exons, $prod, $notes, $bac );
}

########## PARSE EXONS ##########################################
# Takes one whole-exon line.  Returns
# ( exon_num, type, tigr_id, locus_start, locus_end ).
# Each match is taken in list context, so a field whose pattern does
# not match comes back undef instead of reusing a stale $1.
sub ParseExon {
    my ($line) = @_;

    my ($exon_num) = $line =~ m/EXON\s(\d+)/;          # order number
    my ($type)     = $line =~ m/\((.*)\)/;             # inside the parens
    my ($tigr_id)  = $line =~ m/\b(\d*\.e\d*)\b/;      # TIGR model number
    my ($start)    = $line =~ m/pos:\s*\b(\d*)\b/;     # start position
    my ($end)      = $line =~ m/\b(\d*)\s*$/;          # end position

    return ( $exon_num, $type, $tigr_id, $start, $end );
}

########## PARSE SUB EXONS ##########################################
# Takes one sub-exon line.  Returns
# ( exon_num, sub_type, locus_start, locus_end ); a field whose
# pattern does not match is undef.
sub ParseSubExon {
    my ($line) = @_;

    my ($exon_num) = $line =~ m/EXON\s(\d+)/;          # parent exon number
    # Non-greedy so the type is stored without the blank before ": pos".
    my ($sub_type) = $line =~ m/\d\s*:\s*(.*?)\s*:\s*pos/;
    my ($start)    = $line =~ m/pos:\s*(\d*)\b/;       # start position
    my ($end)      = $line =~ m/\d-\s*(\d*)\s*$/;      # end position

    return ( $exon_num, $sub_type, $start, $end );
}