biograd's scratchpad

Public Scratchpad

#This is an excerpt of the input text file:

CHROMOSOME:     1
BAC:            F7G19
MODEL_NAME:     68275.m00846
BAC_LOCUS:      F7G19_2
CHROMO_LOCUS:   At1g09100
COMMON_NAME:    putative 26S protease regulatory subunit 6A
COMMENT:        Similar to probable Mg-dependent ATPase (pir S56671). ESTs gb|T46782,gb|AA04798 come from this gene
PSEUDOGENE:     NO
MODEL_BEGIN:    2939304
MODEL_END:      2936520
ORIENTATION:    R_0
NUM_EXONS:      10
.....EXON 1 (COMB_EXON):        68275.e04959   pos:   2939304  -  2939146
          EXON 1 : LEFT_UTR    :   pos: 2939304  -  2939263
          EXON 1 : CDS         :   pos: 2939262  -  2939146
.....EXON 2 (CDS)      :        68275.e04958   pos:   2938801  -  2938685
.....EXON 3 (CDS)      :        68275.e04957   pos:   2938445  -  2938338
.....EXON 4 (CDS)      :        68275.e04956   pos:   2938219  -  2938028
.....EXON 5 (CDS)      :        68275.e04955   pos:   2937929  -  2937738
.....EXON 6 (CDS)      :        68275.e04954   pos:   2937641  -  2937523
.....EXON 7 (CDS)      :        68275.e04953   pos:   2937445  -  2937358
.....EXON 8 (CDS)      :        68275.e04952   pos:   2937276  -  2937157
.....EXON 9 (CDS)      :        68275.e04951   pos:   2936976  -  2936836
.....EXON 10 (COMB_EXON):       68275.e04950   pos:   2936756  -  2936520
          EXON 10 : CDS         :   pos: 2936756  -  2936679
          EXON 10 : RIGHT_UTR   :   pos: 2936678  -  2936520
-----
CHROMOSOME:     1
BAC:            F7G19
MODEL_NAME:     68275.m00847
BAC_LOCUS:      F7G19_1
CHROMO_LOCUS:   At1g09130
COMMON_NAME:    unknown protein
COMMENT:        Similar to ATP-dependent Clp protease (gb D90915). EST gb|N6546$
PSEUDOGENE:     NO
MODEL_BEGIN:    2942221
MODEL_END:      2940342
ORIENTATION:    R_1
NUM_EXONS:      8
.....EXON 1 (CDS)      :        68275.e04967   pos:   2942221  -  2941913
.....EXON 2 (CDS)      :        68275.e04966   pos:   2941652  -  2941551
.....EXON 3 (CDS)      :        68275.e04965   pos:   2941477  -  2941355
.....EXON 4 (CDS)      :        68275.e04964   pos:   2941276  -  2941217
.....EXON 5 (CDS)      :        68275.e04963   pos:   2941058  -  2940962
.....EXON 6 (CDS)      :        68275.e04962   pos:   2940874  -  2940801
.....EXON 7 (CDS)      :        68275.e04961   pos:   2940580  -  2940527
.....EXON 8 (CDS)      :        68275.e04960   pos:   2940452  -  2940342
-----
CHROMOSOME:     1
BAC:            T12M4
MODEL_NAME:     68275.m00848
BAC_LOCUS:      T12M4_19
CHROMO_LOCUS:   At1g09140
COMMON_NAME:    putative SF2 ASF splicing modulator, Srp30
COMMENT:        similar to GB:CAB42558
PSEUDOGENE:     NO
MODEL_BEGIN:    2945944
MODEL_END:      2943169
ORIENTATION:    R_2
NUM_EXONS:      11
.....EXON 1 (COMB_EXON):        68275.e04978   pos:   2945944  -  2945736
          EXON 1 : LEFT_UTR    :   pos: 2945944  -  2945823
          EXON 1 : CDS         :   pos: 2945822  -  2945736
.....EXON 2 (CDS)      :        68275.e04977   pos:   2945612  -  2945547
.....EXON 3 (CDS)      :        68275.e04976   pos:   2945460  -  2945386
.....EXON 4 (CDS)      :        68275.e04975   pos:   2945306  -  2945204
.....EXON 5 (CDS)      :        68275.e04974   pos:   2945118  -  2945072
.....EXON 6 (CDS)      :        68275.e04973   pos:   2944964  -  2944907
.....EXON 7 (CDS)      :        68275.e04972   pos:   2944783  -  2944734
.....EXON 8 (CDS)      :        68275.e04971   pos:   2944590  -  2944534
.....EXON 9 (CDS)      :        68275.e04970   pos:   2944460  -  2944382
.....EXON 10 (CDS)      :       68275.e04969   pos:   2944250  -  2944173
.....EXON 11 (CDS)      :       68275.e04968   pos:   2943230  -  2943169
-----
[download]

Below is the loading program... the database has separate tables for genes, artificial chromosome data ("achroms" - this is OK, so I didn't include the input format for it), exons, subexons, and a relateGAC table that holds the possible many-to-many relationships between genes and artificial chromosomes. There are over 26,000 gene records in the gene text file. There are more in the next update that TIGR put out recently. How can this script be optimized for speed? It took a colossal 17 hrs to run this weekend, but it works. It just gets slower with each gene/exon/subexon record. A lot of the documentation is meant for my non-programmer-type committee members.

#!/usr/bin/perl -w

################################################################
# Programmer:   biograd
# Program:      TIGRFillTables.plx
# Date begun:   8/17/01
# Description:  This is a perl script made to automatically take
#               information from previously generated files
#               containing _Arabidopsis thaliana_ genomic data
#               from the TIGR database and place it into a local
#               mysql database. Since it is intended for automation,
#               a user name and password are hard-coded.
# Input:        -Files formatted by Dr. Johns, such as
#                /arabidopsis/bac_info_08102001.txt  and
#                /arabidopsis/gene_info_08102001.txt
# Effect:       -Successful: information will be parsed from the
#                files and loaded into correct database areas.
#                
# Outline:      I.   CONNECT
#               II.  ACHROMS
#                    A. Prepare statements to be used
#                    B. Loop through each bacfile record
#                       1. CleanData()
#                       2. ParseAChroms()
#                       3. Check for matching record in db
#                          a. If match, then update record
#                          b. If no match, insert record
#               III. GENES, RELATEGAC, EXONS
#                    A. Prepare statements for filling:
#                       1. Genes
#                       2. RelateGAC (genes to artificial chroms)
#                       3. Exons
#                    B. Loop through each genefile record
#                       1. CleanData()
#                       2. ParseGenes()
#                       3. Check for matching gene record in db
#                          a. If match, then update gene  
#                          b. If no match, insert gene info
#                       4. Check if an AC is listed in the gene info
#                          a. If yes, check relateGAC table for an
#                             exact match of gene with this AC.
#                             1) If match, increment count by 1
#                             2) If no match, insert relation into
#                                relateGAC table.
#                       5. ParseExon()
#                       6. Loop through dereferenced exon arrays
#                         a.  Check for matching exon record in db
#                              1) If match, update exon
#                              2) If no match, insert exon info
#                         b.  Check if sub_exon data is there
#                              1) If match, update sub-exon
#                              2) If no match, insert sub_exon
#               IV.  DISCONNECT
#                    A. Finish statements
#                    B. Disconnect from database
#               V.   SUBROUTINE DEFINITIONS
#                    A. CleanData()
#                       1. Removes empty lines and newlines
#                       2. Returns array of good lines
#                    B. ParseAChroms()
#                       1. Searches for information
#                       2. Plugs information into an array
#                       3. Returns array
#                    C. ParseGenes()
#                       1. Searches for information
#                       2. Plugs information into an array
#                       3. Returns array
#                    D. ParseExons()
#                       1. Searches for each exon's info
#                       2. Forms array for each exon
#                       3. Returns array of array references
################################################################

use strict;
use DBI;

# Script-wide working variables, grouped by the section that uses them.

# Database connection and the running record counter.
my ($dbh, $n);

# Statement handles for the achroms table.
my ($sth_achroms, $sth_match_achrom, $sth_update_achroms);

# Statement handles for the genes table.
my ($sth_genes, $sth_match_gene, $sth_update_genes, $sth_select_gene_id);

# Statement handles for the relateGAC table.
my ($sth_rgac, $sth_find_match, $sth_getcount, $sth_updatecount);

# Statement handles for the exons and subexons tables.
my ($sth_find_exon, $sth_find_subexon, $sth_insert_exon,
    $sth_insert_subexon, $sth_update_exon, $sth_update_subexon);

# Per-record scratch data.
my ($record, $line, $temp, $geneid, $current_bac);
my (@record, @newrecord, @gene, @achrom, @rgac, @temp, @count, @exondata);

$n = 0;

### CONNECT ######################################################

  # RaiseError makes every later DBI call die on failure, so the
  # rest of the script does not test individual return values.
  my %connect_options = ( PrintError => 0, RaiseError => 1 );

  $dbh = DBI->connect( "DBI:mysql:arabidopsis", "", "", \%connect_options )
         or die "\nCouldn't connect to database: $DBI::errstr\n";


### PREPARE SQL FOR ACHROMS TABLE ####################################

  # Each statement is prepared once, before any data is read, and then
  # executed many times inside the loading loop. Every question mark is
  # a placeholder that receives one bound value per execution.

  # Insert a brand-new artificial chromosome record.
  $sth_achroms = $dbh->prepare( q{
                INSERT INTO achroms (type, ac_id, length,
                chr_num, chr_strt, chr_end, bac_strt, bac_end,
                orient) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?,
                ?) } );

  # Look up an artificial chromosome by its id to learn whether it is
  # already stored.
  $sth_match_achrom = $dbh->prepare( q{
                SELECT ac_id
                FROM achroms
                WHERE ac_id = ? } );

  # Refresh every data column of an artificial chromosome that is
  # already stored, in case the input file carries new or changed data.
  $sth_update_achroms = $dbh->prepare( q{
                UPDATE achroms
                SET type = ?, length = ?, chr_num = ?, chr_strt = ?,
                chr_end = ?, bac_strt = ?, bac_end = ?, orient = ?
                WHERE ac_id = ? } );


### OPEN ACHROMS DATAFILE ###########################################

  # "bac.ht" is a small test file holding the first and last 130 lines
  # of "bac_info_08102001.txt". To load the full data set, point
  # $bacfile at "bac_info_08102001.txt" instead.
  my $bacfile = "bac.ht";
  open (INFILE, "<", $bacfile)
        or die "Couldn't open bacfile $bacfile: $!";   # $! holds the OS error

### FILL ACHROMS TABLE ###########################################

  {
    # Read whole records separated by five dashes instead of single
    # lines. The change is confined to this block so that no other
    # file read is affected by it.
    local $/ = "-----";

    while ( defined( $record = <INFILE> ) )
    {
      # Print out a counter to show how far the filling has progressed.
#      print "$n\n";
#      $n++;

      # Drop blank lines; an empty result means an empty record, which
      # must not produce a null row in the database.
      @newrecord = CleanData( $record );
      next unless @newrecord;

      # Elements are ( type, ac_id, length, chr_num, chr_strt, chr_end,
      # bac_strt, bac_end, orient ).
      @achrom = ParseAChroms( \@newrecord );

      # Without an id there is nothing to match or update on, and an
      # insert would only add a row that can never be found again.
      unless ( defined $achrom[1] )
      {
        warn "Skipped a bacfile record that has no BAC id\n";
        next;
      }

      $sth_match_achrom->execute( $achrom[1] );
      my ($existing_id) = $sth_match_achrom->fetchrow_array();
      $sth_match_achrom->finish();

      if ( defined $existing_id )       # if there's a match, update data
      {
        # Same values as the insert, with ac_id moved last to fill the
        # WHERE clause.
        $sth_update_achroms->execute( @achrom[ 0, 2 .. 8, 1 ] );
      }
      else                              # if no match, insert data
      {
        $sth_achroms->execute( @achrom[ 0 .. 8 ] );
      }
    }
  }
  close INFILE;

### END ACHROMS TABLE FILL #########################################

### PREPARE SQL FOR GENES, RELATEGAC, AND EXONS TABLES ################

  # Question marks are where data will be "filled in" or bound during
  # each execution of these statements in the loop.
  #
  # NOTE(review): the loading loop runs several of these SELECTs for
  # every gene, exon and sub-exon. They stay fast only if the columns
  # in their WHERE clauses are indexed; the table definitions are not
  # visible here, so confirm that indexes exist on genes.tigr_id,
  # genes.chrom_locus, relateGAC(gene_id, ac_id),
  # exons(gene_id, exon_num) and subexons(gene_id, exon_num, sub_type).

  # GENES Select the gene_id stored for the current TIGR model name.
  # If a row comes back, the UPDATE statement needs to be used instead
  # of the INSERT.
  $sth_match_gene = $dbh -> prepare("
                SELECT gene_id
                FROM genes
                WHERE tigr_id = ? ");

  # GENES This statement puts new data into a new record.
  $sth_genes = $dbh -> prepare( "
                INSERT INTO genes
                (chrom_num, tigr_id, bac_locus, pseudogene,
                chrom_locus, locus_start, locus_end, orientation,
                read_frame, exon_count, product, notes, ac_id)
                VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" );

  # GENES This is the update statement to be used when a gene is
  # already listed but new data about it might be provided.
  $sth_update_genes = $dbh -> prepare("
                UPDATE IGNORE genes
                SET chrom_num = ?, bac_locus = ?, pseudogene = ?,
                chrom_locus = ?, locus_start = ?, locus_end = ?,
                orientation = ?, read_frame = ?, exon_count = ?,
                product = ?, notes = ?, ac_id = ?
                WHERE tigr_id = ? ");

  # GENES Find the database's own gene number for a gene, looked up by
  # its chromosome locus name.
  $sth_select_gene_id = $dbh -> prepare( "
                SELECT gene_id
                FROM genes
                WHERE chrom_locus = ? ");

  # RELATEGAC This statement finds whether a relationship between the
  # bound gene and the bound artificial chromosome is already stored.
  $sth_find_match = $dbh -> prepare("
                SELECT gene_id, ac_id FROM relateGAC
                WHERE gene_id = ?  AND ac_id = ? ");

  # RELATEGAC This statement inserts a gene number given by the
  # database together with the name of an artificial chromosome.
  # The "count" field is to be increased any time a duplicate
  # relationship is found. For the insert it is always "1".
  $sth_rgac = $dbh -> prepare( "
                INSERT INTO relateGAC (gene_id, ac_id, count)
                VALUES ( ?, ?, 1 ) ");

  # RELATEGAC This statement selects the count stored for the bound
  # gene id number.
  # NOTE(review): the lookup is keyed on gene_id alone, so for a gene
  # related to more than one artificial chromosome it does not say
  # which relation's count is returned - confirm whether ac_id should
  # be part of the WHERE clause.
  $sth_getcount = $dbh -> prepare("
                SELECT count FROM relateGAC
                WHERE gene_id = ? ");

  # RELATEGAC This statement sets the count field to the bound integer
  # for the bound gene id number.
  # NOTE(review): keyed on gene_id alone as well, so every relateGAC
  # row of that gene receives the new count.
  $sth_updatecount = $dbh -> prepare("
                UPDATE relateGAC
                SET count = ?   
                WHERE gene_id = ? ");

  # EXONS

  # Find whether a given exon of a given gene is already stored.
  $sth_find_exon = $dbh -> prepare("
                SELECT gene_id, exon_num
                FROM exons   
                WHERE gene_id = ? AND exon_num = ? ");

  # Find whether a given sub-exon (identified by gene, exon number and
  # sub-exon type) is already stored.
  $sth_find_subexon = $dbh -> prepare("
                SELECT sub_type, sub_locus_start
                FROM subexons
                WHERE gene_id = ? AND exon_num = ?
                                  AND sub_type = ? ");

  # Refresh the data of an exon that is already stored.
  $sth_update_exon = $dbh ->prepare("
                UPDATE IGNORE exons
                SET type = ?, tigr_id = ?, locus_start = ?,
                    locus_end = ?
                WHERE gene_id = ? AND exon_num = ?");

  # Refresh the positions of a sub-exon that is already stored.
  $sth_update_subexon = $dbh -> prepare("
                UPDATE IGNORE subexons  
                SET sub_locus_start = ?,
                    sub_locus_end = ?
                WHERE gene_id = ? AND exon_num = ?
                                  AND sub_type = ? ");

  # Store a new exon.
  $sth_insert_exon = $dbh -> prepare("
                INSERT INTO exons
                (gene_id, exon_num, type, tigr_id,
                 locus_start, locus_end)
                VALUES ( ?,?,?,?,?,? ) ");

  # Store a new sub-exon.
  $sth_insert_subexon = $dbh -> prepare("
                INSERT INTO subexons
                (gene_id, exon_num, sub_type,
                 sub_locus_start, sub_locus_end)
                VALUES ( ?,?,?,?,? ) "); 


### OPEN GENE DATAFILE  #########################################

  # "genes.ht" is the small test file; it has the same record layout
  # as the bac files above. To load the full data set, point $genefile
  # at "gene_info_08102001.txt" instead.
  my $genefile = "genes.ht";
  open (INFILE, "<", $genefile)
        or die "Couldn't open infile $genefile: $!";   # $! holds the OS error

### FILL GENES, RELATEGAC AND EXONS TABLES ###################

  # Reset the counter to zero so we can see how many gene records
  # are processed.
  $n = 0;

  {
    # Records are separated by five dashes; the change to the input
    # separator is confined to this block.
    local $/ = "-----";

    # Adds one to the count of a single gene/artificial-chromosome
    # relation. Both keys are bound so that other relations of the
    # same gene keep their own count.
    my $sth_bump_count = $dbh->prepare("
                UPDATE relateGAC
                SET `count` = `count` + 1
                WHERE gene_id = ? AND ac_id = ? ");

    RECORD:
    while ( defined( $record = <INFILE> ) )
    {
      print "\nz$n\n";
      $n++;

      # Remove blank lines. An empty result means an empty record,
      # which must not create null rows in any table.
      @newrecord = CleanData( $record );
      next RECORD unless @newrecord;

  ### GENES ##################################################

      # Elements are ( chrom_num, tigr_id, bac_locus, pseudogene,
      # chrom_locus, locus_start, locus_end, orientation, read_frame,
      # exon_count, product, notes, bac ).
      @gene = ParseGenes( \@newrecord );

      # The TIGR model name is the key for the match below; a record
      # without one cannot be stored in a way that is found again.
      unless ( defined $gene[1] )
      {
        warn "Gene record $n has no MODEL_NAME and was skipped\n";
        next RECORD;
      }

      $sth_match_gene->execute( $gene[1] );
      my ($known_gene_id) = $sth_match_gene->fetchrow_array();
      $sth_match_gene->finish();

      if ( defined $known_gene_id )
      {
        # Already listed: update every column except the TIGR model
        # name, which moves last to fill the WHERE clause.
        $sth_update_genes->execute( @gene[ 0, 2 .. 12, 1 ] );
      }
      else
      {
        $sth_genes->execute( @gene[ 0 .. 12 ] );
      }

      # Resolve the database's gene number for this record by its
      # locus name. This is done for every record, not only for those
      # that name a BAC, because the exon rows below need it too.
      $geneid = undef;
      if ( defined $gene[4] )
      {
        $sth_select_gene_id->execute( $gene[4] );
        ($geneid) = $sth_select_gene_id->fetchrow_array();
        $sth_select_gene_id->finish();
      }

      # Fall back to the row just matched or just inserted.
      # NOTE(review): mysql_insertid is only meaningful if gene_id is
      # an AUTO_INCREMENT column - confirm against the table definition.
      unless ( defined $geneid )
      {
        $geneid = defined $known_gene_id
                    ? $known_gene_id
                    : ( $sth_genes->{mysql_insertid} || undef );
      }

      unless ( defined $geneid )
      {
        warn "No gene_id found for model $gene[1]; "
           . "its BAC relation and exons were skipped\n";
        next RECORD;
      }

  ### RELATEGAC ##############################################

      # If there is a bac associated with this gene, record the
      # association. The gene[12] element is the bac name.
      if ( $gene[12] )
      {
        $current_bac = $gene[12];

        # execute() dies on failure because the connection was opened
        # with RaiseError, so its return value needs no test here.
        $sth_find_match->execute( $geneid, $current_bac );
        my ($related_gene) = $sth_find_match->fetchrow_array();
        $sth_find_match->finish();

        if ( defined $related_gene )
        {
          # Relation already stored: count the repeat instead of
          # inserting another row.
          $sth_bump_count->execute( $geneid, $current_bac );
          print "Gene id number is $geneid.\n"
              . "Repeated relation to $current_bac was counted.\n";
        }
        else
        {
          # New relation; the insert statement stores a count of 1.
          # $geneid is used rather than the last insert id, which
          # belongs to some earlier gene whenever this gene was
          # updated instead of inserted.
          $sth_rgac->execute( $geneid, $current_bac );
        }
      }

  ### EXONS AND SUBEXONS #####################################

      foreach my $exon_line ( @newrecord )
      {
        if ( $exon_line =~ m/^\.{5}EXON\b/ )    # line contains exon data
        {
          # Elements are ( exon_num, type, tigr_id, locus_start,
          # locus_end ).
          @exondata = ParseExon( $exon_line );
          next unless defined $exondata[0];

          $sth_find_exon->execute( $geneid, $exondata[0] );
          my ($stored_exon) = $sth_find_exon->fetchrow_array();
          $sth_find_exon->finish();

          if ( defined $stored_exon )
          {
            $sth_update_exon->execute( @exondata[ 1 .. 4 ],
                                       $geneid, $exondata[0] );
          }
          else
          {
            $sth_insert_exon->execute( $geneid, @exondata[ 0 .. 4 ] );
          }
        }
        elsif ( $exon_line =~ m/^\s*EXON\s+\d+\s*:/ )   # sub-exon data
        {
          # Elements are ( exon_num, type, locus_start, locus_end ).
          @exondata = ParseSubExon( $exon_line );
          next unless defined $exondata[0];

          $sth_find_subexon->execute( $geneid, @exondata[ 0, 1 ] );
          my ($stored_subexon) = $sth_find_subexon->fetchrow_array();
          $sth_find_subexon->finish();

          if ( defined $stored_subexon )
          {
            $sth_update_subexon->execute( @exondata[ 2, 3 ],
                                          $geneid, @exondata[ 0, 1 ] );
          }
          else
          {
            $sth_insert_subexon->execute( $geneid, @exondata[ 0 .. 3 ] );
          }
        }
      } # End foreach block for @newrecord
    } # End this record's entire while loop
  }

  close INFILE;

### END EXONS TABLE FILL #########################################

### DISCONNECT NICELY ############################################

  # Explicitly finish the SELECT and UPDATE handles before
  # disconnecting. Some of them may never have been executed during
  # this run; finishing them here keeps the disconnect below from
  # reporting an error about statements that are still open.
  foreach my $handle ( $sth_match_achrom,   $sth_update_achroms,
                       $sth_match_gene,     $sth_update_genes,
                       $sth_find_match,     $sth_getcount,
                       $sth_updatecount,    $sth_select_gene_id,
                       $sth_find_exon,      $sth_find_subexon )
  {
    $handle->finish();
  }

  $dbh->disconnect
        or die "Couldn't disconnect from database: $DBI::errstr\n";
  exit;


### SUBROUTINES START HERE #######################################

sub CleanData
{
    # Purpose: split one raw record into its lines and drop the lines
    #          that carry no data.
    # Input:   $_[0] - the record text, possibly still ending in the
    #          five-dash record separator.
    # Returns: the list of non-blank lines, in their original order.
    #          An undefined or empty record gives an empty list.
    my $record = defined $_[0] ? $_[0] : '';
    my @cleanrecord;

    ### Separate the record by newlines.
    foreach my $line ( split /\n/, $record )
    {
      # Remove the record separator explicitly. Relying on chomp for
      # this only works while the global input separator happens to be
      # set to five dashes at the moment of the call.
      $line =~ s/-----\z//;

      ### Remove empty lines if they exist.
      next if ( $line =~ m/^\s*$/ );
      push @cleanrecord, $line;
    }
    return ( @cleanrecord );
}

### PARSE ARTIFICIAL CHROMOSOMES ############################
sub ParseAChroms
{
  # Purpose: pull the artificial chromosome fields out of one cleaned
  #          record.
  # Input:   $_[0] - reference to an array of record lines.
  # Returns: a nine element list
  #          ( type, ac_id, length, chr_num, chr_strt, chr_end,
  #            bac_strt, bac_end, orient ).
  #          A field that is not found in the record is undef.
  # Values are returned without trailing whitespace.
  my $achromref = $_[0] || [];

  my ($type, $ac_id, $length, $chr_num,
      $chr_strt, $chr_end, $bac_strt, $bac_end, $orient);

  foreach my $line (@$achromref)
  {
    # The tests are independent on purpose: a "BAC:" line supplies
    # both the type and the reference number.

    # Get type. Right now, all are BAC's.
    if ($line =~ m/\b([BY]AC)\b/)
    {
      $type = $1;
    }

    # Get reference number.
    if ($line =~ m/BAC:\s*(.*?)\s*$/)
    {
      $ac_id = $1;
    }

    # Get artificial chromosome length
    if ($line =~ m/LENGTH:\s*(.*?)\s*$/)
    {
      $length = $1;
    }

    # Get associated chromosome number
    if ($line =~ m/OSOME:\s*(.*?)\s*$/)
    {
      $chr_num = $1;
    }

    # Get associated chromosome start base.
    if ($line =~ m/CHR_START:\s*(.*?)\s*$/)
    {
      $chr_strt = $1;
    }

    # Get associated chromosome ending base.
    if ($line =~ m/CHR_END:\s*(.*?)\s*$/)
    {
      $chr_end = $1;
    }

    # Get artificial chromosome start base.
    if ($line =~ m/BAC_START:\s*(.*?)\s*$/)
    {
      $bac_strt = $1;
    }

    # Get artificial chromosome ending base.
    if ($line =~ m/BAC_END:\s*(.*?)\s*$/)
    {
      $bac_end = $1;
    }

    # Get associate chromosome's orientation on the ac; only the
    # first letter of the value is kept.
    if ($line =~ m/ORIENTATION:\s*(\w)\w*\s*$/)
    {
      $orient = $1;
    }
  }

  my @achrom = ( $type, $ac_id, $length, $chr_num,
                 $chr_strt, $chr_end, $bac_strt, $bac_end, $orient );

  return @achrom;
}

########## PARSE GENES ##########################################
sub ParseGenes
{
  # Purpose: pull the gene level fields out of one cleaned gene record.
  # Input:   $_[0] - reference to an array of record lines.
  # Returns: a thirteen element list
  #          ( chrom_num, tigr_id, bac_locus, pseudogene, chromo_locus,
  #            model_begin, model_end, orient, read_frame, num_exons,
  #            product, notes, bac ).
  #          A field that is not found in the record is undef.
  # Field names are only recognised at the start of a line, so text
  # inside a COMMENT or COMMON_NAME value cannot be mistaken for
  # another field. Values are returned without trailing whitespace.
  my $generef = $_[0] || [];

  my ( $chrome_num, $bac, $tigr_id,
       $bac_locus, $chromo_locus, $model_begin, $model_end,
       $orient, $read_frame, $num_exons, $prod, $notes, $pseudo );

  foreach my $line (@$generef)
  {
    # Get chromosome number
    if ($line =~ m/^\s*CHROMOSOME:\s*(\d*)\s*$/)
    {
      $chrome_num = $1;
    }
    # Get TIGR model's reference id
    elsif ($line =~ m/^\s*MODEL_NAME:\s*(.*?)\s*$/)
    {
      $tigr_id = $1;
    }
    # Get the locus on the bac where the gene was cloned.
    elsif ($line =~ m/^\s*BAC_LOCUS:\s*(.*?)\s*$/)
    {
      $bac_locus = $1;
    }
    # Get full name
    elsif ($line =~ m/^\s*COMMON_NAME:\s*(.*?)\s*$/)
    {
      $prod = $1;
    }
    # Get the pseudogene flag
    elsif ($line =~ m/^\s*PSEUDOGENE:\s*(\w*)\s*$/)
    {
      $pseudo = $1;
    }
    # Get chromosome locus identifier
    elsif ($line =~ m/^\s*CHROMO_LOCUS:\s*(.*?)\s*$/)
    {
      $chromo_locus = $1;
    }
    # Get starting place
    elsif ($line =~ m/^\s*MODEL_BEGIN:\s*(\d+)\s*$/)
    {
      $model_begin = $1;
    }
    # Get ending place
    elsif ($line =~ m/^\s*MODEL_END:\s*(\d+)\s*$/)
    {
      $model_end = $1;
    }
    # Get the orientation letter and the reading frame digit
    elsif ($line =~ m/^\s*ORIENTATION:\s*(\w)_(\d)\s*$/)
    {
      $orient = $1;
      $read_frame = $2;
    }
    # Get number of exons in the gene
    elsif ($line =~ m/^\s*NUM_EXONS:\s*(\d+)\s*$/)
    {
      $num_exons = $1;
    }
    # Get any notes attached to the gene
    elsif ($line =~ m/^\s*COMMENT:\s*(.*?)\s*$/)
    {
      $notes = $1;
    }
    # Get the bac where the gene was cloned.
    elsif ($line =~ m/^\s*BAC:\s*(.*?)\s*$/)
    {
      $bac = $1;
    }
  }

  my @gene = ( $chrome_num, $tigr_id, $bac_locus, $pseudo, $chromo_locus,
               $model_begin, $model_end, $orient, $read_frame, $num_exons,
               $prod, $notes, $bac );
  return @gene;
}

########## PARSE EXONS ##########################################

sub ParseExon
{
  # Purpose: pull the fields out of one exon line, e.g.
  #   .....EXON 1 (COMB_EXON):  68275.e04959  pos:  2939304  -  2939146
  # Input:   $_[0] - the exon line.
  # Returns: a five element list
  #          ( exon_num, type, tigr_id, locus_start, locus_end ).
  #          A field that is not found on the line is undef.
  # Each match is assigned in list context, so a pattern that fails
  # yields undef instead of the value captured by the pattern before it.
  my $line = defined $_[0] ? $_[0] : '';

  # Get exon's order number.
  my ($exon_num) = $line =~ m/EXON\s(\d+)/;

  # Get the exon type from inside the parentheses
  my ($type) = $line =~ m/\(([^)]*)\)/;

  # Get TIGR's model number
  my ($tigr_id) = $line =~ m/\b(\d+\.e\d+)\b/;

  # Get start position
  my ($locus_start) = $line =~ m/pos:\s*(\d+)/;

  # Get end position: the number that closes the line
  my ($locus_end) = $line =~ m/\b(\d+)\s*$/;

  my @exon = ( $exon_num, $type, $tigr_id, $locus_start, $locus_end );
  return (@exon);
}

########## PARSE SUB EXONS ##########################################

sub ParseSubExon
{
  # Purpose: pull the fields out of one sub-exon line, e.g.
  #             EXON 1 : LEFT_UTR    :   pos: 2939304  -  2939263
  # Input:   $_[0] - the sub-exon line.
  # Returns: a four element list
  #          ( exon_num, sub_type, locus_start, locus_end ).
  #          A field that is not found on the line is undef.
  # Each match is assigned in list context, so a pattern that fails
  # yields undef instead of the value captured by the pattern before it.
  my $line = defined $_[0] ? $_[0] : '';

  # Get parent exon's order number.
  my ($exon_num) = $line =~ m/EXON\s(\d+)/;

  # Get the subexon type, without the padding that follows it
  my ($sub_type) = $line =~ m/\d\s*:\s*(.*?)\s*:\s*pos/;

  # Get start position
  my ($locus_start) = $line =~ m/pos:\s*(\d+)/;

  # Get end position. The dash is surrounded by spaces in the data,
  # so whitespace has to be allowed on both sides of it.
  my ($locus_end) = $line =~ m/\d\s*-\s*(\d+)\s*$/;

  my @subexon = ( $exon_num, $sub_type, $locus_start, $locus_end );
  return ( @subexon );

}
[download]