Just thought I'd finish off the code by actually returning the hash back to Perl...
#!/usr/bin/perl
use Inline C;
use Benchmark;
# Strictures are lexically scoped, so they apply from here to the end of the
# Perl part of the file.
use strict;
use warnings;

# Benchmark input: "atgcgc" repeated 500,000 times (3 million characters).
my $gen = "atgcgc" x 500000;

# Receives the hash reference returned by the Inline C implementation.
my $h_ref;

# Candidate implementations, keyed by the label Benchmark prints for each.
my %tests = (
    "inline"      => sub { $h_ref = string_inline_c($gen, length($gen)) },
    "hash_string" => sub { hash_string($gen) },
);

# Run every candidate twice and print the timings.
timethese(2, \%tests);
# Count every overlapping two-character substring of $genome in pure Perl.
#
# Takes the sequence as a single string argument and returns a reference to
# a hash mapping each two-character substring that occurs to its number of
# occurrences. A string shorter than two characters yields an empty hash.
sub hash_string {
    my ($genome) = @_;
    my %count;

    # The last pair starts at offset length-2, hence the upper bound.
    $count{ substr($genome, $_, 2) }++ for 0 .. length($genome) - 2;

    # Hand the counts back instead of discarding them, matching the
    # hash reference returned by string_inline_c.
    return \%count;
}
__END__
__C__
SV* string_inline_c(char *genome, int len)
{
int i;
int hash[96];
HV* perl_hash=newHV();
/* The hashing function is simply 4*(first char - 'a') + second ch
+ar - 'a' */
/* i.e. the bucket for gg is 4*('g'-'a')+'g'-'a' = 30 */
/*initialize our 'C' hash buckets which will get used*/
/*aa*/ /*ac*/ /*ag*/ /*at*/
hash[ 0] = hash[ 2] = hash[ 6] = hash[19] = 0;
/*ca*/ /*cc*/ /*cg*/ /*ct*/
hash[ 8] = hash[10] = hash[14] = hash[27] = 0;
/*ga*/ /*gc*/ /*gg*/ /*gt*/
hash[24] = hash[26] = hash[30] = hash[43] = 0;
/*ta*/ /*tc*/ /*tg*/ /*tt*/
hash[76] = hash[78] = hash[82] = hash[95] = 0;
for(i=0;i<len-1;i++)
{
hash[4*(genome[i]-'a')+(genome[i+1]-'a')]++;
}
/*move our values over from the 'C' hash to the perl hash*/
#define h(c,i) (hv_store(perl_hash, (c), sizeof((c))-1, newSViv(hash[(
+i)]), 0))
h("aa", 0); h("ac", 2); h("ag", 6); h("at",19);
h("ca", 8); h("cc",10); h("cg",14); h("ct",27);
h("ga",24); h("gc",26); h("gg",30); h("gt",43);
h("ta",76); h("tc",78); h("tg",82); h("tt",95);
return newRV_noinc((SV*) perl_hash); /*return a ref to a hash*/
}
|