#!/usr/bin/perl
use strict;
use warnings;

# Scan a FASTA file for 21-base windows that end in "GG" and print those
# whose final 12 bases occur in exactly one such window in the whole file.
# (Presumably these are CRISPR guide candidates with an NGG PAM and a
# 12-base seed -- the script itself only encodes the lengths and the "GG".)
#
# Usage: script.pl [fasta_file]
#   fasta_file defaults to dmel-all-chromosome-r6.18.fasta
#
# Output: two lines per hit, "crispr_<n>" followed by the 21-mer.

use constant {
    DEFAULT_FASTA => 'dmel-all-chromosome-r6.18.fasta',
    KMER_LENGTH   => 21,      # full window that is reported
    SEED_LENGTH   => 12,      # trailing part that must be unique
    PAM_SUFFIX    => 'GG',    # required last bases of the window
};

# Run only when executed as a script, so the helpers below can be loaded
# (e.g. by a test) without starting a genome scan.
main(@ARGV) unless caller;

# main($fasta_file)
# Reads every record of the FASTA file, tallies candidate windows and prints
# the unique ones. Dies (via Bio::SeqIO) if the file cannot be read.
sub main {
    my ($fasta_file) = @_;
    $fasta_file = DEFAULT_FASTA unless defined $fasta_file;

    # Loaded at run time so the pure k-mer helpers stay usable on machines
    # without BioPerl installed.
    require Bio::SeqIO;
    my $in = Bio::SeqIO->new(
        -file   => $fasta_file,
        -format => 'fasta',
    );

    # Each record is scanned on its own. Joining all records into one string
    # would invent windows that straddle two records and exist in neither.
    my %seen;
    while ( my $seq = $in->next_seq() ) {
        record_candidates( $seq->seq, \%seen );
    }

    my $crispr = 0;
    for my $kmer ( unique_candidates( \%seen ) ) {
        print "crispr_", ++$crispr, "\n";
        print $kmer, "\n";
    }
    return;
}

# record_candidates($sequence, \%seen)
# Adds every KMER_LENGTH window of $sequence that ends in PAM_SUFFIX to
# %seen, keyed by the window's last SEED_LENGTH bases:
#   $seen{$seed}{count}  number of windows seen with that seed
#   $seen{$seed}{kmer}   the most recent window with that seed
# Sequences that are undef or shorter than KMER_LENGTH contribute nothing.
# Returns the same hash reference.
sub record_candidates {
    my ( $sequence, $seen ) = @_;
    return $seen unless defined $sequence;

    # Soft-masked FASTA uses lower case for the same bases; fold case so such
    # regions are neither skipped nor counted as distinct seeds.
    my $bases      = uc $sequence;
    my $pam_offset = KMER_LENGTH - length(PAM_SUFFIX);
    my $last_start = length($bases) - KMER_LENGTH;    # negative => empty range

    for my $start ( 0 .. $last_start ) {

        # Test the two trailing bases before copying the whole window.
        next unless substr( $bases, $start + $pam_offset, length(PAM_SUFFIX) ) eq PAM_SUFFIX;

        my $kmer = substr $bases, $start, KMER_LENGTH;
        my $seed = substr $kmer, KMER_LENGTH - SEED_LENGTH;
        $seen->{$seed}{count}++;
        $seen->{$seed}{kmer} = $kmer;
    }
    return $seen;
}

# unique_candidates(\%seen)
# Returns the stored windows whose seed was seen exactly once, ordered by
# seed. Sorting keeps the crispr_<n> numbering identical from run to run;
# plain hash order changes between Perl processes.
sub unique_candidates {
    my ($seen) = @_;
    return map  { $seen->{$_}{kmer} }
           grep { $seen->{$_}{count} == 1 }
           sort keys %$seen;
}

__END__

*** output

Sample from a run of the original, unsorted version of this script. The
crispr_<n> numbers below follow that run's hash order, so they will not line
up with the sorted numbering printed now, and the total may differ slightly
because windows spanning two FASTA records are no longer counted.

crispr_1
TTTAGACTCCCCTTGTACAGG
crispr_2
TCTTCAGTCTCCAGTCTCCGG
crispr_3
TTGCGTTGCGGAGCATACTGG
crispr_4
TGCCACCAGTGGTTCCAAGGG
crispr_5
TTATGTTTGTACGAGGGGGGG
crispr_6
TCTCTTTGGTTTACGGATGGG
crispr_7
TTGGCAAGGAGACGGTCCTGG
crispr_8
TGAATTAAAGCTTGCGCGAGG
crispr_9
GGAAGAGGCATCAACGAGGGG
crispr_10
TGCAGCGGCCTAACAAGGCGG
crispr_11
CTGCCCGATCCTAACTCCAGG
crispr_12
ATATATGTTTGACCGTCGGGG
...
crispr_126892
TTGCTTGGCACTAAGGCGGGG
crispr_126893
CACCAAAAAGGACTTGCGTGG
crispr_126894
GTGCCCCTCACTCATGCGGGG
crispr_126895
TAAAAAGCGACGCAGTATTGG
crispr_126896
CCGGATTTCTTCGTACAGGGG
crispr_126897
GGTGGCTATGCTATGGTACGG
crispr_126898
CTGCGTTGATGTTAGGTAGGG
crispr_126899
GCTGGGACCCGAATACGTAGG