Hi amitgsir
I don't want to assume what your criteria are for the UMIs that make up a cluster; could you expand on what you mean by:
2. Column 3 must also be similar, i.e. in each cluster lines with similar UMI allowing 2 mismatch will be clustered together
We need a bit more to go on:
- What do the [up to 3] UMIs need to have in common to be considered similar?
- Why is TCACGGTG in the first cluster instead of TCAAAATG?
If it's all the same and it doesn't matter what the UMIs are, as long as you have them in sets of three, you could try this:
#!perl -slw
# Reads whitespace-separated records from the DATA section, groups them by
# their first column and then by their third column (the UMI), and prints
# them back out with a delimiter line before every group of up to
# $UMIS_PER_CLUSTER distinct UMIs.
#
# The -l switch on the #! line makes every print end with a newline, which
# is why no "\n" appears in the print statements below.
use strict;
use warnings;

my $DELIMITER        = 'CLUSTER';    # line printed at the start of each cluster
my $UMIS_PER_CLUSTER = 3;            # distinct UMIs per cluster

# $chromosomes->{first column}{UMI} = [ [second column, fourth column], ... ]
# Start from an empty hash so that empty input simply prints nothing.
my $chromosomes = {};

while ( my $line = <DATA> ) {
    # split ' ' skips leading whitespace and treats the line ending
    # (LF or CRLF) as whitespace, so no separate chomp is needed.
    my @record = split ' ', $line;

    # Ignore blank or incomplete lines instead of storing undef keys.
    next if @record < 4;

    push @{ $chromosomes->{ $record[0] }{ $record[2] } },
        [ $record[1], $record[3] ];
}

for my $chrM ( sort keys %{$chromosomes} ) {
    my $umi_count = 0;    # UMIs printed so far for this first-column value

    for my $UMI ( sort keys %{ $chromosomes->{$chrM} } ) {
        # A new cluster starts at UMI number 0, 3, 6, ...
        print $DELIMITER if $umi_count++ % $UMIS_PER_CLUSTER == 0;

        # Within one UMI, order numerically by the second column and then
        # by the fourth column.
        my @rows = sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
            @{ $chromosomes->{$chrM}{$UMI} };

        print "$chrM\t$_->[0]\t$UMI\t$_->[1]" for @rows;
    }
}
__DATA__
chrM:307 0 AGCGGGGA 129
chrM:307 0 AGCGGGGA 130
chrM:307 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 130
chrM:308 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 130
chrM:309 0 AGCGGGGA 129
chrM:307 0 TCAAAATG 130
chrM:308 0 TCAAAATG 130
chrM:309 0 TCAAAATG 130
chrM:307 0 TCACGGTG 130
chrM:308 0 TCACGGTG 130
chrM:309 0 TCACGGTG 130
chrM:307 0 TCAGCCTG 129
chrM:308 0 TCAGCCTG 129
chrM:309 0 TCAGCCTG 129
chrM:307 0 TCAGGGAG 130
chrM:308 0 TCAGGGAG 130
chrM:309 0 TCAGGGAG 130
chrM:307 1 TCAGGGTG 106
chrM:307 2 TCAGGGTG 130
chrM:307 2 TCAGGGTG 129
chrM:308 1 TCAGGGTG 106
chrM:308 2 TCAGGGTG 130
chrM:308 2 TCAGGGTG 129
chrM:309 1 TCAGGGTG 106
chrM:309 2 TCAGGGTG 130
chrM:309 2 TCAGGGTG 129
Output
C:\code\perlmonks>perl pm_1198131.pl
CLUSTER
chrM:307 0 AGCGGGGA 129
chrM:307 0 AGCGGGGA 129
chrM:307 0 AGCGGGGA 130
chrM:307 0 TCAAAATG 130
chrM:307 0 TCACGGTG 130
CLUSTER
chrM:307 0 TCAGCCTG 129
chrM:307 0 TCAGGGAG 130
chrM:307 1 TCAGGGTG 106
chrM:307 2 TCAGGGTG 129
chrM:307 2 TCAGGGTG 130
CLUSTER
chrM:308 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 129
chrM:308 0 AGCGGGGA 130
chrM:308 0 TCAAAATG 130
chrM:308 0 TCACGGTG 130
CLUSTER
chrM:308 0 TCAGCCTG 129
chrM:308 0 TCAGGGAG 130
chrM:308 1 TCAGGGTG 106
chrM:308 2 TCAGGGTG 129
chrM:308 2 TCAGGGTG 130
CLUSTER
chrM:309 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 129
chrM:309 0 AGCGGGGA 130
chrM:309 0 TCAAAATG 130
chrM:309 0 TCACGGTG 130
CLUSTER
chrM:309 0 TCAGCCTG 129
chrM:309 0 TCAGGGAG 130
chrM:309 1 TCAGGGTG 106
chrM:309 2 TCAGGGTG 129
chrM:309 2 TCAGGGTG 130
Cheers
Shadowsong