Tinkering today I wrote a quick toy to generate strings of unique, overlapping kmers. Not particularly efficient, but possibly handy nonetheless.
It takes a given k size, a configurable overlap and optionally the bases to use. First it generates a list of all the kmers then it recursively scans for matching overlapping kmers and extends a seed, terminating the recursion and printing if all kmers have been used.
Run it like so:
./kmer-overlap -k=3 -overlap=2 ACTG
#!/usr/local/bin/perl
#########
# Author: rmp
# Created: 2013-05-15
# Last Modified: $Date$
# Id: $Id$
# HeadURL: $HeadURL$
#
use strict;
use warnings;
use Getopt::Long;
use Readonly;
use English qw(-no_match_vars);
Readonly::Scalar our $DEFAULT_K => 3;
Readonly::Scalar our $DEFAULT_BASES => [qw(A C T G)];
my $opts = {};
GetOptions($opts, qw(k=s rand help));
if($opts->{help}) {
print < <"EOT"; $PROGRAM_NAME - rmp 2013-05-15 Usage: $PROGRAM_NAME -k=3 -overlap=2 -rand ACTG EOT exit; } my $k = $opts->{k} || $DEFAULT_K;
my $overlap = $opts->{overlap} || $k-1;
my $bases = $DEFAULT_BASES;
if(scalar @ARGV) {
$bases = [grep { $_ } map {split //smx} @ARGV];
}
#########
# Build all available kmers
#
my $kmers = [];
for my $base1 (@{$bases}) {
build($base1, $bases, $kmers);
}
#########
# optionally randomise the seeds
#
if($opts->{rand}) {
shuffle($kmers);
}
#########
# start with a seed
#
for my $seed (@{$kmers}) {
my $seen = {
$seed => 1,
};
solve($seed, $seen);
}
sub build {
my ($seq, $bases, $kmers) = @_;
if(length $seq == $k) {
#########
# reached target k - store & terminate
#
push @{$kmers}, $seq;
return 1;
}
for my $base (@{$bases}) {
########
# extend and descend
#
build("$seq$base", $bases, $kmers);
}
return;
}
sub solve {
my ($seq_in, $seen) = @_;
if(scalar keys %{$seen} == scalar @{$kmers}) {
#########
# exhausted all kmers - completed!
#
print $seq_in, "\n";
return 1;
}
my $seq_tail = substr $seq_in, -$overlap, $overlap;
my $overlapping = [grep { $_ =~ /^$seq_tail/smx } # filter in only seqs which overlap the seed tail
grep { !$seen->{$_} } # filter out kmers we've seen already
@{$kmers}];
if(!scalar @{$overlapping}) {
#########
# no available overlapping kmers - terminate!
#
return;
}
if($opts->{rand}) {
shuffle($overlapping);
}
my $overhang = $k-$overlap;
for my $overlap_seq (@{$overlapping}) {
#########
# extend and descend
#
my $seq_out = $seq_in . substr $overlap_seq, -$overhang, $overhang;
solve($seq_out, {%{$seen}, $overlap_seq => 1});
}
return;
}
sub shuffle {
my ($arr_in) = @_;
for my $i (0..scalar @{$arr_in}-1) {
my $j = int rand $i;
($arr_in->[$i], $arr_in->[$j]) = ($arr_in->[$j], $arr_in->[$i]);
}
}
Output looks like this:
epiphyte:~ rmp$ ./kmer-overlap -k=2 AC
AACCA
ACCAA
CAACC
CCAAC