我认为这是一个有趣的问题,因此我编写了一个基于“折叠”的程序,它从不同的“折叠点”向外扫描可能的对称匹配。如果N是核苷酸数量,M是“maxInterval-minInterval”,则运行时间应该为O(N*M)。我可能会错过一些边界情况,所以请小心使用代码,但它确实适用于提供的示例。请注意,我使用填充的中间缓冲区存储基因组,因为这减少了内部循环中所需的边界情况比较次数;这通过额外的内存分配来换取更好的速度。如果您进行任何更正或改进,请随意编辑帖子。
class Program
{
public sealed class Pairing
{
public int Index { get; private set; }
public int Length { get; private set; }
public int Offset { get; private set; }
public Pairing(int index, int length, int offset)
{
Index = index;
Length = length;
Offset = offset;
}
}
public static IEnumerable<Pairing> FindPairings(string genome, int minLen, int maxLen, int intervalMinLen, int intervalMaxLen)
{
int n = genome.Length;
var padding = new string((char)0, maxLen);
var padded = string.Concat(padding, genome, padding);
int start = (intervalMinLen + minLen)/2 + maxLen;
int end = n - (intervalMinLen + minLen)/2 + maxLen;
for (int i=start; i<end; i++)
{
int k = (intervalMinLen+2)/2;
int maxK = (intervalMaxLen + 2)/2;
while (k<=maxK)
{
int matchLength = 0;
while (IsPaired(padded[i - k], padded[i + k]) && (k <= (maxK+maxLen)))
{
matchLength++;
if (matchLength >= minLen && matchLength <= maxLen)
{
yield return new Pairing(i-k - maxLen, matchLength, 2*k - (matchLength-1));
}
k++;
}
k++;
}
k = (intervalMinLen+1)/2;
while (k <= maxK)
{
int matchLength = 0;
while (IsPaired(padded[i - (k+1)], padded[i + k]) && (k<=maxK+maxLen))
{
matchLength++;
if (matchLength >= minLen && matchLength <= maxLen)
{
yield return new Pairing(i - (k+1) - maxLen, matchLength, 2*k + 1 - (matchLength-1));
}
k++;
}
k++;
}
}
}
private const int SumAT = 'A' + 'T';
private const int SumGC = 'G' + 'C';
private static bool IsPaired(char a, char b)
{
return (a + b) == SumAT || (a + b) == SumGC;
}
static void Main(string[] args)
{
string genome = "ATCAGGACCATACGCCTGAT";
foreach (var pairing in FindPairings(genome, 4, 5, 9, 10))
{
Console.WriteLine("'{0}' pair with '{1}'",
genome.Substring(pairing.Index, pairing.Length),
genome.Substring(pairing.Index + pairing.Offset, pairing.Length));
}
Console.ReadKey();
}
}