我认为可以通过在其中一个字符串(例如“conair”)上应用
最长公共子串/子序列算法,并将另一个字符串附加到其本身一次(例如“aircon”->“airconaircon”)来轻松解决此问题。
以下是C语言示例代码:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
size_t llcs(const char* s1, const char* s2)
{
size_t len[3];
if (*s1 == '\0' || *s2 == '\0') return 0;
len[0] = (*s1 == *s2) + llcs(s1 + 1, s2 + 1);
len[1] = llcs(s1 + 1, s2);
len[2] = llcs(s1, s2 + 1);
if (len[0] < len[1]) len[0] = len[1];
if (len[0] < len[2]) len[0] = len[2];
return len[0];
}
double similarity(const char* s1, const char* s2)
{
size_t s1len = strlen(s1);
size_t s2len = strlen(s2);
double sim;
if (s1len == 0 && s2len == 0)
{
sim = 1;
}
else
{
size_t len;
char* s1s1 = malloc(s1len * 2 + 1);
strcpy(s1s1, s1);
strcpy(s1s1 + s1len, s1);
len = llcs(s1s1, s2);
if (len > s1len) len = s1len;
len *= 2;
if (len == s1len + s2len && strcmp(s1, s2) != 0) len--;
sim = (double)len / (s1len + s2len);
free(s1s1);
}
return sim;
}
int main(int argc, char** argv)
{
if (argc == 3)
printf("Similarity of \"%s\" and \"%s\" is %.2f%%\n",
argv[1], argv[2], 100 * similarity(argv[1], argv[2]));
else
printf("Usage:\n %s string1 string2\n",
argv[0]);
return 0;
}
样本输出:
Similarity of "123" and "123" is 100.00%
Similarity of "123" and "1234" is 85.71%
Similarity of "0123" and "123" is 85.71%
Similarity of "a" and "aa" is 66.67%
Similarity of "aa" and "a" is 66.67%
Similarity of "aaaaaaa" and "aaaaaa" is 92.31%
Similarity of "aaaaaa" and "aaaaaaa" is 92.31%
Similarity of "aircon" and "conair" is 91.67%
Similarity of "spit" and "pits" is 87.50%
Similarity of "pits" and "spit" is 87.50%
Similarity of "spits" and "pits" is 88.89%
Similarity of "pits" and "spits" is 88.89%