有没有现成的.NET算法,能够从预定义单词列表中检测出错别字?
例如,假设我的列表中有单词"Stuff",而有人输入了"Stuf"、"sutff"、"stff"或者"stiff",我希望能够建议这个人"Stuff"可能是正确的单词。
我说的不是语法检查或更复杂的问题,而只是某个字母缺失、被替换或顺序颠倒的情况。
我的目标是防止同一个单词以不同的打法出现在列表中。大小写对我来说并不是问题,因为所有字母都是小写。
有没有现成的.NET算法,能够从预定义单词列表中检测出错别字?
例如,假设我的列表中有单词"Stuff",而有人输入了"Stuf"、"sutff"、"stff"或者"stiff",我希望能够建议这个人"Stuff"可能是正确的单词。
我说的不是语法检查或更复杂的问题,而只是某个字母缺失、被替换或顺序颠倒的情况。
我的目标是防止同一个单词以不同的打法出现在列表中。大小写对我来说并不是问题,因为所有字母都是小写。
这里有一份用Python实现的分步指南,可以实现您想要的功能,其中还提供了C#及其他语言实现的链接。
下面是我用C#编写的实现,适用于长度不小于2的字符串。它检查两个单词在忽略换位、替换、删除(仅限删除后长度仍不小于2的单词)以及重复(同一字母连续多次出现)的情况下是否匹配。
/// <summary>
/// Determines whether two words can be considered the same word apart from
/// one simple typo.
/// </summary>
/// <remarks>
/// The words match when at least one of the following holds:
/// <list type="bullet">
/// <item>transposition: same length, and swapping one pair of adjacent
/// characters turns one word into the other ("thme" / "them");</item>
/// <item>substitution: same length, and at most one position differs
/// ("arithmatic" / "arithmetic"); identical words therefore match;</item>
/// <item>deletion: one word is the other with exactly one character
/// removed ("helo" / "hello");</item>
/// <item>repetition: the words are equal once every run of the same
/// consecutive character is collapsed to one ("tommorrow" / "tomorow").</item>
/// </list>
/// All comparisons are ordinal and case-sensitive. Characters are compared
/// directly, so regex metacharacters in the words have no special meaning.
/// </remarks>
/// <param name="word1">First word to compare.</param>
/// <param name="word2">Second word to compare.</param>
/// <returns>
/// <c>true</c> if the words match under one of the rules above;
/// <c>false</c> otherwise, or when either word is <c>null</c> or shorter
/// than two characters.
/// </returns>
public bool CheckWordsSameWithTypo(string word1, string word2)
{
    // A missing word cannot match anything; words shorter than two
    // characters are deliberately rejected.
    if (word1 == null || word2 == null)
    {
        return false;
    }

    if (word1.Length < 2 || word2.Length < 2)
    {
        return false;
    }

    return IsAdjacentTransposition(word1, word2)
        || IsSingleSubstitution(word1, word2)
        || IsSingleDeletion(word1, word2)
        || IsRepetitionVariant(word1, word2);
}

// True when the words have equal length and differ exactly by one swap of two adjacent, distinct characters.
private static bool IsAdjacentTransposition(string a, string b)
{
    if (a.Length != b.Length)
    {
        return false;
    }

    int first = 0;
    while (first < a.Length && a[first] == b[first])
    {
        first++;
    }

    // No mismatch at all, or a mismatch only in the last position:
    // there is no adjacent pair left that could have been swapped.
    if (first >= a.Length - 1)
    {
        return false;
    }

    if (a[first] != b[first + 1] || a[first + 1] != b[first])
    {
        return false;
    }

    // Everything after the swapped pair must be identical.
    for (int i = first + 2; i < a.Length; i++)
    {
        if (a[i] != b[i])
        {
            return false;
        }
    }

    return true;
}

// True when the words have equal length and differ in at most one position.
private static bool IsSingleSubstitution(string a, string b)
{
    // The length check is what keeps a short word from matching a
    // fragment in the middle of a longer one.
    if (a.Length != b.Length)
    {
        return false;
    }

    int mismatches = 0;
    for (int i = 0; i < a.Length; i++)
    {
        if (a[i] != b[i] && ++mismatches > 1)
        {
            return false;
        }
    }

    return true;
}

// True when removing exactly one character from the longer word yields the shorter word.
private static bool IsSingleDeletion(string a, string b)
{
    string longer = a.Length >= b.Length ? a : b;
    string shorter = a.Length >= b.Length ? b : a;
    if (longer.Length - shorter.Length != 1)
    {
        return false;
    }

    int i = 0;
    while (i < shorter.Length && longer[i] == shorter[i])
    {
        i++;
    }

    // longer[i] is the removed character; the remainder must line up
    // shifted by one position.
    for (; i < shorter.Length; i++)
    {
        if (longer[i + 1] != shorter[i])
        {
            return false;
        }
    }

    return true;
}

// True when the words are equal after collapsing runs of the same consecutive character.
private static bool IsRepetitionVariant(string a, string b)
{
    return CollapseRepeatedLetters(a) == CollapseRepeatedLetters(b);
}

// Returns the word with every run of the same consecutive character reduced to a single occurrence.
private static string CollapseRepeatedLetters(string word)
{
    var collapsed = new StringBuilder(word.Length);
    foreach (char letter in word)
    {
        if (collapsed.Length == 0 || collapsed[collapsed.Length - 1] != letter)
        {
            collapsed.Append(letter);
        }
    }

    return collapsed.ToString();
}