我在mysql数据库中有大约2000行数据。
每行最多包含300个字符,包含一到两个句子。
我使用mysql内置的全文搜索来搜索这些行。
如果可能的话,我想添加一个功能,以便纠正拼写错误和误拼字。
例如,如果有人在搜索框中键入“right shlder”,则执行搜索时应将其视为“right shoulder”。
您对添加此类功能的最简单方法有什么建议? 值得添加某种外部搜索引擎,如lucene吗? (对于如此小的数据集,似乎过于复杂。)还是有更简单的方法?
我在mysql数据库中有大约2000行数据。
每行最多包含300个字符,包含一到两个句子。
我使用mysql内置的全文搜索来搜索这些行。
如果可能的话,我想添加一个功能,以便纠正拼写错误和误拼字。
例如,如果有人在搜索框中键入“right shlder”,则执行搜索时应将其视为“right shoulder”。
您对添加此类功能的最简单方法有什么建议? 值得添加某种外部搜索引擎,如lucene吗? (对于如此小的数据集,似乎过于复杂。)还是有更简单的方法?
我认为你应该使用 SOUNDS LIKE 或 SOUNDEX()。
由于你的数据集非常小,一种解决方案是创建一个新表,用来存储每个文本字段中包含的单词或其 SOUNDEX 值,并在该表上使用 SOUNDS LIKE。
例如:
-- Return every row that owns at least one indexed word sounding like a search term.
-- `table` and `column` are placeholder names. Both are reserved words, so they must be
-- backtick-quoted for the statement to parse.
SELECT *
FROM `table`
WHERE id IN (
    SELECT refid
    FROM tableofwords
    WHERE `column` SOUNDS LIKE 'right'
       OR `column` SOUNDS LIKE 'shlder'
);
请参见:http://dev.mysql.com/doc/refman/5.0/en/string-functions.html
我相信无法使用通配符搜索字符串 :(
MySQL不支持全文索引中的SOUNDEX搜索。
如果你想要实现类似Lucene的框架,就意味着你需要将所有文档分割成单词,并为每个单词建立索引。
当某人搜索“right shlder”时,你需要在单词表中进行SOUNDEX搜索:
// Split the search phrase into words and build a query that finds every
// dictionary word whose SOUNDEX code matches one of them.
$search = 'right shlder';
$query = null; // stays null when the phrase contains no word characters
preg_match_all('(\w+)', $search, $matches);
if (!empty($matches[0])) {
    // soundex() output consists of letters and digits only, so inlining it is safe.
    // NOTE(review): PHP soundex() and MySQL SOUNDEX() are documented as different
    // Soundex variants (MySQL may also return more than 4 characters) - confirm the
    // codes agree for your vocabulary, or compute both sides inside MySQL.
    $sounds = array_map('soundex', $matches[0]);
    $query = 'SELECT word FROM words_list
WHERE SOUNDEX(word) IN(\''.join('\',\'',$sounds).'\')';
}
然后进行全文搜索:
// Full-text search for any of the dictionary words returned by the first query.
// $result is expected to be the array of words produced by that query.
// In boolean mode, space-separated words without operators are optional terms,
// which gives the intended "any of these words" semantics.
// NOTE(review): the words are inlined into the SQL string. Escape them with the
// escaping function of your connection, or bind the joined string as a parameter,
// if the word list can contain anything other than plain word characters.
$query2 = 'SELECT * FROM `table`
WHERE MATCH(fultextcolumn)
AGAINST (\''.join(' ', $result).'\' IN BOOLEAN MODE)';
$result是一个包含第一次查询结果的数组。
PHP实际上有两个内置函数可以实现这一功能,第一个是similar_text,另一个称为levenshtein,这应该可以帮助你解决问题。你需要测试一下它是否足够快速满足你的需求。
-- Score dictionary words against the misspelt term "catema". The LIKE prefix filter
-- narrows the candidates by first letter before the scoring function runs.
SELECT fuzzy_match_first_word('catema', `word`, 80)
FROM `dictionary`
WHERE (`word` LIKE 'c%')
  AND (fuzzy_match_first_word('catema', `word`, 80) >= 80);

-- Score whole subjects: fuzzy_match looks for the term anywhere inside `subject`.
SELECT fuzzy_match('catema', `subject`, 80)
FROM `dictionary`
WHERE (fuzzy_match('catema', `subject`, 80) >= 80);
存储函数:
-- fuzzy_match_first_word(str_needle, str_haystack, minimum_quality)
--
-- Scores how well str_needle matches the START of str_haystack, tolerating missing,
-- added or shifted letters.
--
-- Parameters:
--   str_needle      - the (possibly misspelt) search term
--   str_haystack    - the candidate text, only its leading characters are compared
--   minimum_quality - threshold in percent, scoring stops as soon as it is missed
-- Returns:
--   a score of at most 100 (100 = no penalty), or 0 when the score falls below
--   minimum_quality or when either string is empty or NULL.
--
-- Notes:
--   * The first character is never compared (the scan starts at position 2). The
--     caller is expected to pre-filter on it, e.g. with an indexed LIKE prefix.
--   * Written with WHILE loops and DROP + CREATE so that it loads on MySQL as well
--     as MariaDB.
--   * Letter comparison follows the collation of the arguments.
DELIMITER //
DROP FUNCTION IF EXISTS `fuzzy_match_first_word` //
CREATE FUNCTION `fuzzy_match_first_word`(
    `str_needle` VARCHAR(64),
    `str_haystack` VARCHAR(4096),
    `minimum_quality` INT
) RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE needle_len, haystack_len, probe_idx, cmp_len, penalty, shift_amount INT DEFAULT 0;
    DECLARE span_offset INT DEFAULT 0;
    -- How far to look left/right for a displaced letter. Min: 1, not sensible beyond 5.
    DECLARE check_span INT DEFAULT 2;
    DECLARE needle_char CHAR(1) DEFAULT ' ';
    DECLARE res INT DEFAULT 100;
    -- Start at the second letter, the first one is checked by the caller.
    DECLARE n INT DEFAULT 2;
    -- How many times the alignment between needle and haystack may still be moved.
    DECLARE shifts_left INT DEFAULT 4;

    -- COALESCE makes NULL arguments behave like empty strings instead of slipping
    -- through every comparison and yielding a perfect score.
    SET needle_len = COALESCE(CHAR_LENGTH(str_needle), 0);
    SET haystack_len = COALESCE(CHAR_LENGTH(str_haystack), 0);

    IF (needle_len < 1) OR (haystack_len < 1) THEN
        RETURN 0;
    END IF;

    IF haystack_len <= needle_len THEN
        SET cmp_len = haystack_len;
        -- 20 points penalty for each needle letter the haystack is too short to hold.
        SET res = res - (20 * (needle_len - haystack_len));
        IF res < minimum_quality THEN
            RETURN 0;
        END IF;
    ELSE
        SET cmp_len = needle_len;
    END IF;

    WHILE n <= cmp_len DO
        SET needle_char = SUBSTRING(str_needle, n, 1);
        IF SUBSTRING(str_haystack, n + shift_amount, 1) <> needle_char THEN
            fail_check:
            BEGIN
                -- 20 points for a letter that is not found at all,
                -- 5 points per position when it is found nearby.
                SET penalty = 20;
                SET span_offset = 1;
                WHILE span_offset <= check_span DO
                    -- Look ahead first: a missing letter is assumed to be more
                    -- likely than an added one.
                    SET probe_idx = n + span_offset;
                    IF (probe_idx > 0) AND (probe_idx <= haystack_len) THEN
                        IF SUBSTRING(str_haystack, probe_idx + shift_amount, 1) = needle_char THEN
                            SET penalty = 5 * span_offset;
                            IF shifts_left > 0 THEN
                                SET shifts_left = shifts_left - 1;
                                SET shift_amount = shift_amount + span_offset;
                            END IF;
                            LEAVE fail_check;
                        END IF;
                    END IF;
                    -- Then look behind.
                    SET probe_idx = n - span_offset;
                    IF (probe_idx > 0) AND (probe_idx <= haystack_len) THEN
                        IF SUBSTRING(str_haystack, probe_idx + shift_amount, 1) = needle_char THEN
                            SET penalty = 5 * span_offset;
                            IF shifts_left > 0 THEN
                                SET shifts_left = shifts_left - 1;
                                SET shift_amount = shift_amount - span_offset;
                            END IF;
                            LEAVE fail_check;
                        END IF;
                    END IF;
                    SET span_offset = span_offset + 1;
                END WHILE;
            END fail_check;
            SET res = res - penalty;
            -- Bail out early once the threshold can no longer be met.
            IF res < minimum_quality THEN
                RETURN 0;
            END IF;
        END IF;
        SET n = n + 1;
    END WHILE;

    RETURN res;
END //
DELIMITER ;
-- fuzzy_match(str_needle, str_haystack, minimum_quality)
--
-- Scores how well str_needle matches ANYWHERE inside str_haystack. Both strings are
-- upper-cased and stripped of spaces first. Every occurrence of the first needle
-- letter in the haystack is tried as a starting point and the best score wins.
--
-- Parameters:
--   str_needle      - the (possibly misspelt) search term
--   str_haystack    - the text to search in
--   minimum_quality - threshold in percent
-- Returns:
--   the best score in percent, or 0 when it is below minimum_quality or when either
--   string is empty or NULL. The score may exceed 100 when the needle repeats a
--   letter, because a repeated letter found nearby earns additional points.
--
-- Notes:
--   * Per-candidate state (n, loop_abort, previous letter) is reset for every
--     starting point and the maximum score is kept. Without that, a second
--     occurrence of the first letter would wipe out a good earlier score.
--   * Written with WHILE loops and DROP + CREATE so that it loads on MySQL as well
--     as MariaDB.
DELIMITER //
DROP FUNCTION IF EXISTS fuzzy_match //
CREATE FUNCTION fuzzy_match(str_needle VARCHAR(64), str_haystack VARCHAR(4096), minimum_quality INT)
RETURNS INT DETERMINISTIC CONTAINS SQL
BEGIN
    DECLARE needle_len, haystack_len, start_idx, probe_idx, cmp_len, loop_abort, n, span_offset INT DEFAULT 0;
    -- How far to look left/right for a displaced letter. Min: 1, not sensible beyond 5.
    DECLARE check_span INT DEFAULT 2;
    DECLARE window_str VARCHAR(4096);
    DECLARE first_char, needle_char, prev_needle_char CHAR(1) DEFAULT ' ';
    DECLARE res, candidate_res, rmatch_score, minq FLOAT DEFAULT 0;

    SET str_needle = UPPER(REPLACE(TRIM(str_needle), ' ', ''));
    SET str_haystack = UPPER(REPLACE(TRIM(str_haystack), ' ', ''));
    SET needle_len = CHAR_LENGTH(str_needle);
    SET haystack_len = CHAR_LENGTH(str_haystack);
    SET minq = (minimum_quality / 100.0);
    SET first_char = LEFT(str_needle, 1);

    IF (needle_len > 0) AND (haystack_len > 0) THEN
        REPEAT
            -- Next occurrence of the first needle letter, 0 when there is none left.
            SET start_idx = COALESCE(LOCATE(first_char, str_haystack, start_idx + 1), 0);
            IF start_idx > 0 THEN
                SET window_str = SUBSTRING(str_haystack, start_idx, needle_len + 1);
                SET cmp_len = LEAST(CHAR_LENGTH(window_str), needle_len);

                -- Fresh scoring state for this starting point.
                SET rmatch_score = 0;
                SET n = 0;
                SET loop_abort = 0;
                SET prev_needle_char = ' ';

                WHILE (loop_abort = 0) AND (n < cmp_len) DO
                    SET n = n + 1;
                    SET needle_char = SUBSTRING(str_needle, n, 1);
                    IF prev_needle_char <> needle_char THEN
                        SET prev_needle_char = needle_char;
                        -- Award check_span + 1 points for a hit in place and one
                        -- point less for every position the hit is off by.
                        SET span_offset = -check_span;
                        WHILE span_offset <= check_span DO
                            SET probe_idx = n + span_offset - 1;
                            IF (probe_idx >= 0) AND (probe_idx < cmp_len) THEN
                                IF needle_char = SUBSTRING(window_str, probe_idx + 1, 1) THEN
                                    SET rmatch_score = rmatch_score + (check_span + 1 - ABS(span_offset));
                                END IF;
                            END IF;
                            SET span_offset = span_offset + 1;
                        END WHILE;
                        -- Give up on this starting point once the running average
                        -- drops below the threshold.
                        SET loop_abort = ((rmatch_score / (check_span * n)) < minq);
                    ELSE
                        -- Same letter as the previous one: flat bonus, no window scan.
                        SET rmatch_score = rmatch_score + check_span;
                    END IF;
                END WHILE;

                SET candidate_res = rmatch_score / ((check_span + 1) * needle_len);
                IF candidate_res > res THEN
                    SET res = candidate_res;
                END IF;
            END IF;
        UNTIL (start_idx <= 0) OR (res >= 1) END REPEAT;
    END IF;

    RETURN (res >= minq) * ROUND(res * 100);
END //
DELIMITER ;