这是我的测试结果。Phash使用Python中的imagehash库计算,并以两个BIGINTs的形式存储在数据库中。
在不使用分片的mariadb数据库中运行了858,433个图像的测试。我发现分片实际上会减慢进程,但是这是使用函数方法进行的,因此在没有它或较大的数据库上可能会有所不同。
这些正在运行的表是一个仅内存表。本地表被保存,并且在启动数据库时,id、phash1和phash2被复制到内存表中。一旦找到某些内容,就会返回id以匹配innodb表。
总图像数:858433
图像1:ece0455d6b8e9470
函数HAMMINGDISTANCE_16:
RETURN BIT_COUNT(A0 ^ B0) + BIT_COUNT(A1 ^ B1)
方法:HAMMINGDISTANCE_16 函数
查询:
SELECT `id` FROM `phashs` WHERE HAMMINGDISTANCE_16(filephash_1, filephash_2, CONV(SUBSTRING('ece0455d6b8e9470', 1, 8), 16, 10), CONV(SUBSTRING('ece0455d6b8e9470', 9, 8), 16, 10)) <= 3;
时间:2.1760秒
方法:BIT_COUNT
查询:
SELECT `id` FROM `phashs` WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('ece0455d6b8e9470', 1, 8), 16, 10)) + BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('ece0455d6b8e9470', 9, 8), 16, 10)) <= 3;
时间:0.1547秒
方法:多选BIT_COUNT内部是filephash_1
查询:
SELECT `id` FROM ( SELECT `id`, `filephash_2`, BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('ece0455d6b8e9470', 1, 8), 16, 10)) as BC0 FROM `phashs` WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('ece0455d6b8e9470', 1, 8), 16, 10)) <= 3 ) BCQ0 WHERE BC0 + BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('ece0455d6b8e9470', 9, 8), 16, 10)) <= 3;
时间:0.1878秒
方法:多选BIT_COUNT内部是filephash_2
查询:
SELECT `id` FROM (SELECT `id`, `filephash_1`, BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('ece0455d6b8e9470', 9, 8), 16, 10)) as BC1 FROM `phashs` WHERE BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('ece0455d6b8e9470', 9, 8), 16, 10)) <= 3) BCQ1 WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('ece0455d6b8e9470', 1, 8), 16, 10)) + BC1 <= 3;
时间:0.1860秒
图片2:813ed36913ec8639
函数HAMMINGDISTANCE_16:
RETURN BIT_COUNT(A0 ^ B0) + BIT_COUNT(A1 ^ B1)
方法:HAMMINGDISTANCE_16 函数
查询:
SELECT `id` FROM `phashs` WHERE HAMMINGDISTANCE_16(filephash_1, filephash_2, CONV(SUBSTRING('813ed36913ec8639', 1, 8), 16, 10), CONV(SUBSTRING('813ed36913ec8639', 9, 8), 16, 10)) <= 3;
时间:2.1440秒
方法:BIT_COUNT
查询:
SELECT `id` FROM `phashs` WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('813ed36913ec8639', 1, 8), 16, 10)) + BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('813ed36913ec8639', 9, 8), 16, 10)) <= 3;
时间:0.1588秒
方法:多选BIT_COUNT内的是filephash_1
查询:
SELECT `id` FROM ( SELECT `id`, `filephash_2`, BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('813ed36913ec8639', 1, 8), 16, 10)) as BC0 FROM `phashs` WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('813ed36913ec8639', 1, 8), 16, 10)) <= 3 ) BCQ0 WHERE BC0 + BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('813ed36913ec8639', 9, 8), 16, 10)) <= 3;
时间:0.1671秒
方法:多选BIT_COUNT内部是filephash_2
查询:
SELECT `id` FROM (SELECT `id`, `filephash_1`, BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('813ed36913ec8639', 9, 8), 16, 10)) as BC1 FROM `phashs` WHERE BIT_COUNT(filephash_2 ^ CONV(SUBSTRING('813ed36913ec8639', 9, 8), 16, 10)) <= 3) BCQ1 WHERE BIT_COUNT(filephash_1 ^ CONV(SUBSTRING('813ed36913ec8639', 1, 8), 16, 10)) + BC1 <= 3;
时间:0.1686秒