我正在使用C语言实现哈希表,并测试字符串的哈希函数。
首先,我尝试了将ASCII码相加并使用模运算(% 100
),但是在第一组包含130个单词的数据测试中,结果很差:40个冲突。
最终输入的数据将包含8000个单词(这是一个存储在文件中的字典)。哈希表声明为int table[10000]
,其中包含单词在.txt文件中的位置。
- 什么是最适合哈希字符串的算法?
- 如何确定哈希表的大小?
我正在使用C语言实现哈希表,并测试字符串的哈希函数。
首先,我尝试了将ASCII码相加并使用模运算(% 100
),但是在第一组包含130个单词的数据测试中,结果很差:40个冲突。
最终输入的数据将包含8000个单词(这是一个存储在文件中的字典)。哈希表声明为int table[10000]
,其中包含单词在.txt文件中的位置。
我已经使用Dan Bernstein的djb2
获得了不错的结果。
unsigned long
hash(unsigned char *str)
{
unsigned long hash = 5381;
int c;
while (c = *str++)
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
size_t
或其他类似的无符号值(例如此代码中的unsigned long)。调用者负责对结果取模以适应哈希表。调用者控制哈希到的表槽;而不是函数。它只返回一些无符号数字。 - WhozCraig首先,通常情况下你不会想要在哈希表中使用加密哈希。即使是按照加密标准非常快速的算法,在哈希表的标准下也仍旧是异常缓慢。
其次,你需要确保输入的每一位都能/将影响到结果。一种简单的方法是将当前结果向左循环移动一定数量的位数,然后将当前哈希码与当前字节进行异或运算。重复此过程,直到到达字符串的末尾。请注意,通常不要将旋转值设置为字节大小的偶数倍。
例如,假设按照8位字节的常见情况,你可以左移5位:
int hash(char const *input) {
int result = 0x55555555;
while (*input) {
result ^= *input++;
result = rol(result, 5);
}
}
编辑:还要注意,10000个槽位很少是哈希表大小的好选择。通常有两种情况:您要么需要一个质数作为大小(对某些类型的哈希解析确保正确性),要么需要2的幂次方(这样可以通过简单的位掩码将值缩小到正确的范围)。
static inline unsigned rol(unsigned r, int k) {return (r << k) | (r >> (32 - k));}
(假设是32位整数)。至少在x86-64上,GCC会将其编译为一条指令。 - vonbrandresult
只需要是非0的;1就可以了。好的,对输入进行异或运算比加法更好。使用 +=
来进行 ROL
操作比使用 =
更好。如果需要,ROL
可以是 SHL
。将5改为7可以减少冲突。两行最终处理器 h += h<<17; h ^= h>>12;
会稍微扩散一下,提高均匀性。因此,这可能是最好的最小32位哈希/校验和函数,只需8个指令,超过了类似的函数如 ELFHash/BKDRHash/DJB2。虽然在64位机器上不够优化。 - brycHash function | Collisions | Time (words) | Time (file) | Avalanching
===============================================================================
CRC32 | 23 (0.005%) | 96.1 ms | 120.9 ms | 99.8%
MurmurOAAT | 26 (0.006%) | 17.2 ms | 19.4 ms | 100.0%
FNV hash | 32 (0.007%) | 16.4 ms | 14.0 ms | 98.6%
Jenkins OAAT | 36 (0.008%) | 19.9 ms | 18.5 ms | 100.0%
DJB2 hash | 344 (0.074%) | 18.2 ms | 12.7 ms | 92.9%
K&R V2 | 356 (0.076%) | 18.0 ms | 11.9 ms | 92.8%
Coffin | 763 (0.164%) | 16.3 ms | 11.7 ms | 91.4%
x17 hash | 2242 (0.481%) | 18.6 ms | 20.0 ms | 91.5%
-------------------------------------------------------------------------------
large_prime | 25 (0.005%) | 16.5 ms | 18.6 ms | 96.6%
MurmurHash3_x86_32 | 19 (0.004%) | 20.5 ms | 4.4 ms | 100.0%
gcc -O2
编译):#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define MODE 1 // 1 = hash each word individually; 2 = hash the entire file
#define MAXLEN 50 // maximum length of a word
#define MAXWORDS 500000 // maximum number of words in a file
#define SEED 0x12345678
#if MODE == 1
char words[MAXWORDS][MAXLEN];
#else
char file[MAXWORDS * MAXLEN];
#endif
uint32_t DJB2_hash(const uint8_t *str)
{
uint32_t hash = 5381;
uint8_t c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
uint32_t FNV(const void* key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
h ^= 2166136261UL;
const uint8_t* data = (const uint8_t*)key;
for(int i = 0; data[i] != '\0'; i++)
{
h ^= data[i];
h *= 16777619;
}
return h;
}
uint32_t MurmurOAAT_32(const char* str, uint32_t h)
{
// One-byte-at-a-time hash based on Murmur's mix
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
for (; *str; ++str) {
h ^= *str;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint32_t KR_v2_hash(const char *s)
{
// Source: https://dev59.com/PWsz5IYBdhLWcg3wxq4V#45641002
// a.k.a. Java String hashCode()
uint32_t hashval = 0;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval;
}
uint32_t Jenkins_one_at_a_time_hash(const char *str)
{
uint32_t hash, i;
for (hash = i = 0; str[i] != '\0'; ++i)
{
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
uint32_t crc32b(const uint8_t *str) {
// Source: https://dev59.com/oGEi5IYBdhLWcg3wuuUk#21001712
unsigned int byte, crc, mask;
int i = 0, j;
crc = 0xFFFFFFFF;
while (str[i] != 0) {
byte = str[i];
crc = crc ^ byte;
for (j = 7; j >= 0; j--) {
mask = -(crc & 1);
crc = (crc >> 1) ^ (0xEDB88320 & mask);
}
i = i + 1;
}
return ~crc;
}
inline uint32_t _rotl32(uint32_t x, int32_t bits)
{
return x<<bits | x>>(32-bits); // C idiom: will be optimized to a single operation
}
uint32_t Coffin_hash(char const *input) {
// Source: https://dev59.com/PWsz5IYBdhLWcg3wxq4V#7666668
uint32_t result = 0x55555555;
while (*input) {
result ^= *input++;
result = _rotl32(result, 5);
}
return result;
}
uint32_t x17(const void * key, uint32_t h)
{
// See: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
const uint8_t * data = (const uint8_t*)key;
for (int i = 0; data[i] != '\0'; ++i)
{
h = 17 * h + (data[i] - ' ');
}
return h ^ (h >> 16);
}
uint32_t large_prime(const uint8_t *s, uint32_t hash)
{
// Better alternative to x33: multiply by a large co-prime instead of a small one
while (*s != '\0')
hash = 17000069*hash + *s++;
return hash;
}
void readfile(FILE* fp, char* buffer) {
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
rewind(fp);
fread(buffer, size, 1, fp);
buffer[size] = '\0';
}
struct timeval stop, start;
void timing_start() {
gettimeofday(&start, NULL);
}
void timing_end() {
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
}
uint32_t apply_hash(int hash, const char* line)
{
uint32_t result;
#if MODE == 2
timing_start();
#endif
switch (hash) {
case 1: result = crc32b((const uint8_t*)line); break;
case 2: result = large_prime((const uint8_t *)line, SEED); break;
case 3: result = MurmurOAAT_32(line, SEED); break;
case 4: result = FNV(line, SEED); break;
case 5: result = Jenkins_one_at_a_time_hash(line); break;
case 6: result = DJB2_hash((const uint8_t*)line); break;
case 7: result = KR_v2_hash(line); break;
case 8: result = Coffin_hash(line); break;
case 9: result = x17(line, SEED); break;
default: break;
}
#if MODE == 2
timing_end();
#endif
return result;
}
int main(int argc, char* argv[])
{
// Read arguments
const int hash_choice = atoi(argv[1]);
char const* const fname = argv[2];
printf("Algorithm: %d\n", hash_choice);
printf("File name: %s\n", fname);
// Open file
FILE* f = fopen(fname, "r");
#if MODE == 1
char line[MAXLEN];
size_t word_count = 0;
uint32_t hashes[MAXWORDS];
// Read file line by line
while (fgets(line, sizeof(line), f)) {
line[strcspn(line, "\n")] = '\0'; // strip newline
strcpy(words[word_count], line);
word_count++;
}
fclose(f);
// Calculate hashes
timing_start();
for (size_t i = 0; i < word_count; i++) {
hashes[i] = apply_hash(hash_choice, words[i]);
}
timing_end();
// Output hashes
for (size_t i = 0; i < word_count; i++) {
printf("%08x\n", hashes[i]);
}
#else
// Read entire file
readfile(f, file);
fclose(f);
printf("Len: %lu\n", strlen(file));
// Calculate hash
uint32_t hash = apply_hash(hash_choice, file);
printf("%08x\n", hash);
#endif
return 0;
}
维基百科介绍了一个称为 Jenkins One At A Time Hash 的不错的字符串哈希函数。它还引用了这个哈希函数的改进版本。
uint32_t jenkins_one_at_a_time_hash(char *key, size_t len)
{
uint32_t hash, i;
for(hash = i = 0; i < len; ++i)
{
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
djb2
算法很好虽然djb2
算法,正如cnicutar在stackoverflow上介绍的那样,几乎肯定更好些,但我认为值得展示一下K&R哈希函数:
unsigned long hash(unsigned char *str)
{
unsigned int hash = 0;
int c;
while (c = *str++)
hash += c;
return hash;
}
% HASHSIZE
from the return statement if you plan on doing the modulus sizing-to-your-array-length outside the hash algorithm. Also, I recommend you make the return and "hashval" type unsigned long
, or even better: uint32_t
or uint64_t
, instead of the simple unsigned
(int). This is a simple algorithm which takes into account byte order of each byte in the string by doing this style of algorithm: hashvalue = new_byte + 31*hashvalue
, for all bytes in the string:
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval % HASHSIZE;
}
// Implementation of Murmur hash for 32-bit size_t.
size_t _Hash_bytes(const void* ptr, size_t len, size_t seed)
{
const size_t m = 0x5bd1e995;
size_t hash = seed ^ len;
const char* buf = static_cast<const char*>(ptr);
// Mix 4 bytes at a time into the hash.
while (len >= 4)
{
size_t k = unaligned_load(buf);
k *= m;
k ^= k >> 24;
k *= m;
hash *= m;
hash ^= k;
buf += 4;
len -= 4;
}
// Handle the last few bytes of the input array.
switch (len)
{
case 3:
hash ^= static_cast<unsigned char>(buf[2]) << 16;
[[gnu::fallthrough]];
case 2:
hash ^= static_cast<unsigned char>(buf[1]) << 8;
[[gnu::fallthrough]];
case 1:
hash ^= static_cast<unsigned char>(buf[0]);
hash *= m;
};
// Do a few final mixes of the hash.
hash ^= hash >> 13;
hash *= m;
hash ^= hash >> 15;
return hash;
}
std::unordered_map<>
哈希表更好。不仅如此,Austin还将MurmerHash3发布到了公共领域。在这里查看我关于此问题的其他答案:C ++ std :: unordered_map中使用的默认哈希函数是什么?。
uint32_t inline MurmurOAAT32 ( const char * key)
{
uint32_t h(3323198485ul);
for (;*key;++key) {
h ^= *key;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint64_t inline MurmurOAAT64 ( const char * key)
{
uint64_t h(525201411107845655ull);
for (;*key;++key) {
h ^= *key;
h *= 0x5bd1e9955bd1e995;
h ^= h >> 47;
}
return h;
}
n - m * (1 - ((m-1)/m)^n) = 57.075...
。40 次碰撞比随机情况下预期的要好(在 p 得分为0.999时,为46到70)。所讨论的哈希函数比随机更均匀,或者我们正在目睹一个非常罕见的事件。 - Wolfgang Brehm我尝试了这些哈希函数,并得到了以下结果。我的数据有大约960^3个条目,每个条目长64字节,包含不同顺序的64个字符,哈希值为32位。代码来自这里。
Hash function | collision rate | how many minutes to finish
==============================================================
MurmurHash3 | 6.?% | 4m15s
Jenkins One.. | 6.1% | 6m54s
Bob, 1st in link | 6.16% | 5m34s
SuperFastHash | 10% | 4m58s
bernstein | 20% | 14s only finish 1/20
one_at_a_time | 6.16% | 7m5s
crc | 6.16% | 7m56s
有一件奇怪的事情是,几乎所有哈希函数对我的数据都有6%的冲突率。
14s only finish 1/20
是什么意思吗?我不理解这句话。 - Gabriel Staples我用过的一种有效方法是以下内容(我不知道它是否已经被提到,因为我记不起它的名字)。
您可以预先计算一个表T,其中包含密钥字母表[0,255]中每个字符的随机数。通过使用T[k0] xor T[k1] xor ... xor T[kN]来哈希您的密钥'k0 k1 k2 ... kN'。您可以轻松地证明这与您的随机数生成器一样随机,并且在计算上非常可行,如果您真的遇到了很多冲突的情况,您可以使用新的一批随机数重复整个过程。