确保HashSet中的哈希均匀分布,它是如何工作的?

6

以下是《Java编程入门》(Liang)中的一个例子:

import java.util.LinkedList;

public class MyHashSet<E> implements MySet<E> {
  // Define the default hash table size. Must be a power of 2
  private static int DEFAULT_INITIAL_CAPACITY = 16;

  // Define the maximum hash table size. 1 << 30 is same as 2^30
  private static int MAXIMUM_CAPACITY = 1 << 30; 

  // Current hash table capacity. Capacity is a power of 2
  private int capacity;

  // Define default load factor
  private static float DEFAULT_MAX_LOAD_FACTOR = 0.75f; 

  // Specify a load factor threshold used in the hash table
  private float loadFactorThreshold; 

  // The number of entries in the set
  private int size = 0; 

  // Hash table is an array with each cell that is a linked list
  private LinkedList<E>[] table;

  /** Construct a set with the default capacity and load factor */
  public MyHashSet() {  
    this(DEFAULT_INITIAL_CAPACITY, DEFAULT_MAX_LOAD_FACTOR);    
  }

  /** Construct a set with the specified initial capacity and 
   * default load factor */
  public MyHashSet(int initialCapacity) { 
    this(initialCapacity, DEFAULT_MAX_LOAD_FACTOR);    
  }

  /** Construct a set with the specified initial capacity 
   * and load factor */
  public MyHashSet(int initialCapacity, float loadFactorThreshold) { 
    if (initialCapacity > MAXIMUM_CAPACITY)
      this.capacity = MAXIMUM_CAPACITY;
    else
      this.capacity = trimToPowerOf2(initialCapacity);

    this.loadFactorThreshold = loadFactorThreshold;    
    table = new LinkedList[capacity];
  }

  /** Remove all elements from this set */ 
  public void clear() {
    size = 0;
    removeElements();
  }

  /** Return true if the element is in the set */
  public boolean contains(E e) {
    int bucketIndex = hash(e.hashCode());
    if (table[bucketIndex] != null) {
      LinkedList<E> bucket = table[bucketIndex]; 
      for (E element: bucket)
        if (element.equals(e)) 
          return true;
    }

    return false;
  }

  /** Add an element to the set */
  public boolean add(E e) {
    if (contains(e)) 
      return false;

    if (size > capacity * loadFactorThreshold) {
      if (capacity == MAXIMUM_CAPACITY)
        throw new RuntimeException("Exceeding maximum capacity");

      rehash();
    }

    int bucketIndex = hash(e.hashCode());

    // Create a linked list for the bucket if it is not created
    if (table[bucketIndex] == null) {
      table[bucketIndex] = new LinkedList<E>();
    }

    // Add e to hashTable[index]
    table[bucketIndex].add(e);

    size++; // Increase size

    return true;
  }

  /** Remove the element from the set */
  public boolean remove(E e) {
    if (!contains(e))
      return false;

    int bucketIndex = hash(e.hashCode());

    // Create a linked list for the bucket if it is not created
    if (table[bucketIndex] != null) {
      LinkedList<E> bucket = table[bucketIndex]; 
      for (E element: bucket)
        if (e.equals(element)) {
          bucket.remove(element);
          break;
        }
    }

    size--; // Decrease size

    return true;
  }

  /** Return true if the set contains no elements */
  public boolean isEmpty() {
    return size == 0;
  }

  /** Return the number of elements in the set */
  public int size() {
    return size;
  }

  /** Return an iterator for the elements in this set */
  public java.util.Iterator<E> iterator() {
    return new MyHashSetIterator(this);
  }

  /** Inner class for iterator */
  private class MyHashSetIterator implements java.util.Iterator<E> {
    // Store the elements in a list
    private java.util.ArrayList<E> list;
    private int current = 0; // Point to the current element in list
    MyHashSet<E> set;

    /** Create a list from the set */
    public MyHashSetIterator(MyHashSet<E> set) {
      this.set = set;
      list = setToList();
    }

    /** Next element for traversing? */
    public boolean hasNext() {
      if (current < list.size())
        return true;

      return false;
    }

    /** Get the current element and move cursor to the next */
    public E next() {
      return list.get(current++);
    }

    /** Remove the current element and refresh the list */
    public void remove() {
      // Delete the current element from the hash set
      set.remove(list.get(current)); 
      list.remove(current); // Remove the current element from the list
    }
  }  

  /** Hash function */
  private int hash(int hashCode) {
    return supplementalHash(hashCode) & (capacity - 1);
  }

  /** Ensure the hashing is evenly distributed */
  private static int supplementalHash(int h) {
    h ^= (h >>> 20) ^ (h >>> 12);
    return h ^ (h >>> 7) ^ (h >>> 4);
  }

  /** Return a power of 2 for initialCapacity */
  private int trimToPowerOf2(int initialCapacity) {
    int capacity = 1;
    while (capacity < initialCapacity) {
      capacity <<= 1;
    }

    return capacity;
  }

  /** Remove all e from each bucket */
  private void removeElements() {
    for (int i = 0; i < capacity; i++) {
      if (table[i] != null) {
        table[i].clear();
      }
    }
  }

  /** Rehash the set */
  private void rehash() {
    java.util.ArrayList<E> list = setToList(); // Copy to a list
    capacity <<= 1; // Double capacity      
    table = new LinkedList[capacity]; // Create a new hash table
    size = 0;

    for (E element: list) {
      add(element); // Add from the old table to the new table
    }
  }

  /** Copy elements in the hash set to an array list */
  private java.util.ArrayList<E> setToList() {
    java.util.ArrayList<E> list = new java.util.ArrayList<E>();

    for (int i = 0; i < capacity; i++) {
      if (table[i] != null) {
        for (E e: table[i]) {
          list.add(e);
        }
      }
    }  

    return list;
  }

  /** Return a string representation for this set */
  public String toString() {
    java.util.ArrayList<E> list = setToList();
    StringBuilder builder = new StringBuilder("[");

    // Add the elements except the last one to the string builder
    for (int i = 0; i < list.size() - 1; i++) {
      builder.append(list.get(i) + ", ");
    }

    // Add the last element in the list to the string builder
    if (list.size() == 0)
      builder.append("]");
    else
      builder.append(list.get(list.size() - 1) + "]");

    return builder.toString();
  }
}

我不太理解这一部分的意思:
  /** Ensure the hashing is evenly distributed */
  private static int supplementalHash(int h) {
    h ^= (h >>> 20) ^ (h >>> 12);
    return h ^ (h >>> 7) ^ (h >>> 4);
  }

操作都很清晰,但是它们如何确保哈希均匀分布呢?
关于这段代码还有一个问题,在这一部分中:
  /** Add an element to the set */
  public boolean add(E e) {
    if (contains(e)) 
      return false;

    if (size > capacity * loadFactorThreshold) {
      if (capacity == MAXIMUM_CAPACITY)
        throw new RuntimeException("Exceeding maximum capacity");

      rehash();
    }

    int bucketIndex = hash(e.hashCode());

    // Create a linked list for the bucket if it is not created
    if (table[bucketIndex] == null) {
      table[bucketIndex] = new LinkedList<E>();
    }

    // Add e to hashTable[index]
    table[bucketIndex].add(e);

    size++; // Increase size

    return true;
  }

为什么不在 size++ 后面放置大小检查和重新散列块?

类似的问题,实际上是非常不同的。 - qed
你问道:“这些操作都很清晰,但是它们如何确保哈希均匀分布?”- 这在回答中已经解释了。如果回答不够满意,可能需要缩小问题的范围。 - Robert Horvick
没有确定性哈希函数能够单独保证在桶之间的“均匀分布”,因此实现注释(严格来说)是不正确的。以这种方式对哈希值进行后处理的动机在链接的问题中有解释(简单地说,它弥补了特定类型哈希函数的缺点)。 - meriton
1个回答

2
所有操作都很清晰,但它们如何确保哈希均匀分布呢?
这并不能确保,它只是简单地努力随机排列位,特别是低位,因此您可以获得合理随机的位排列,而不会过于复杂。
不幸的是,它未考虑到移位实际上是一种昂贵的操作,特别是当有多个移位时,它可能会阻塞CPU流水线。使用乘法和加法以及一个移位可以获得良好的结果,并且速度更快。乘法和加法还可以提高更高位数的随机性。
请注意:将在输入哈希的总共9个位之间对低位进行“^”,但顶部位,特别是最高的4位,将不受此过程的影响。
这不是问题,因为hash()将掩码低位(就像这里所做的那样),或者使用%来取模,这更加昂贵,但再次只需要合理随机的低位,假设模数不太大。
为什么不将大小检查和重新哈希块放在size++之后?
调整大小是昂贵的,您可以先添加该元素,然后调整大小,但这意味着在调整大小之前和作为调整大小过程的一部分添加该元素会触发调整大小两次。

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接