似乎有一篇关于哈希碰撞概率的文章,可以在这里找到。
可以尝试使用 xxhash64(自 Spark 3 起可用)、md5 或 sha2 函数来获取(几乎)唯一的哈希值。
例如:
# Hash five sample e-mail literals with xxhash64 (available from Spark 3) and
# print the single resulting row; each column holds a signed 64-bit integer.
# truncate=False is required: by default show() shortens every cell, including
# the column headers, to 20 characters, so the full-width table shown below
# could not be reproduced.
spark.sql("""select xxhash64('pipohecho@hotmail.com'),
xxhash64('rozas_huertas@hotmail.com'),
xxhash64('miguelilloooooooooouu@hotmail.com'),
xxhash64('rjdzpmsyi@hotmail.com'),
xxhash64('pepe@hotmail.com')""").show(truncate=False)
#+-------------------------------+-----------------------------------+-------------------------------------------+-------------------------------+--------------------------+
#|xxhash64(pipohecho@hotmail.com)|xxhash64(rozas_huertas@hotmail.com)|xxhash64(miguelilloooooooooouu@hotmail.com)|xxhash64(rjdzpmsyi@hotmail.com)|xxhash64(pepe@hotmail.com)|
#+-------------------------------+-----------------------------------+-------------------------------------------+-------------------------------+--------------------------+
#|6332927369894443419 |-8140372026824474906 |-9124920009896762502 |1936246589584419991 |954028670536665140 |
#+-------------------------------+-----------------------------------+-------------------------------------------+-------------------------------+--------------------------+
# Hash the same five e-mail literals with md5 and print the single resulting row.
# Spark implicitly casts each string argument to BINARY (visible in the column
# headers of the output) and returns the digest as a 32-character hex string.
# truncate=False is required: the default show() cuts cells to 20 characters,
# which would print only the first 17 hex digits followed by "..." and make the
# digests impossible to compare.
spark.sql("""select md5('pipohecho@hotmail.com'),
md5('rozas_huertas@hotmail.com'),
md5('miguelilloooooooooouu@hotmail.com'),
md5('rjdzpmsyi@hotmail.com'),
md5('pepe@hotmail.com')""").show(truncate=False)
#+------------------------------------------+----------------------------------------------+------------------------------------------------------+------------------------------------------+-------------------------------------+
#|md5(CAST(pipohecho@hotmail.com AS BINARY))|md5(CAST(rozas_huertas@hotmail.com AS BINARY))|md5(CAST(miguelilloooooooooouu@hotmail.com AS BINARY))|md5(CAST(rjdzpmsyi@hotmail.com AS BINARY))|md5(CAST(pepe@hotmail.com AS BINARY))|
#+------------------------------------------+----------------------------------------------+------------------------------------------------------+------------------------------------------+-------------------------------------+
#|7ce30aa0209335873f79e64c2eb465ff |9d58c495ab87f2e3a4a9adc6c8fbbb76 |c283a7c6f09712fc5ba4ea30334e2c25 |6766da691171aa5c56a70b89bd4590fa |ab888b1a15b420b410d23b927a370013 |
#+------------------------------------------+----------------------------------------------+------------------------------------------------------+------------------------------------------+-------------------------------------+
# Hash the same five e-mail literals with sha2 using a 256-bit digest length and
# print the single resulting row; each digest is a 64-character hex string.
# As with md5, the string arguments are implicitly cast to BINARY.
# truncate=False is required: the default show() cuts cells to 20 characters,
# so the 64-character digests (and the long column headers) would be truncated.
spark.sql("""select sha2('pipohecho@hotmail.com',256),
sha2('rozas_huertas@hotmail.com',256),
sha2('miguelilloooooooooouu@hotmail.com',256),
sha2('rjdzpmsyi@hotmail.com',256),
sha2('pepe@hotmail.com',256)""").show(truncate=False)
#+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+
#|sha2(CAST(pipohecho@hotmail.com AS BINARY), 256) |sha2(CAST(rozas_huertas@hotmail.com AS BINARY), 256) |sha2(CAST(miguelilloooooooooouu@hotmail.com AS BINARY), 256) |sha2(CAST(rjdzpmsyi@hotmail.com AS BINARY), 256) |sha2(CAST(pepe@hotmail.com AS BINARY), 256) |
#+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+
#|02068bc029cd26888a4ba630ecfa91b4afc2bf72c4adeabcfcd32459529c61bb|391af34e53d82ce8f12a1396d5ae74d96f3ea583cf3fd864816b29586ed002f8|fde18d7d27497717a8a77a0eace29ad5dbcb7319637be033c3e66a068a2bd983|b07300bee7e68326143c40f75b608201f5db667a18bb73b63f9f909454521753|921efc4884d3c8a32899c079024386641564ec0d0966cc059429bbd33770e421|
#+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------+