MD5哈希计算函数失败

5

我正在尝试在R中编写一个不调用任何C例程的md5哈希函数。虽然我的代码执行得很好,但输出永远无法与tools::md5sum相匹配(而这与各种在线文档提供的示例匹配)。我怀疑有些地方存在字节顺序(或字词顺序)问题;正如下面提供的代码所示,我尝试了一些翻转操作,但没有成功。 我已经检查了输出中的简单不匹配情况,例如正确的字节但是顺序错误,但没有运气。

要运行此函数,您需要使用库RmpfrbigBits以及此处提供的函数(flip32flip16bigRotate)。

# need these to run:
library(Rmpfr)
library(bigBits)

mymd5 <- function(msg){

  if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32))) 

sidx <- c( 7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 )

#  K-values verified to be correct.
 K <- as.bigz(2^32 * abs(sin(1:64))) 

# IETF  shows these values as-is, i.e. don't change Endian on these calculations
#ietf.org page says A is 01234567, ie lower bytes first.  

adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'

# then optionally
adef <- flip32(adef)
bdef <- flip32(bdef)
cdef <- flip32(cdef)
ddef <- flip32(ddef)

a0 <- (base2base(adef,16,10))[[1]]
b0 <-(base2base(bdef,16,10))[[1]]
c0 <-(base2base(cdef,16,10))[[1]]
d0 <-(base2base(ddef,16,10))[[1]]

#  append 0x80  and pad with 0x00 bytes so that the message length in bytes ≡ 56 (mod 64).
  origlenbit <- length(msg) * 8
  msg <- c(msg, as.raw('0x80') )
  bytemod <- length(msg)%%64
  raw00 <- as.raw('0x00')
#  append  '0x00' 56-bytemod times
    if (bytemod <  56 && bytemod > 0 ) {
        msg <- c(msg,rep(raw00,times=(56-bytemod)))
    } else if (bytemod > 56) {
        msg <- c(msg, rep(raw00, times = bytemod-56))
    }
    
 # https://datatracker.ietf.org/doc/html/rfc1321#section-3.4 says
 #  A 64-bit representation of b (the length of the message before the
   # padding bits were added) is appended to the result of the previous
   # step. In the unlikely event that b is greater than 2^64, then only
   # the low-order 64 bits of b are used. (These bits are appended as two
   # 32-bit words and appended low-order word first in accordance with the
   # previous conventions.)
    len16 <-    base2base(origlenbit,10,16)[[1]]
    lenbit2 <- base2base(origlenbit,10,2)[[1]]
    if(nchar(lenbit2) < 64 ){
        lenbit2 <- paste0(c(rep('0',times = 64 -nchar(lenbit2)) ,lenbit2), collapse='')
    } else if (nchar(lenbit2) > 64) lenbit2 <- rev(rev(lenbit2)[1:64])
    newbytes2 <- substring(lenbit2, seq(1,nchar(lenbit2)-7,by=8), last = seq(8, nchar(lenbit2),by = 8) )    
    newdec <- base2base(newbytes2,2,10)
    newdbl <- rep(0,times= 8)
    for (jd in 1: 8) newdbl[jd] <- as.numeric(newdec[[jd]])
    newraw <- as.raw(newdbl)
#  which order is correct? 
#   msg <- c(msg,newraw[c(7,8,5,6,3,4,1,2)])
    msg <- c(msg, newraw[5:8], newraw[1:4])
#   msg <- c(msg, newraw[1:4], newraw[5:8])
    chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
    for(jchunk in 1: (length(msg)/64)) {  
        thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
#  should I reverse the bytes of this chunk? 
        thisM <- as.bigz(rep(0,16) )
        for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1) )
#       for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(1,256,256^2,256^3) )
      # Initialize hash value for this chunk:
         Ah<- a0
         Bh<- b0
         Ch<- c0
         Dh<- d0
#now the chunk loop, which I make into 4 sep'te loops 
        for (jone in 1:16) {
            Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
            g <- jone
            Fh<- (Fh + Ah + K[jone] + thisM[g]) %%two32  # truncate
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[jone], inTwo=FALSE)[[1]] )%%two32   
        }

        for (j2 in 17:32) {
            Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
            g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
            Fh<- (Fh + Ah + K[j2] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j2], inTwo=FALSE)[[1]])%%two32   
        }
        for(j3 in 33:48){
            Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
            g <-(5 + 3*(j3-1))%%16 +1
            Fh<- (Fh + Ah + K[j3] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j3],inTwo=FALSE)[[1]])%%two32 
        }
        for(j4 in 49:64){
            Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
            g <- (7 * (j4 -1))%%16 +1
            Fh<- (Fh + Ah + K[j4] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j4], inTwo=FALSE)[[1]])%%two32 
        }
           # Add this chunk's hash to result so far:
        a0<- (a0 + Ah)%%two32
        b0<- (b0 + Bh)%%two32
        c0<- (c0 + Ch)%%two32
        d0<- (d0 + Dh)%%two32
    }   # end of jchunk

#finally, string the values together.
# watch for those leading zeros
a0x <-(base2base(a0,10,16))
a0x <- paste0(c(rep(0,times=max(8-(nchar(a0x)),0)), a0x), collapse='')
b0x <-(base2base(b0,10,16))
b0x <- paste0(c(rep(0,times=max(8-(nchar(b0x)),0)), b0x), collapse='')
c0x <-(base2base(c0,10,16))
c0x <- paste0(c(rep(0,times=max(8-(nchar(c0x)),0)), c0x), collapse='')
d0x <-(base2base(d0,10,16))
d0x <- paste0(c(rep(0,times=max(8-(nchar(d0x)),0)), d0x), collapse='')


thesum <- paste0(c(a0x,b0x,c0x,d0x) , sep='',collapse='')
return(thesum)
} 

# helper func to test reordering bytes. e.g.  01234567 from 67452301

flip32 <- function(x){
    xsep <- unlist(strsplit(x,'') )
    xout <- paste0(c(xsep[c(7,8,5,6,3,4,1,2)]),collapse='')
    return(xout)
}
flip16 <- function(x) {
        xsep <- unlist(strsplit(x,''))
    xout <- paste0(c(xsep[c(3,4,1,2)]),collapse='')
    return(xout)
}


bigRotate <- function(x, shift,  inBase = 10,binSize = 32, outBase = 10, inTwosComp = TRUE) {
#default binary size is 32 to match bitwShiftR  
 shift = floor(shift[1])

 out <- rep('0',times = length(x))  # gmp::as.bigz(rep(0,times=length(x)))
    for(jj in 1:length(x)) {
        bintmp <- base2base(x[[jj]],inBase,2, binSize = binSize, inTwosComp = inTwosComp, outTwosComp = FALSE)[[1]]
#now all inputs will have a neg sign if negative.
        isPos = TRUE
        if(length(grep('-',bintmp)) ) {
            isPos = FALSE
            bintmp <- gsub('^-','',bintmp)
        }
        bintmp <- strsplit(bintmp,'')[[1]]
        xlen = length(bintmp)
    # now rotate the bits.  
        otmp  <- unlist(paste0(c(bintmp[ ((1:xlen) + shift -1) %%(xlen) + 1]),collapse=''))
        out[jj] <- base2base(otmp, 2, outBase, binSize=binSize, outTwosComp = FALSE, classOut = "character")[[1]]
        if(!isPos) out[jj] <- paste0('-',out[jj], collapse='')
    }
    # provide output in source 'inBase'
    if(is(x,'mpfr') || is(x,'mpfr1')) {
        out <- Rmpfr::.bigz2mpfr(gmp::as.bigz(out))
    } else if(is(x,'numeric')) {
        out <- as.numeric(out)
    } else if(is(x,'bigz')) out <- gmp::as.bigz(out)
    
 return(out) 
}
1个回答

5

下面列出了几个问题。

1. 移除了 flip32

2. 添加填充位的问题

已经重写为

msg <- c(msg, as.raw('0x80') )
while (length(msg) %% 64 != 56) {
  msg <- c(msg, as.raw(0))
}

3. 需要反转 thischunk 矩阵的行。
thischunk <- thischunk[nrow(thischunk):1,]

4. bigRotate的问题

这可以表示为((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF,因此已经被重写为

bigRotate <- function(x, amount) {
  two32 <- as.bigz(2^(as.bigz(32)))
  x <- x %% two32
  lshift <- bigShiftL(x, amount)
  rshift <- bigShiftR(x, 32-amount)
  
  # bigOr errors if either value is zero so handle those cases explicitly
  if (lshift == 0) {
    rshift
  } else if (rshift == 0) {
    lshift
  } else {
    bigOr(lshift, rshift) %% two32
  }
}

5. 将abcd进行了重写,并且字节顺序被交换。
swap_endianness <- function(hex) {
  chars <- strsplit(hex, "")[[1]]
  pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
  paste0(rev(pairs), collapse = "")
}

thesum <- sum(
    bigShiftL(a0, 32 * 0),
    bigShiftL(b0, 32 * 1),
    bigShiftL(c0, 32 * 2),
    bigShiftL(d0, 32 * 3)
)
hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
output <- swap_endianness(hex)

修正代码

实施上述更改产生的结果是

library(bigBits)
library(gmp)


mymd5 <- function(msg){
  if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
  
  # a const to be used when truncating values > ffffffff
  two32 <- as.bigz(2^(as.bigz(32)))
  sidx <- c(7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23, 4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21)
  K <- as.bigz(2^32 * abs(sin(1:64))) 
  
  adef <- '67452301'
  bdef <- 'efcdab89'
  cdef <- '98badcfe'
  ddef <- '10325476'
  
  a0 <- base2base(adef, 16, 10)[[1]]
  b0 <- base2base(bdef, 16, 10)[[1]]
  c0 <- base2base(cdef, 16, 10)[[1]]
  d0 <- base2base(ddef, 16, 10)[[1]]

  origlenbit <- length(msg) * 8
  
  msg <- c(msg, as.raw('0x80') )
  while (length(msg) %% 64 != 56) {
    msg <- c(msg, as.raw(0))
  }

  length_bytes <- writeBin(as.integer(origlenbit), raw(), size = 8, endian = "little")
  msg <- c(msg, length_bytes, rep(0, length.out = 8 - length(length_bytes)))
  
  chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
  
  ##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
  for(jchunk in 1: (length(msg)/64)) {  
    thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
    thischunk <- thischunk[nrow(thischunk):1,]
    thisM <- as.bigz(rep(0,16) )
    for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1))

    # Initialize hash value for this chunk:
    Ah <- a0
    Bh <- b0
    Ch <- c0
    Dh <- d0
    
    #now the chunk loop, which I make into 4 sep'te loops 
    for (jone in 1:16) {
      Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
      g <- jone
      Fh <- (Fh + Ah + K[jone] + thisM[g]) %% two32
      Ah <- Dh
      Dh <- Ch
      Ch <- Bh
      Bh <- (Bh + bigRotate(Fh, sidx[jone])) %% two32   
    }
    
    for (j2 in 17:32) {
      Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
      g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
      Fh <- (Fh + Ah + K[j2] + thisM[g]) %% two32  
      Ah <- Dh
      Dh <- Ch
      Ch <- Bh
      Bh <- (Bh + bigRotate(Fh, sidx[j2])) %% two32
    }
    for(j3 in 33:48){
      Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
      g <-(5 + 3*(j3-1))%%16 +1
      Fh<- (Fh + Ah + K[j3] + thisM[g]) %% two32  
      Ah<- Dh
      Dh<- Ch
      Ch<- Bh
      Bh<- (Bh + bigRotate(Fh, sidx[j3]))%%two32 
    }
    for(j4 in 49:64){
      Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
      g <- (7 * (j4 -1))%%16 +1
      Fh<- (Fh + Ah + K[j4] + thisM[g]) %% two32  
      Ah<- Dh
      Dh<- Ch
      Ch<- Bh
      Bh<- (Bh + bigRotate(Fh, sidx[j4])) %% two32 
    }
    
    # Add this chunk's hash to result so far:
    a0 <- (a0 + Ah) %% two32
    b0 <- (b0 + Bh) %% two32
    c0 <- (c0 + Ch) %% two32
    d0 <- (d0 + Dh) %% two32
  }   # end of jchunk

  thesum <- sum(
    bigShiftL(a0, 32 * 0),
    bigShiftL(b0, 32 * 1),
    bigShiftL(c0, 32 * 2),
    bigShiftL(d0, 32 * 3)
  )
  
  hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
  swap_endianness(hex)
}


# ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF
bigRotate <- function(x, amount) {
  two32 <- as.bigz(2^(as.bigz(32)))
  x <- x %% two32
  lshift <- bigShiftL(x, amount)
  rshift <- bigShiftR(x, 32-amount)
  
  # bigOr errors if either value is zero so handle those cases explicitly
  if (lshift == 0) {
    rshift
  } else if (rshift == 0) {
    lshift
  } else {
    bigOr(lshift, rshift) %% two32
  }
}


swap_endianness <- function(hex) {
  chars <- strsplit(hex, "")[[1]]
  pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
  paste0(rev(pairs), collapse = "")
}

通过一些单元测试
library(testthat)

test_that("md5 works", {
  expect_equal(mymd5(""), "d41d8cd98f00b204e9800998ecf8427e")
  expect_equal(mymd5("a"), "0cc175b9c0f1b6a831c399e269772661")
  expect_equal(mymd5("abc"), "900150983cd24fb0d6963f7d28e17f72")
  expect_equal(mymd5("message digest"), "f96b697d7cb7938d525a2f31aaf161d0")
  expect_equal(mymd5("abcdefghijklmnopqrstuvwxyz"), "c3fcd3d76192e4007dfb496cca67e13b")
  expect_equal(mymd5("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), "d174ab98d277d9f5a5611c2c9f419d9f")
  expect_equal(mymd5("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), "57edf4a22be3c955ac49da2e2107b67a")
})
#> Test passed

1
哇!谢谢!我已经发现了其中的一些问题(比如在thischunk中反转字节),但是你对细节的敏锐关注令人印象深刻。我需要仔细研究你简化过的bigRotate,因为我希望那个函数能够通用,即适用于任意大小的值,而不仅限于32位。 - Carl Witthoft
给那些“爬取代码”的人一个快速的跟进说明——当修改bigRotate以允许任意大小的二进制时,它的运行速度比我在问题中提供的代码要慢得多。它们都能产生相同且正确的输出。 - Carl Witthoft

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接