def getR1(VVm, VVs, HHm, HHs):
    """Squared Euclidean distance between every measured (V, H) point and
    every precomputed (V, H) point, built with in-place broadcasting.

    Parameters: VVm/HHm are the measured V and H grids, VVs/HHs the
    precomputed ones (any shape; all four are flattened).
    Returns an (VVm.size, VVs.size) array of (VVs-VVm)**2 + (HHs-HHm)**2.

    Fastest of the pure-numpy variants benchmarked here because it never
    materialises a 3-D deltas array; peak RAM was ~17.5 GB for the
    benchmarked sizes (two (m, s) float64 temporaries).
    """
    t0 = time.time()
    # Broadcast difference of the V components: shape (m, s)
    R = VVs.flatten()[numpy.newaxis, :] - VVm.flatten()[:, numpy.newaxis]
    R *= R  # square in place -- no extra temporary
    R1 = HHs.flatten()[numpy.newaxis, :] - HHm.flatten()[:, numpy.newaxis]
    R1 *= R1
    R += R1
    del R1  # release the second (m, s) temporary as early as possible
    print("R1\t", time.time() - t0, R.shape, end=" ")  # 11.7576191425 (108225, 10500)
    print(numpy.max(R))  # 4176.26290975
    return R
def getR2(VVm, VVs, HHm, HHs):
    """Squared Euclidean distance matrix (same contract and result as a
    broadcast-and-square approach), via an explicit deltas array reduced
    with einsum.

    Returns an (VVm.size, VVs.size) array of squared distances.  Slower
    and hungrier (~26 GB peak in the benchmark) because the full
    (m, s, 2) deltas array is materialised before the reduction.
    """
    t0 = time.time()
    precomputed_flat = numpy.column_stack((VVs.flatten(), HHs.flatten()))
    measured_flat = numpy.column_stack((VVm.flatten(), HHm.flatten()))
    # (m, s, 2) component-wise differences -- the expensive step
    # (benchmark: ~5.86 s and most of the 26 GB just for this array)
    deltas = precomputed_flat[None, :, :] - measured_flat[:, None, :]
    # Sum of squares over the last axis without another large temporary
    R = numpy.einsum('ijk,ijk->ij', deltas, deltas)
    print("R2\t", time.time() - t0, R.shape, end=" ")  # 14.5291359425 (108225, 10500)
    print(numpy.max(R))  # 4176.26290975
    return R
def getR3(VVm, VVs, HHm, HHs):
    """Squared Euclidean distance matrix via a deltas array reduced with
    inner1d (row-wise dot product over the last axis).

    numpy.core.umath_tests was never public API and is gone in NumPy 2.0,
    so fall back to an einsum equivalent when the import fails.
    Returns an (VVm.size, VVs.size) array; ~26 GB peak in the benchmark
    because the (m, s, 2) deltas array is materialised.
    """
    try:
        from numpy.core.umath_tests import inner1d
    except ImportError:
        # Removed in NumPy 2.0; einsum computes the same row-wise dot.
        def inner1d(a, b):
            return numpy.einsum('...k,...k->...', a, b)
    t0 = time.time()
    precomputed_flat = numpy.column_stack((VVs.flatten(), HHs.flatten()))
    measured_flat = numpy.column_stack((VVm.flatten(), HHm.flatten()))
    # (m, s, 2) component-wise differences -- the expensive step
    deltas = precomputed_flat[None, :, :] - measured_flat[:, None, :]
    R = inner1d(deltas, deltas)
    print("R3\t", time.time() - t0, R.shape, end=" ")  # 12.6972110271 (108225, 10500)
    print(numpy.max(R))  # 4176.26290975
    return R
def getR4(VVm, VVs, HHm, HHs):
    """Squared Euclidean distances via scipy.spatial.distance.cdist.

    NOTE: the result is the TRANSPOSE of the broadcasting variants --
    cdist(precomputed, measured) has shape (VVs.size, VVm.size); the
    posted benchmark confirms (10500, 108225).  Cheapest on memory
    (~9 GB) because cdist allocates only the output array.
    """
    from scipy.spatial.distance import cdist
    t0 = time.time()
    precomputed_flat = numpy.column_stack((VVs.flatten(), HHs.flatten()))
    measured_flat = numpy.column_stack((VVm.flatten(), HHm.flatten()))
    # Bug fix: the original called spdist.cdist but only imported cdist,
    # which raises NameError -- use the imported name directly.
    R = cdist(precomputed_flat, measured_flat, 'sqeuclidean')  # .T
    print("R4\t", time.time() - t0, R.shape, end=" ")  # 17.7022118568
    print(numpy.max(R))  # 4176.26290975
    return R
def getR5(VVm, VVs, HHm, HHs):
    """Plain (non-squared) Euclidean distances via cdist.

    Same transposed shape as the sqeuclidean variant:
    (VVs.size, VVm.size).  Since sqrt is monotonic, argmin over either
    axis matches the squared variants; only the values differ (note the
    max drops from ~4176 to ~64 in the benchmark).  ~9 GB peak RAM.
    """
    from scipy.spatial.distance import cdist
    t0 = time.time()
    precomputed_flat = numpy.column_stack((VVs.flatten(), HHs.flatten()))
    measured_flat = numpy.column_stack((VVm.flatten(), HHm.flatten()))
    # Bug fix: the original called spdist.cdist but only imported cdist,
    # which raises NameError -- use the imported name directly.
    R = cdist(precomputed_flat, measured_flat, 'euclidean')  # .T
    print("R5\t", time.time() - t0, R.shape, end=" ")  # 15.6070930958
    print(numpy.max(R))  # 64.6240118667
    return R
def getR6(VVm, VVs, HHm, HHs):
    """The broadcasting variant with the in-place arithmetic pushed
    through scipy.weave.blitz (compiled Blitz++ expressions).

    NOTE(review): scipy.weave is Python-2-only and was removed from SciPy
    (deprecated in 0.19, split out as the separate 'weave' package), so
    this variant only runs in a legacy environment -- kept for the
    benchmark record.  Returns an (VVm.size, VVs.size) array of
    (VVs-VVm)**2 + (HHs-HHm)**2.
    """
    from scipy.weave import blitz
    t0 = time.time()
    R = VVs.flatten()[numpy.newaxis, :] - VVm.flatten()[:, numpy.newaxis]
    blitz("R=R*R")    # compiled in-place square, equivalent to R *= R
    R1 = HHs.flatten()[numpy.newaxis, :] - HHm.flatten()[:, numpy.newaxis]
    blitz("R1=R1*R1")  # R1 *= R1
    blitz("R=R+R1")    # R += R1
    del R1  # release the second (m, s) temporary
    print("R6\t", time.time() - t0, R.shape, end=" ")  # 11.7576191425 (108225, 10500)
    print(numpy.max(R))  # 4176.26290975
    return R
以下是结果的时间:
R1 11.7737319469 (108225, 10500) 4909.66881791
R2 15.1279799938 (108225, 10500) 4909.66881791
R3 12.7408981323 (108225, 10500) 4909.66881791
R4 17.3336868286 (10500, 108225) 4909.66881791
R5 15.7530870438 (10500, 108225) 70.0690289494
R6 11.670968771 (108225, 10500) 4909.66881791
虽然最后一个公式是sqrt((VVm-VVs)^2+(HHm-HHs)^2),但其他的都是(VVm-VVs)^2+(HHm-HHs)^2。这并不重要,因为在我的代码中我会对每个i的R[i,:]取最小值,而sqrt并不影响最小值(如果我想要距离,我只需要取sqrt(value),而不是对整个数组进行sqrt,所以时间上没有任何差异。
问题是:为什么第一个方案是最好的?(第二和第三个方案较慢的原因是 deltas=... 这一步本身就需要 5.8 秒,这也是这两种方法占用 26GB 内存的原因。)另外,为什么 sqeuclidean 会比 euclidean 还慢?
sqeuclidean 本应只计算 (VVm-VVs)^2+(HHm-HHs)^2,但我认为它执行了不同的操作。有人知道如何找到该方法的源代码(C 或其他底层语言)吗?我怀疑它实际计算的是 sqrt((VVm-VVs)^2+(HHm-HHs)^2)^2——这是我能想到的它比直接计算 (VVm-VVs)^2+(HHm-HHs)^2 更慢的唯一解释(我知道这个理由很牵强,有没有更合理的原因?)
由于我对C一无所知,我该如何用scipy.weave内联它?那个代码是否可以像普通的Python一样编译?还是我需要安装特殊的东西?
编辑:好吧,我用scipy.weave.blitz尝试了一下(R6方法),速度稍微快了一些,但我认为懂得C语言的人仍然可以提高这个速度。我只取了形如a+=b或*=的行,并查找了它们在C中的表现形式,并将它们放入blitz语句中,但我想如果我也将带有flatten和newaxis语句的行放入C中,那么速度应该会更快,但我不知道如何做到这一点(懂C语言的人可能解释一下吗?)。目前,使用blitz和第一个方法之间的差异并不大,我猜这不是由于C与numpy之间的差异造成的?
我猜其他方法,比如deltas=...,如果我将其放入C中,速度也会更快?
评论:你可以通过避免为 R 重新分配内存(即尽量使用像 R1 += R2 这样的就地操作)来节省几秒钟的时间。 - bogatron