这里是使用
blas
进行了优化的版本。
blas
拥有针对对称或厄米矩阵的特殊线性代数例程。这些例程使用专门的打包存储,仅保留上三角或下三角,并省略(冗余的)镜像条目。这样,blas不仅节约了大约一半的存储空间,还节约了大约一半的浮点运算次数。我添加了相当多的注释以使其易于阅读。
import numpy as np
import itertools
from scipy.linalg.blas import zhpr, dspr2, zhpmv
def acc_vect(pos, mas):
n = mas.size
d2 = pos@(-2*pos.T)
diag = -0.5 * np.einsum('ii->i', d2)
d2 += diag + diag[:, None]
np.einsum('ii->i', d2)[...] = 1
return np.nansum((pos[:, None, :] - pos) * (mas[:, None] * d2**-1.5)[..., None], axis=0)
def acc_blas(pos, mas):
n = mas.size
trck = np.zeros((3, n * (n + 1) // 2), complex)
for a, p in zip(trck, pos.T - 1j):
zhpr(n, -2, p, a, 1, 0, 0, 1)
ppT = trck.real.sum(0) + 6
dspr2(n, -0.5, ppT[np.r_[0, 2:n+1].cumsum()], np.ones((n,)), ppT,
1, 0, 1, 0, 0, 1)
np.divide(trck.imag, ppT*np.sqrt(ppT), where=ppT.astype(bool),
out=trck.imag)
out = np.zeros((3, n), complex)
for a, o in zip(trck, out):
zhpmv(n, 0.5, a, mas*-1j, 1, 0, 0, o, 1, 0, 0, 1)
return out.real.T
def accelerations(positions, masses, epsilon=1e-6, gravitational_constant=1.0):
'''Params:
- positions: numpy array of size (n,3)
- masses: numpy array of size (n,)
'''
n_bodies = len(masses)
accelerations = np.zeros([n_bodies,3])
D = np.zeros([n_bodies,n_bodies,3])
for i, j in itertools.product(range(n_bodies), range(n_bodies)):
D[i][j] = positions[j]-positions[i]
A = np.zeros((n_bodies, n_bodies,3))
for i, j in itertools.product(range(n_bodies), range(n_bodies)):
if np.linalg.norm(D[i][j]) > epsilon:
A[i][j] = gravitational_constant * masses[j] * D[i][j] \
/ np.linalg.norm(D[i][j])**3
accelerations = np.sum(A, axis=1)
return accelerations
from numpy.linalg import norm
def acc_pm(positions, masses, G=1):
'''Params:
- positions: numpy array of size (n,3)
- masses: numpy array of size (n,)
'''
mass_matrix = masses.reshape((1, -1, 1))*masses.reshape((-1, 1, 1))
disps = positions.reshape((1, -1, 3)) - positions.reshape((-1, 1, 3))
dists = norm(disps, axis=2)
dists[dists == 0] = 1
forces = G*disps*mass_matrix/np.expand_dims(dists, 2)**3
return forces.sum(axis=1)/masses.reshape(-1, 1)
n = 500
pos = np.random.random((n, 3))
mas = np.random.random((n,))
from timeit import timeit
print(f"loops: {timeit('accelerations(pos, mas)', globals=globals(), number=1)*1000:10.3f} ms")
print(f"pmende: {timeit('acc_pm(pos, mas)', globals=globals(), number=10)*100:10.3f} ms")
print(f"vectorized: {timeit('acc_vect(pos, mas)', globals=globals(), number=10)*100:10.3f} ms")
print(f"blas: {timeit('acc_blas(pos, mas)', globals=globals(), number=10)*100:10.3f} ms")
A = accelerations(pos, mas)
AV = acc_vect(pos, mas)
AB = acc_blas(pos, mas)
AP = acc_pm(pos, mas)
assert np.allclose(A, AV) and np.allclose(AB, AV) and np.allclose(AP, AV)
样例运行; 与OP、我的纯numpy向量化和@P Mende的比较。
loops: 3213.130 ms
pmende: 41.480 ms
vectorized: 43.860 ms
blas: 7.726 ms
我们可以看到:
1)P Mende在矢量化方面略胜过我;
2)
blas
速度大约快了
~5倍,请注意我的blas不是很好。我怀疑如果使用优化过的blas,你可能会得到更好的结果(numpy在更好的blas上也应该运行得更快,但需要注意的是)。
3)无论哪个答案都比循环要快得多。
np.where
的方法,结合一个“质量矩阵”,可以通过执行以下操作获得:masses.reshape((1, -1))*masses.reshape((-1, 1))
。 - PMende