def unique2d(arr, consider_sort=False, return_index=False, return_inverse=False):
    """Return the unique rows of a 2D array.

    Every row is reinterpreted as a single opaque ``np.void`` item so that
    ``np.unique`` can compare whole rows at once. Rows are therefore
    compared by their raw bytes.

    Parameters
    ----------
    arr : numpy.ndarray
        2D array whose rows are de-duplicated.
    consider_sort : bool, optional
        If true, each row is sorted before being compared, so two rows
        holding the same values in a different arrangement count as equal.
        The rows that are returned are still taken from ``arr`` unsorted.
    return_index : bool, optional
        If true, also return ``idx``, the indices into ``arr`` of the rows
        that were kept (as produced by ``np.unique(..., return_index=True)``).
    return_inverse : bool, optional
        If true, also return ``inv``, a 1-D array with one entry per row of
        ``arr`` giving the position of that row in the unique result.

    Returns
    -------
    numpy.ndarray
        ``arr[idx]``, the unique rows, when both flags are false.
    tuple
        ``(rows, idx)``, ``(rows, inv)`` or ``(rows, idx, inv)`` depending
        on which flags are set.
    """
    # Truthiness (not ``is True``) so that np.bool_ and 0/1 flags are honoured.
    keys = np.sort(arr, axis=1) if consider_sort else arr

    # One void item per row; the view needs a C-contiguous buffer.
    row_dtype = np.dtype((np.void, keys.dtype.itemsize * keys.shape[1]))
    # The view has shape (n_rows, 1). Flatten it before np.unique so the
    # inverse is always 1-D: NumPy >= 2.0 shapes the inverse like the input.
    packed = np.ascontiguousarray(keys).view(row_dtype).ravel()

    if return_inverse:
        _, idx, inv = np.unique(packed, return_index=True, return_inverse=True)
    else:
        _, idx = np.unique(packed, return_index=True)
        inv = None

    result = (arr[idx],)
    if return_index:
        result += (idx,)
    if return_inverse:
        result += (inv,)
    return result[0] if len(result) == 1 else result
我们现在可以将我们的映射定义如下:
def row_mapper(a, b, consider_sort=False):
    """Return index arrays that bring the rows of ``a`` and ``b`` into a common order.

    The rows of ``a`` and ``b`` are stacked and labelled with the inverse
    indices from ``unique2d``, so equal rows share a label. Each array's
    rows are then arg-sorted by label.

    Parameters
    ----------
    a, b : numpy.ndarray
        2D arrays with identical dtype and shape.
    consider_sort : bool, optional
        Forwarded to ``unique2d``: if true, rows holding the same values in
        a different arrangement are treated as equal.

    Returns
    -------
    tuple of numpy.ndarray
        ``(idx_a, idx_b)``. When ``b`` contains the same rows as ``a`` in a
        different order, ``a[idx_a]`` equals ``b[idx_b]``.

    Raises
    ------
    AssertionError
        If the dtypes or shapes of ``a`` and ``b`` differ.
    """
    assert a.dtype == b.dtype, "a and b must have the same dtype"
    assert a.shape == b.shape, "a and b must have the same shape"

    stacked = np.concatenate((a, b))
    _, inv = unique2d(stacked, consider_sort=consider_sort, return_inverse=True)
    # Guard against an (n, 1)-shaped inverse (np.unique on NumPy >= 2.0 shapes
    # the inverse like its input); argsort below needs flat labels.
    inv = np.ravel(inv)

    # ``a`` comes first in ``stacked``, so its row count marks the split.
    n_a = a.shape[0]
    labels_a, labels_b = inv[:n_a], inv[n_a:]
    return np.argsort(labels_a), np.argsort(labels_b)
验证:
# Sanity check: B is A with its rows reversed, so the two index arrays
# returned by row_mapper must bring both into the same row order.
n = 100000
A = np.arange(n).reshape(n // 4, 4)
B = A[::-1, :]
idx_a, idx_b = row_mapper(A, B)
# print() call instead of the Python 2 print statement.
print(np.all(A[idx_a] == B[idx_b]))
基准测试:
针对 @ali_m 的解决方案进行基准测试
%timeit find_row_mapping(A,B) # ali_m's solution
%timeit row_mapper(A,B) # current solution
# n = 100
100000 loops, best of 3: 12.2 µs per loop
10000 loops, best of 3: 47.3 µs per loop
# n = 1000
10000 loops, best of 3: 49.1 µs per loop
10000 loops, best of 3: 148 µs per loop
# n = 10000
1000 loops, best of 3: 548 µs per loop
1000 loops, best of 3: 1.6 ms per loop
# n = 100000
100 loops, best of 3: 6.96 ms per loop
100 loops, best of 3: 19.3 ms per loop
# n = 1000000
10 loops, best of 3: 160 ms per loop
1 loops, best of 3: 372 ms per loop
# n = 10000000
1 loops, best of 3: 2.54 s per loop
1 loops, best of 3: 5.92 s per loop
虽然当前的解决方案也许还有改进的空间,但是与ali_m的解决方案相比,当前的方案要慢2-3倍,而且可能会更加混乱,因为两个数组都需要映射。只是想提供一个备选方案。
`B[:] = A` 有什么问题吗? - user2357112
`A` 和 `B` 的行分别映射到数组 `C` 和 `D`,而 `A` 和 `B` 的(行)顺序决定了 `C` 和 `D` 中的值,所以我不能搞乱顺序。 - romeric