从二维 numpy 数组中移除连续段（runs）

Question

从二维 numpy 数组中移除连续段（runs）

8

给定一个二维的numpy数组：

00111100110111
01110011000110
00111110001000
01101101001110

有没有一种高效的方法来替换长度大于等于N的连续的1？

例如，如果N=3

00222200110222
02220011000110
00222220001000
01101101002220

实际上，这个二维数组是二进制的，我想用0替换连续的1，但为了更清晰，我在上面的示例中用2代替它们。

可运行的示例：http://runnable.com/U6q0q-TFWzxVd_Uf/numpy-replace-runs-for-python 我目前使用的代码看起来有点笨拙，我觉得可能有一些神奇的numpy方法可以做到这一点：

更新：我知道我改变了示例，以一个不能处理边角情况的版本为例。那只是一个小的实现错误（现已修复）。我更感兴趣的是是否有一种更快的方法来做到这一点。

import numpy as np
import time

def replace_runs(a, search, run_length, replace = 2):
    """Return a copy of `a` in which collected runs of `search` become `replace`.

    Each row is scanned left to right. A run of consecutive `search` values is
    collected when it is at least `run_length` long. As written, a run is also
    collected regardless of its length when it ends at the last column, or
    when the non-matching value that closes it sits in the last column.

    The input array is left untouched.
    """
    result = a.copy()
    for row_idx, row in enumerate(a):
        last_col = len(row) - 1
        pending = []      # columns of the run currently being scanned
        to_replace = []   # columns of every collected run, in scan order
        for j, val in enumerate(row):
            if val == search:
                pending.append(j)
                continue
            # A non-matching value closes the pending run.
            if len(pending) >= run_length or j == last_col:
                to_replace.extend(pending)
            pending = []

        # Flush whatever run is still open once the row is exhausted.
        if len(pending) >= run_length or j == last_col:
            to_replace.extend(pending)

        for col in to_replace:
            result[row_idx][col] = replace

    return result

# Sample binary matrix: the four rows from the question plus all-ones,
# all-zeros and near-all-ones rows that exercise the row boundaries.
arr = np.array([
  [0,0,1,1,1,1,0,0,1,1,0,1,1,1],
  [0,1,1,1,0,0,1,1,0,0,0,1,1,0],
  [0,0,1,1,1,1,1,0,0,0,1,0,0,0],
  [0,1,1,0,1,1,0,1,0,0,1,1,1,0],
  [1,1,1,1,1,1,1,1,1,1,1,1,1,1],
  [0,0,0,0,0,0,0,0,0,0,0,0,0,0],
  [1,1,1,1,1,1,1,1,1,1,1,1,1,0],
  [0,1,1,1,1,1,1,1,1,1,1,1,1,1],
])

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(arr)
print(replace_runs(arr, 1, 3))

iterations = 100000

# Wall-clock timing of repeated calls on the small sample matrix.
t0 = time.time()
for i in range(0,iterations):
  replace_runs(arr, 1, 3)
t1 = time.time()

print("replace_runs: %d iterations took %.3fs" % (iterations, t1 - t0))

输出：

[[0 0 1 1 1 1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 1 0 0 0 1 1 0]
 [0 0 1 1 1 1 1 0 0 0 1 0 0 0]
 [0 1 1 0 1 1 0 1 0 0 1 1 1 0]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 0]
 [0 1 1 1 1 1 1 1 1 1 1 1 1 1]]

[[0 0 2 2 2 2 0 0 1 1 0 2 2 2]
 [0 2 2 2 0 0 1 1 0 0 0 2 2 0]
 [0 0 2 2 2 2 2 0 0 0 1 0 0 0]
 [0 1 1 0 1 1 0 1 0 0 2 2 2 0]
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 2 2 2 2 2 2 2 2 2 2 2 2 0]
 [0 2 2 2 2 2 2 2 2 2 2 2 2 2]]

replace_runs: 100000 iterations took 14.406s

- Pete Hamilton

如果可能的话，请不要再编辑您的问题，我不希望它变成社区维基（再编辑两次就会发生）。请改为留下评论；大家（包括我）都在努力为您提供答案。谢谢！ - Ali

当然，抱歉。我仍需要仔细查看答案并执行一些测试，我只是进行了编辑以尝试解决实现错误，因为人们似乎在证明给我“已修复”的版本时出现了误解和运行缓慢的情况，而我真正想要的是快速实现（尽管显然如你所指出，我的代码也没有完全做到我需要的）。我只是不想让人们走上错误的道路。 - Pete Hamilton

我的解决方案非常快速;) - Ali

6个回答

1

纯Python

你可能想测试你的代码，因为它似乎没有达到你的预期。请运行这个脚本，将你的代码与我的进行比较并检查输出：

import numpy as np

def find_first(a, index, value):
    """Return the first position at or after `index` where `a` equals `value`.

    If no such position exists the scan stops at `a.size`; if `index` is
    already at or beyond `a.size` it is returned unchanged.
    """
    limit = a.size
    pos = index
    while pos < limit:
        if a[pos] == value:
            break
        pos += 1
    return pos

def find_end(a, index, value):
    """Return the first position at or after `index` where `a` differs from `value`.

    This is the exclusive end of the run of `value` starting at `index`; the
    scan stops at `a.size` when the run reaches the end of the array.
    """
    limit = a.size
    pos = index
    while pos < limit:
        if a[pos] != value:
            break
        pos += 1
    return pos

def replace_run(a, begin, end, threshold, replace):
    """Overwrite a[begin:end] with `replace` when the run is long enough.

    `end` is exclusive, so the run holds end - begin elements. The slice is
    written in place when end - begin + 1 exceeds `threshold`, which for
    integer arguments means the run is at least `threshold` long.
    """
    long_enough = (end - begin) + 1 > threshold
    if long_enough:
        a[begin:end] = replace

def process_row(a, value, threshold, replace):
    """Walk one row in place, handing every run of `value` to replace_run().

    The row is scanned left to right: stretches that are not `value` are
    skipped with find_first(), and each run of `value` is delimited with
    find_end() before replace_run() decides whether to overwrite it.
    """
    pos = 0
    while pos < a.size:
        if a[pos] != value:
            # Jump to the start of the next run (or to the end of the row).
            pos = find_first(a, pos, value)
            continue
        stop = find_end(a, pos, value)
        replace_run(a, pos, stop, threshold, replace)
        pos = stop

def replace_py(a, value, length, replace):
    """Return a copy of `a` with each row processed by process_row().

    The copy is modified row by row in place; `a` itself is not changed.
    """
    result = a.copy()
    row_count = len(result)
    for r in range(row_count):
        process_row(result[r], value, length, replace)
    return result

################################################################################
# Your code as posted in the question:

def replace_runs(a, search, run_length, replace = 2):
  """Return a copy of `a` in which collected runs of `search` become `replace`.

  This is the question's implementation, reproduced unchanged so that its
  output can be compared against replace_py(). Besides runs of at least
  `run_length` elements, it also collects a run of any length that ends at
  the last column or that is closed by a non-matching value in the last column.
  """
  a_copy = a.copy() # Don't modify original
  for i, row in enumerate(a):
    runs = []          # column-index lists of the runs selected for replacement
    current_run = []   # column indices of the run being scanned
    for j, val in enumerate(row):
      if val == search:
        current_run.append(j)
      else:
        # A non-matching value closes the run; it is kept if long enough or
        # if this non-matching value is in the last column.
        if len(current_run) >= run_length or j == len(row) -1:
          runs.append(current_run)
        current_run = []

    # After a non-empty row j is the last column index, so the run still open
    # at the end of the row is always kept.
    if len(current_run) >= run_length or j == len(row) -1:
      runs.append(current_run)

    for run in runs:
      for col in run:
        a_copy[i][col] = replace

  return a_copy

# End of your code
################################################################################

def print_mismatch(a, b):
    """Print where two arrays agree, first elementwise and then per row.

    `a == b` is printed as a boolean matrix, followed by one line per row
    holding the row index and whether every element in that row matched.
    Uses single-argument print(...) calls so it runs on Python 2 and 3.
    """
    print('Elementwise equals')
    mat_equals = a == b
    print(mat_equals)
    print('Reduced to rows')
    # AND-reduce along axis 1: True only for rows that match completely.
    for i, outcome in enumerate(np.logical_and.reduce(mat_equals, 1)):
        print('%d %s' % (i, outcome))

if __name__=='__main__':
    # Fixed seed so the random binary matrix is reproducible.
    np.random.seed(31)
    shape = (20, 10)
    # binomial(..., size=shape) already yields an array of the requested
    # shape, so no reshape is needed afterwards.
    mat = np.asarray(np.random.binomial(1, p=0.5, size=shape), dtype=np.int32)
    runs = replace_runs(mat, 1, 3, 2)
    py = replace_py(mat, 1, 3, 2)

    # Single-argument print(...) runs unchanged on Python 2 and Python 3.
    print('Original')
    print(mat)
    print('replace_runs()')
    print(runs)
    print('replace_py()')
    print(py)

    print('Mismatch between replace_runs() and replace_py()')
    print_mismatch(runs, py)

在你的代码修复之前，基准测试没有意义。因此，我将使用我的replace_py()函数进行基准测试。

我认为replace_py()实现了你想要的功能，但它不够Pythonic，存在许多反模式。尽管如此，它看起来是正确的。

时间：

# Benchmark input: a seeded random binary matrix with 100000 rows of width 10.
np.random.seed(31)
shape = (100000, 10)
mat = np.asarray(a=np.random.binomial(1, p=0.5, size=shape), dtype=np.int32)
# NOTE(review): the reshape result is not assigned, so this line leaves mat as is.
mat.reshape(shape)
%timeit replace_py(mat, 1, 3, 2)
1 loops, best of 3: 9.49 s per loop

Cython

我认为你的问题不容易重写成使用Numpy和矢量化技术的形式。也许一位Numpy专家可以做到，但我担心代码会变得非常晦涩或缓慢（或两者兼而有之）。引用Numpy开发人员之一的话：

[...] 当需要使用NumPy-ology博士学位才能将解决方案向量化，或者结果会导致太多的内存开销时，您可以使用Cython[...]。

因此，我使用类型化内存视图在Cython中重新编写了replace_py()及其调用的函数：

# cython: infer_types=True
# cython: boundscheck=False
# cython: wraparound=False
import numpy as np
cimport numpy as np

# Advance `index` until a[index] equals `value` or `index` reaches `n`, and
# return the resulting position (`n` is the row length passed by the caller).
cdef inline int find_first(int[:] a, int index, int n, int value) nogil:
    while index<n and a[index]!=value:
        index += 1
    return index

# Advance `index` while a[index] equals `value` and `index` is below `n`;
# the returned position is the exclusive end of the run starting at `index`.
cdef inline int find_end(int[:] a, int index, int n, int value) nogil:
    while index<n and a[index]==value:
        index += 1
    return index

# Overwrite a[begin:end] with `replace` when the run is long enough.
# `end` is exclusive, so the run holds end-begin elements and the test below
# is the integer equivalent of end-begin >= threshold.
cdef inline void replace_run(int[:] a, int begin, int end, int threshold, int replace) nogil:
    if end-begin+1 > threshold:
        # Explicit element loop instead of a slice assignment.
        for i in xrange(begin, end):
            a[i] = replace

# Scan one row left to right, in place: skip stretches that are not `value`
# and hand each run of `value` to replace_run().
cdef inline void process_row(int[:] a, int value, int threshold, int replace) nogil:
    cdef int first, end, n
    first = 0
    # Row length, read once and passed to the helpers.
    n = a.shape[0]
    while first < n:
        if a[first]==value:
            # [first, end) is the run of `value` starting here.
            end = find_end(a, first, n, value)
            replace_run(a, first, end, threshold, replace)
            first = end
        else:
            # Jump to the start of the next run (or to n if there is none).
            first = find_first(a, first, n, value)

# Python-visible entry point: copies the 2D int32 array `a`, processes every
# row of the copy with process_row(), and returns the copy.
def replace_cy(np.ndarray[np.int32_t, ndim=2] a, int value, int length, int replace):
    # Typed memoryview over the copy, declared C-contiguous (`::1` on the last axis).
    cdef int[:, ::1] vmat
    cdef int i, n
    mat = a.copy()
    # vmat views mat's buffer, so writes through vmat modify mat.
    vmat = mat
    n = vmat.shape[0]
    for i in xrange(n):
        process_row(vmat[i], value, length, replace)
    return mat

需要进行一些调整，代码比上面给出的Python代码更加混乱。但是这不需要太多的工作，而且非常直接。

时间：

# Benchmark input: a seeded random binary matrix with 100000 rows of width 10.
np.random.seed(31)
shape = (100000, 10)
mat = np.asarray(a=np.random.binomial(1, p=0.5, size=shape), dtype=np.int32)
# NOTE(review): the reshape result is not assigned, so this line leaves mat as is.
mat.reshape(shape)
%timeit replace_cy(mat, 1, 3, 2)
100 loops, best of 3: 8.16 ms per loop

这是一个1163倍的加速！

Numba

我在 Github 上得到了帮助，现在Numba版本也可以工作了；我只是在纯 Python 代码中添加了@autojit，除了a[begin:end] = replace之外，参见我在 Github 上得到的讨论。

import numpy as np
from numba import autojit

# NOTE(review): `autojit` comes from an old Numba API; newer releases are
# believed to provide `jit` instead - confirm against the installed version.
@autojit
def find_first(a, index, value):
    """Return the first position at or after `index` where `a` equals `value`.

    The scan stops at `a.size` when no such position exists.
    """
    while index<a.size and a[index]!=value:
        index += 1
    return index

@autojit
def find_end(a, index, value):
    """Return the exclusive end of the run of `value` that starts at `index`.

    The scan stops at `a.size` when the run reaches the end of the array.
    """
    while index<a.size and a[index]==value:
        index += 1
    return index

@autojit
def replace_run(a, begin, end, threshold, replace):
    """Overwrite a[begin:end] with `replace` when the run is long enough.

    `end` is exclusive, so for integer arguments the test below means
    end - begin >= threshold.
    """
    if end-begin+1 > threshold:
        # Explicit element loop rather than a slice assignment.
        for i in xrange(begin, end):
            a[i] = replace

@autojit
def process_row(a, value, threshold, replace):
    """Scan one row in place and pass every run of `value` to replace_run()."""
    first = 0
    while first < a.size:
        if a[first]==value:
            # [first, end) is the run of `value` starting here.
            end = find_end(a, first, value)
            replace_run(a, first, end, threshold, replace)
            first = end
        else:
            # Jump to the start of the next run (or to a.size if none).
            first = find_first(a, first, value)

@autojit
def replace_numba(a, value, length, replace):
    """Return a copy of `a` with every row processed by process_row().

    The copy is modified in place row by row; `a` itself is not changed.
    """
    mat = a.copy()
    for row in mat:
        process_row(row, value, length, replace)
    return mat

时间（使用上述常规输入，代码已省略）：

1 loops, best of 3: 86.5 ms per loop

这与纯Python代码相比是一个110倍的加速！！！Numba版本仍然比Cython慢10倍，这很可能是由于没有内联小函数，但我认为这基本上是免费获得这种加速，而不会弄乱我们的Python代码！

- Ali

1

我将考虑输入为一维数组，因为这适用于二维数组。

在二进制中，你可以通过使用 & 符号来检查两个项是否都为 1。在 numpy 中，你可以通过切片有效地“移动”数组。因此，创建第二个数组，在你想要取消设置（或更改为 2）的所有位置上都有一个 1。然后使用 ^ 或 + 将其与原始数组结合起来，具体取决于你是想将它们变成零还是两个：

def unset_ones(a, n):
    """Return a copy of the 1-D binary array `a` with runs of >= n ones cleared.

    Works by AND-ing `n` shifted views of `a` to find every window of `n`
    consecutive ones, then OR-ing that result back over the `n` positions each
    window covers, and finally XOR-ing the mask into a copy of the data.

    Parameters
    ----------
    a : 1-D integer or boolean ndarray holding only 0/1 values.
    n : minimum run length to clear.

    Returns
    -------
    A new array; `a` is not modified. If `n` < 1 or `a` is shorter than `n`
    there is nothing to clear and an unchanged copy is returned.
    """
    b = a.copy()
    # Number of length-n windows, including the one that ends at the last
    # element (start index a.size - n).
    windows = a.size - n + 1
    if n < 1 or windows < 1:
        return b

    # match[k] is 1 exactly when a[k:k+n] is all ones.
    match = a[:windows].copy()
    for i in range(1, n):
        match &= a[i:i + windows]

    # Spread every window start over the n positions it covers.
    matchall = np.zeros_like(a)
    for i in range(n):
        matchall[i:i + windows] |= match

    b ^= matchall  # xor into the copy; replace by + to make 2s
    return b

例子：

>>> unset_ones(np.array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0]), 3)
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])

- otus

这听起来很有趣，你能举个用法的例子吗？当我尝试使用时，numpy 出现了一些形状广播错误。对于 &= 操作符 (我认为是这个操作符)，也许索引没有完全正确。我也会进行调查。 - Pete Hamilton

1

@PeterHamilton，是的，我有一些愚蠢的错误，现在它实际上可以工作了。我认为这对于短数组来说不会很快，但对于大数组来说可能会比较快。 - otus

这似乎在一些边缘情况下会失败，比如当一行以[...，1，1，1]结尾时。否则看起来很有前途，将进行测试和基准测试。 - Pete Hamilton

1

首先，你的代码无法正常工作...它会将第二行末尾仅有两个1的簇替换为2。话虽如此，以下代码可以实现你描述的功能：

def replace_runs_bis(arr, search=1, n=3, val=2):
    """Return a copy of 2-D `arr` with runs of >= n `search` values set to `val`.

    A run is a stretch of consecutive `search` values within one row. Windows
    of `n` columns are counted with a cumulative sum (a fast convolution with
    an all-ones kernel); every window made up entirely of `search` is then
    expanded back to the columns it covers.

    Raises ValueError if `n` is smaller than 1. If `n` exceeds the number of
    columns no run can qualify and an unchanged copy is returned.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    ret = np.array(arr) # this makes a copy by default
    rows, cols = ret.shape
    width = cols - n + 1  # number of n-column windows per row
    if width < 1:
        return ret
    # Fast convolution with an all 1's kernel
    arr_cum = np.cumsum(ret == search, axis=1)
    arr_win = np.empty((rows, width), dtype=np.intp)
    arr_win[:, 0] = arr_cum[:, n-1]
    arr_win[:, 1:] = arr_cum[:, n:] - arr_cum[:, :-n]
    mask_win = arr_win >= n
    # mask_win is True for n item windows all full of searchs, expand to pixels
    # (the builtin bool is used because the np.bool alias no longer exists)
    mask = np.zeros(ret.shape, dtype=bool)
    for j in range(n):
        mask[:, j:j + width] |= mask_win
    #replace values
    ret[mask] = val

    return ret

对于您的示例数组，它快约2倍，但我猜对于更大的数组，只要保持n小，速度会更快。

In [23]: %timeit replace_runs(arr, 1, 3)
10000 loops, best of 3: 163 µs per loop

In [24]: %timeit replace_runs_bis(arr, 1, 3)
10000 loops, best of 3: 80.9 µs per loop

- Jaime

0

这比 OP 稍微快一点，但仍然有些 hacky：

def replace2(originalM, run_length=3, replace=2):
    """Return a copy of `originalM` with long runs of non-zero values replaced.

    Each row is split into runs of equal consecutive values with
    itertools.groupby. A run whose value is truthy (non-zero) and whose length
    is at least `run_length` is overwritten with `replace`.

    The defaults (run_length=3, replace=2) reproduce the original hard-coded
    behaviour. `originalM` itself is not modified.
    """
    # Imported locally so the function works without a module-level import.
    from itertools import groupby

    m = originalM.copy()
    for row in m:
        start = 0  # column where the current run begins
        for key, group in groupby(row):
            length = sum(1 for _ in group)
            if key and length >= run_length:
                row[start:start + length] = replace
            start += length
    return m

%%timeit
replace_runs(arr, 1, 3)
10000 loops, best of 3: 61.8 µs per loop

%%timeit
replace2(arr)
10000 loops, best of 3: 48 µs per loop

- usual me

0

toine的卷积方法也是一个不错的选择。基于这些答案，你可以使用 groupby 来得到你想要的结果。

from itertools import groupby, repeat, chain
run_length = 3
new_value = 2
search_value = 1
# Collapse the row into (value, run length) pairs of successive repetitions
grouped = [(k, sum(1 for _ in v)) for k, v in groupby(arr[0])]
# [(0, 2), (1, 4), (0, 2), (1, 2), (0, 1), (1, 3)]
# Expand the pairs again. Only runs of search_value that are long enough get
# new_value; long runs of any other value (e.g. zeros) are kept as they are.
output = list(chain.from_iterable(
    repeat(new_value if (k == search_value and v >= run_length) else k, v)
    for k, v in grouped))
# [0, 0, 2, 2, 2, 2, 0, 0, 1, 1, 0, 2, 2, 2]

你只需要对 arr 中的每一行进行操作。如果你想要更高效，可以根据自己的需求进行调整（例如删除列表创建）。

使用我在链接答案中提供的 Paul 的示例，你可以按照以下方式进行操作：

import numpy as np
new_value = 2
run_length = 3
# First differences of the row, padded at both ends with non-zero values so
# that the row start and the row end also count as run boundaries
diff = np.concatenate(([2], np.diff(arr[0]), [-1]))
# Positions where the value changes, i.e. where each run begins (the last
# entry is the row length)
idx_diff = np.flatnonzero(diff)
# Keep the runs that span at least run_length columns and whose value is 1
idx = np.flatnonzero((np.diff(idx_diff) >= run_length) & arr[0][idx_diff[:-1]])
# Overwrite every selected run, in place, with the new value
for i in idx:
    arr[0][idx_diff[i]:idx_diff[i + 1]] = new_value

这只是一个思路。使用这种方法，可以在一次运行中完成整个矩阵，并直接修改数组，这应该是有效的。对于这个想法的原始状态，我感到很抱歉。希望它能给你启示。一个好的加速提示是删除for循环。

当然，前提是你愿意为了速度而牺牲清晰度。在我看来，在Python中，当你想快速地对想法进行原型设计时，很少会需要这样做。如果你有一个已经被证明正确且必须快速的算法，请用C（或Cython）编写它，并在你的Python程序中使用它（使用ctypes或CFFI）。

- Soravux

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- toine · Accepted Answer

使用卷积进行模式匹配:

def replace_runs(a, N, replace = 2):
    """Return a copy of 2-D binary `a` with runs of >= N ones set to `replace`.

    Every row is convolved with an all-ones kernel of length N in 'valid'
    mode, so conv[k] is the sum of row[k:k+N]. A sum equal to N marks a window
    consisting only of ones; all N columns of each such window are replaced.
    This works for any N >= 1 and includes windows touching either row edge.

    Assumes `a` holds only 0/1 values. Raises ValueError if N < 1. If N is
    larger than the row length nothing can match and a plain copy is returned.
    """
    if N < 1:
        raise ValueError("N must be at least 1")
    a_copy = a.copy()
    M = a_copy.shape[1]
    if N > M:
        return a_copy
    pattern = np.ones(N, dtype=int)
    offsets = np.arange(N)

    # Read from the untouched input, write to the copy.
    for i, row in enumerate(a):
        conv = np.convolve(row, pattern, mode='valid')
        starts = np.flatnonzero(conv == N)
        # Columns covered by every matching window: start, start+1, ..., start+N-1.
        cols = (starts[:, None] + offsets).ravel()
        a_copy[i, cols] = replace
    return a_copy

新的 replace_runs 比原版慢了3倍，但可以检测到一些特殊情况（例如所提出的基于字符串的方法）。

在我的机器上：

原版 replace_runs：100000 次迭代花费了 12.792 秒。

变体 replace_runs：100000 次迭代花费了 33.112 秒。