我正在使用两个不同的标准差阈值(std1 和 std2)对 pandas Series 对象进行两轮异常值检测(去尖峰)。但是,我为此使用了两个 for 循环,速度非常慢。我想知道是否有任何 pandas 的“技巧”可以加快此步骤。
这是我使用的代码(警告:代码非常丑陋!):
def find_outlier(point, window, n):
    """Return whether *point* lies at least *n* standard deviations from the mean of *window*.

    NaN entries in *window* are ignored by ``nanmean``/``nanstd``.

    NOTE(review): ``nanmean`` and ``nanstd`` are expected to be imported at
    file level; the import is not visible in this excerpt, so confirm which
    library (and which ``ddof`` convention) they come from.

    Args:
        point: Value (or array of values) to test.
        window: Array of neighbouring samples used as the reference.
        n: Threshold, expressed as a multiple of the window's standard
            deviation.

    Returns:
        A boolean (or boolean array when *point* is an array) that is true
        where the point is an outlier.
    """
    deviation = np.abs(point - nanmean(window))
    # A point sitting exactly on the mean is never an outlier. Without this
    # guard a zero-variance window gives ``0 >= n * 0`` and flags the point.
    return (deviation >= n * nanstd(window)) & (deviation > 0)
def _despike_pass(values, n_std, block):
    """Run one sequential despiking pass over the float array *values*, in place.

    Points are visited in order, and a point replaced by NaN is excluded from
    the windows of every point visited after it.
    """
    size = len(values)
    for k, point in enumerate(values):
        if k <= block:
            # Near the start: forward-looking window that includes the point.
            window = values[k:k + block]
        elif k >= size - block:
            # Near the end: backward-looking window that excludes the point.
            window = values[k - block:k]
        else:
            window = values[k - block:k + block]
        window = window[~np.isnan(window)]
        if window.size == 0:
            # Nothing to compare against; skipping also avoids the
            # RuntimeWarning that mean()/std() emit on an empty array.
            continue
        deviation = np.abs(point - window.mean())
        # ``deviation > 0`` keeps a point that equals the window mean; without
        # it a constant window (std == 0) would blank every one of its points.
        # A NaN point yields a NaN deviation, so both comparisons are False.
        if deviation > 0 and deviation >= n_std * window.std():
            values[k] = np.nan


def despike(self, std1=2, std2=20, block=100, keep=0):
    """Replace spikes in a Series with NaN using two sequential passes.

    Each pass compares every point with the mean and standard deviation of
    the non-NaN values in a window of up to ``2 * block`` neighbouring
    samples. The point is set to NaN when it deviates from the window mean
    by at least the pass threshold times the window standard deviation.
    The second pass runs on the output of the first.

    Args:
        self: The pandas Series to clean; it is not modified.
        std1: Threshold (in standard deviations) for the first pass.
        std2: Threshold (in standard deviations) for the second pass.
        block: Half-width of the window, in samples.
        keep: Currently unused; kept so existing callers keep working.

    Returns:
        A new Series with the same index and name whose spikes are NaN.
        Non-float input is converted to float so NaN can be stored.
    """
    res = np.array(self.values)  # np.array copies, so the input stays intact
    if not np.issubdtype(res.dtype, np.floating):
        # Integer (or other non-float) data cannot hold NaN.
        res = res.astype(float)
    _despike_pass(res, std1, block)
    _despike_pass(res, std2, block)
    return Series(res, index=self.index, name=self.name)