我曾经遇到一个问题,需要展示数百个传感器每分钟采集的几年时间序列数据。有时候(比如在清理数据时),我希望看到所有的异常值,而有时候我更关心趋势。因此,我编写了一个函数,可以使用两种方法来减少数据点的数量:Visvalingam和Douglas-Peucker。前者倾向于去除异常值,后者则保留它们。我已经优化了这个函数,使其能够处理大型数据集。
在意识到所有的绘图方法都无法处理这么多的数据点,并且那些能够处理的方法会以一种我无法控制的方式减少数据集之后,我才开始编写这个函数。该函数如下:
function [X, Y, indices, relevance] = lineSimplificationI(X,Y,N,method,option)
if nargin < 3
error('Line points positions X, Y and target point count N MUST be specified');
end
if nargin < 4
method='DouglasPeucker';
end
if nargin < 5
option='silent';
end
doDisplay=strcmp(option,'verbose');
X=double(X(:));
Y=double(Y(:));
indices=1:length(Y);
if length(X)~=length(Y)
error('Vectors X and Y MUST have the same number of elements');
end
if N>=length(Y)
relevance=ones(length(Y),1);
if doDisplay
disp('N is greater or equal than the number of points in the line. Original X,Y were returned. Relevances were not computed.')
end
return
end
repeatedNaNs= isnan(Y(2:end)) & isnan(Y(1:end-1));
repeatedNaNs=[isnan(Y(1)); repeatedNaNs(:)];
Y=Y(~repeatedNaNs);
X=X(~repeatedNaNs);
indices=indices(~repeatedNaNs);
if isnan(Y(end))
Y=Y(1:end-1);
X=X(1:end-1);
indices=indices(1:end-1);
end
pCount=length(X);
if doDisplay
disp(['Initial point count = ' num2str(pCount)])
disp(['Non repeated NaN count in data = ' num2str(sum(isnan(Y)))])
end
iterCount=0;
while pCount>N
iterCount=iterCount+1;
twiceTriangleArea =abs((X(1:end-2).*(Y(2:end-1)-Y(3:end))+X(2:end-1).*(Y(3:end)-Y(1:end-2))+X(3:end).*(Y(1:end-2)-Y(2:end-1))));
switch method
case 'Visvalingam'
relevance=twiceTriangleArea/2;
case 'DouglasPeucker'
neighborDistances=ppDistance([X(1:end-2) Y(1:end-2)],[X(3:end) Y(3:end)]);
relevance=twiceTriangleArea./neighborDistances;
otherwise
error(['Unknown method: ' method]);
end
relevance=[Inf; relevance; Inf];
[srelevance, sortorder]= sort(relevance,'descend');
firstFinite=find(isfinite(srelevance),1,'first');
startPos=uint32(firstFinite+N+1);
toRemove=sort(sortorder(startPos:end));
if isempty(toRemove)
break;
end
contiguousToKeep=[diff(toRemove(:))==1; false] | [false; (toRemove(1:end-1)-toRemove(2:end))==-1];
notContiguous=~contiguousToKeep;
contRel=relevance(toRemove);
nContiguous=length(contiguousToKeep);
for paddingMode=1:2
if paddingMode==1
if mod(nContiguous,2)
pcontiguous=[contiguousToKeep; false];
pcontRel=[contRel; -Inf];
else
pcontiguous=contiguousToKeep;
pcontRel=contRel;
end
else
if mod(nContiguous,2)
pcontiguous=[false; contiguousToKeep];
pcontRel=[-Inf; contRel];
else
pcontiguous=[false; contiguousToKeep(1:end-1)];
pcontRel=[-Inf; contRel(1:end-1)];
end
end
contiguousPairs=reshape(pcontiguous,2,[]);
pcontRel=reshape(pcontRel,2,[]);
contCols=all(contiguousPairs);
if ~any(contCols) && paddingMode==2
break;
end
[~, lesserElementRow]=max(pcontRel);
if paddingMode==1
firstElementIdx=((1:size(contiguousPairs,2))*2)-1;
else
firstElementIdx=((1:size(contiguousPairs,2))*2)-2;
end
lesserElementIdx=firstElementIdx+lesserElementRow-1;
contiguousToKeep(lesserElementIdx(contCols))=false;
end
toRemove=toRemove(contiguousToKeep | notContiguous);
if any(diff(toRemove(:))==1) && doDisplay
warning([num2str(sum(diff(toRemove(:))==1)) ' continous elements removed in one iteration.'])
end
toRemoveLogical=false(pCount,1);
toRemoveLogical(toRemove)=true;
X=X(~toRemoveLogical);
Y=Y(~toRemoveLogical);
indices=indices(~toRemoveLogical);
pCount=length(X);
nRemoved=sum(toRemoveLogical);
if doDisplay
disp(['Iteration ' num2str(iterCount) ', Point count = ' num2str(pCount) ' (' num2str(nRemoved) ' removed)'])
end
if nRemoved==0
break;
end
end
end
function d = ppDistance(p1,p2)
d=sqrt((p1(:,1)-p2(:,1)).^2+(p1(:,2)-p2(:,2)).^2);
end