# 以下是一个小测试平台,用于查看
# scipy.spatial.cKDTree在您的数据上的速度,并对附近点之间的距离散布情况有一个大致的了解。
# 运行各种K簇的好方法是建立最近对的MST,然后删除K-1个最长的;请参见
# Wayne, Greedy Algorithms。
# 可视化聚类将很有趣--使用PCA投影到2d?
# (只是好奇,您的K是10、100还是1000?)
# 添加于12月17日:实际运行时间:100000 x 5 10秒,500000 x 5 60秒。
from __future__ import division
import random
import sys
import time
import numpy as np
from scipy.spatial import cKDTree as KDTree
__date__ = "2010-12-17 dec denis"
def clumpiness( X, nbin=10 ):
    """Report how clumpy X is via an nbin-per-dimension histogram.

    Parameters
    ----------
    X : ndarray of shape (N, dim)
        Data points.
    nbin : int
        Number of histogram bins per dimension (nbin**dim bins total).

    Returns
    -------
    (nempty, av, mx) : number of empty bins, mean bin count, max bin count.
        Also printed.  Returning the stats is backward-compatible
        (the original printed and returned None).
    """
    N, dim = X.shape
    histo = np.histogramdd( X, nbin )[0].astype(int)
    # count of completely empty bins; same as size - astype(bool).sum()
    nempty = (histo == 0).sum()
    # parenthesized single-argument print works in both Python 2 and 3
    print("clumpiness: %d of %d^%d data bins are empty av %.2g max %d" % (
        nempty, nbin, dim, histo.mean(), histo.max()))
    return nempty, histo.mean(), histo.max()
# ---- benchmark parameters; overridable from the command line, e.g.  N=10000 dim=3 ----
N = 100000
nask = 0          # how many query points; 0 -> query all N points
dim = 5
rnormal = .9      # > 0: correlated "clumpy" data, else uniform
nnear = 2         # number of nearest neighbours to report
leafsize = 10
eps = 1           # approximate NN: within (1 + eps) of the true nearest
seed = 1

# NOTE(security): exec of raw argv is convenient for a private benchmark
# but runs arbitrary code -- never expose this to untrusted input.
# Parenthesized form works in both Python 2 and 3.
exec("\n".join( sys.argv[1:] ))

np.random.seed(seed)
np.set_printoptions( 2, threshold=200, suppress=True )
nask = nask or N  # 0 means "ask about every data point"

print("\nkdtree: dim=%d N=%d nask=%d nnear=%d rnormal=%.2g leafsize=%d eps=%.2g" % (
    dim, N, nask, nnear, rnormal, leafsize, eps))

if rnormal > 0:
    # correlated gaussian folded into the unit cube -> clumpy data
    cov = rnormal * np.ones((dim,dim)) + (1 - rnormal) * np.eye(dim)
    data = np.abs( np.random.multivariate_normal( np.zeros(dim), cov, N )) % 1
else:
    data = np.random.uniform( size=(N,dim) )
clumpiness(data)

# Bug fix: the original did random.sample(data, sample) -- `sample` was an
# undefined name, so any nask != N raised NameError.  Draw nask distinct rows.
ask = data if nask == N else data[np.random.choice(N, nask, replace=False)]

t = time.time()
datatree = KDTree( data, leafsize=leafsize )
print("%.1f sec to build KDtree of %d points" % (time.time() - t, N))

t = time.time()
# k=nnear+1 because each point's nearest neighbour is itself (distance 0)
distances, ix = datatree.query( ask, k=nnear+1, eps=eps )
print("%.1f sec to query %d points" % (time.time() - t, nask))

distances = distances[:,1:]  # drop the self-distance column
avdist = distances.mean( axis=0 )
maxdist = distances.max( axis=0 )
print("distances to %d nearest: av" % nnear, avdist, "max", maxdist)