我会通过设置可能的中心点来处理这个问题,例如你的海岸线。
我认为这与 Nathaniel Saul 的第一条评论的思路很接近。
这样,在每次迭代中,不再取聚类的平均值作为中心,而是从候选点集合中选出最靠近该聚类的点。
我已将条件简化为仅包含 2 个数据列(经度和纬度),但您应该能够推广这个概念。为了便于演示,我的代码是基于此处的代码编写的。
在这个例子中,紫色的点是沿海的地方。如果我理解正确,最佳的海岸线位置应该看起来像这样:
![tooltip Coastline Optimum](https://raw.githubusercontent.com/Alex-Chervony/kmeans/master/Figure_1.png)
请看下方代码:
import matplotlib.pyplot as plt
import numpy as np
import random
def possible_points(n=20):
    """Generate candidate center locations along a jagged, roughly vertical line.

    The second coordinate of the candidates is evenly spaced over [-1, 1]
    (``n`` values).  The first coordinate starts at -1.2 and each following
    value is the previous one plus a uniform random step in [-2/n, 2/n].

    Args:
        n: Number of evenly spaced values to generate.

    Returns:
        A NumPy array holding one ``[x, y]`` pair per candidate.
    """
    heights = list(np.linspace(-1, 1, n))

    # Random walk for the horizontal coordinate, seeded at -1.2.
    offsets = [-1.2]
    for _ in range(1, n):
        offsets.append(offsets[-1] + random.uniform(-2 / n, 2 / n))

    return np.array([np.array([a, b]) for a, b in zip(offsets, heights)])
def init_board_gauss(N, k):
    """Sample up to ``N`` 2-D points drawn from ``k`` random Gaussian blobs.

    For each blob a center is drawn uniformly from [-1, 1] x [-1, 1] and a
    standard deviation uniformly from [0.05, 0.5].  Normal samples are drawn
    around that center and kept only when both coordinates lie strictly
    inside (-1, 1); sampling continues until the blob holds at least
    ``N / k`` points.

    Args:
        N: Total number of points to return.
        k: Number of blobs.

    Returns:
        A NumPy array of the accepted points, truncated to the first ``N``.
    """
    per_blob = float(N) / k
    samples = []
    for _ in range(k):
        center = (random.uniform(-1, 1), random.uniform(-1, 1))
        spread = random.uniform(0.05, 0.5)

        accepted = []
        while len(accepted) < per_blob:
            a = np.random.normal(center[0], spread)
            b = np.random.normal(center[1], spread)
            # Rejection step: discard draws that fall outside the unit board.
            if abs(a) < 1 and abs(b) < 1:
                accepted.append([a, b])
        samples.extend(accepted)

    return np.array(samples)[:N]
def cluster_points(X, mu):
    """Group every point of ``X`` under the index of its nearest center.

    Distance is the Euclidean norm of ``x - mu[j]``.  When two centers are
    equally close, the one with the lower index wins.

    Args:
        X: Iterable of points.
        mu: Sequence of centers.

    Returns:
        A dict mapping a center's index to the list of points assigned to
        it.  Centers that attract no point have no entry.
    """
    groups = {}
    for point in X:
        # min() keeps the first minimum, so ties resolve to the lowest index.
        nearest = min(
            range(len(mu)),
            key=lambda j: np.linalg.norm(point - mu[j]),
        )
        groups.setdefault(nearest, []).append(point)
    return groups
def closest_point(cluster, possiblePoints):
    """Return the candidate whose summed distance to the cluster is smallest.

    For every candidate in ``possiblePoints`` the Euclidean distances to all
    points of ``cluster`` are added up; the candidate with the lowest total
    is returned.  On ties the earliest candidate wins.

    Args:
        cluster: Iterable of points belonging to one cluster.
        possiblePoints: Indexable sequence of candidate center locations.

    Returns:
        The element of ``possiblePoints`` with the minimal total distance.
    """
    totals = [
        np.sum([np.linalg.norm(candidate - member) for member in cluster])
        for candidate in possiblePoints
    ]
    # First index holding the smallest total.
    best = min(range(len(totals)), key=totals.__getitem__)
    return possiblePoints[best]
def reevaluate_centers(clusters, possiblePoints):
    """Choose a new center for every cluster from the candidate points.

    Clusters are processed in ascending key order, and each one is passed
    to ``closest_point`` together with the candidates.

    Args:
        clusters: Dict mapping a cluster key to its list of points.
        possiblePoints: Candidate center locations.

    Returns:
        A list with one chosen candidate per cluster, ordered by sorted key.
    """
    return [
        closest_point(clusters[key], possiblePoints)
        for key in sorted(clusters)
    ]
def has_converged(mu, oldmu):
    """Tell whether two collections of centers contain the same points.

    Each center is converted to a tuple and the two collections are compared
    as sets, so ordering and duplicates are ignored.

    Args:
        mu: Current centers.
        oldmu: Centers from the previous iteration.

    Returns:
        True when both collections hold exactly the same set of points.
    """
    current = {tuple(center) for center in mu}
    previous = {tuple(center) for center in oldmu}
    return current == previous
def find_centers(X, K, possiblePoints):
    """Pick cluster centers for ``X`` restricted to the candidate points.

    Starts from ``K`` candidates drawn at random from ``possiblePoints`` and
    then alternates two steps until the set of centers stops changing:

    1. assign the points of ``X`` to the current centers (``cluster_points``);
    2. derive new centers from those clusters (``reevaluate_centers``).

    Args:
        X: Points to cluster.
        K: Number of initial centers to draw.
        possiblePoints: Candidate center locations.

    Returns:
        A ``(mu, clusters)`` tuple: the final centers and the cluster dict
        they were derived from.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``K`` exceeds the
            number of candidates.
    """
    mu = random.sample(list(possiblePoints), K)

    # Always run at least one assignment pass.  The previous version drew two
    # random starting sets and only entered its loop when they differed, so
    # an accidental match (certain when K == len(possiblePoints)) left
    # ``clusters`` unassigned and the return statement raised
    # UnboundLocalError.
    while True:
        clusters = cluster_points(X, mu)
        new_mu = reevaluate_centers(clusters, possiblePoints)
        if has_converged(new_mu, mu):
            return (new_mu, clusters)
        mu = new_mu
# Demo: cluster 30 random points into K groups whose centers are restricted
# to the candidate points, then plot candidates, centers and clusters.
K = 3
X = init_board_gauss(30, K)
possiblePoints = possible_points()
results = find_centers(X, K, possiblePoints)

# Candidate center locations, drawn as magenta dots.
candidates = np.asarray(possiblePoints)
plt.plot(candidates[:, 0], candidates[:, 1], 'm.')

# Chosen centers in green.  The marker index wraps around so that more
# centers than marker styles does not raise IndexError.
pointtypes1 = ["gx", "gD", "g*"]
for i, center in enumerate(results[0]):
    plt.plot(center[0], center[1], pointtypes1[i % len(pointtypes1)])

# Cluster members.  Iterate over the keys that actually exist: a center that
# attracted no points has no entry in the cluster dict, so indexing with
# range(len(...)) could raise KeyError.
pointtypes = ["bx", "yD", "c*"]
for i, key in enumerate(sorted(results[1])):
    members = np.asarray(results[1][key])
    plt.plot(members[:, 0], members[:, 1], pointtypes[i % len(pointtypes)])
plt.show()
编辑:已将代码修改为最小化总距离。