系统聚类算法又称层次聚类或系谱聚类,首先把样本看作各自一类,定义类间距离,选择距离最小的一对元素合并成一个新的类,重复计算各类之间的距离并重复上面的步骤,直到将所有原始元素分成指定数量的类。该算法的计算复杂度比较高,不适合大数据聚类问题。
from random import randrange
def generate(s, m1, m2):
'''生成形式如[('a', (1,5)), ('b', (3,6))]的随机坐标'''
x = [(ch, (randrange(m1), randrange(m1))) for ch in s]
return x
def xitongJulei(points, k=5):
'''根据欧几里得距离对points进行聚类,最终划分为k类'''
points = points[:]
while len(points)>k:
nearest = float('inf')
# 查找距离最近的两个点,进行合并
# 合并后的两个点,使用中点代替其坐标
for index1, point1 in enumerate(points[:-1]):
position1 = point1[1]
# 注意:这里的start参数很重要
for index2, point2 in enumerate(points[index1+1:], start=index1+1):
position2 = point2[1]
distance = (position1[0]-position2[0])**2 + (position1[1]-position2[1])**2
if distance < nearest:
result = (index1, index2)
nearest = distance
# 弹出距离最近的两个点,合并为一个点
p2 = points.pop(result[1])
p1 = points.pop(result[0])
p = (p1[0]+p2[0], ((p1[1][0]+p2[1][0])/2, (p1[1][1]+p2[1][1])/2))
# 使用合并后的点代替原来的两个点
points.append(p)
# 查看每步处理后的数据
print(points)
return points
# 生成随机测试数据
points = generate('abcde', 5, 5)
print('origin:'.center(20,'=')+'\n', points)
print('steps:'.center(20,'='))
# 聚类
result = xitongJulei(points, k=2)
print('result'.center(20,'='))
print(result)
附上代码截图方便理解代码的对齐关系:
某次运行结果:
======origin:=======
[('a', (1, 3)), ('b', (1, 0)), ('c', (1, 3)), ('d', (0, 0)), ('e', (4, 4))]
=======steps:=======
[('b', (1, 0)), ('d', (0, 0)), ('e', (4, 4)), ('ac', (1.0, 3.0))]
[('e', (4, 4)), ('ac', (1.0, 3.0)), ('bd', (0.5, 0.0))]
[('e', (4, 4)), ('acbd', (0.75, 1.5))]
=======result=======
[('e', (4, 4)), ('acbd', (0.75, 1.5))]