根据科罗拉多州罗斯福国家森林(Roosevelt National Forest)区域树木类型的观测数据 covtype.csv,实现树木类型识别任务。
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
# Initialise the centroids
def initialize_centroids(data, k):
    """Pick ``k`` rows of ``data`` at random to serve as the starting centroids.

    Args:
        data: Array-like of samples, one row per point. Lists and other
            array-likes are accepted and converted with ``np.asarray``.
        k: Number of centroids to draw.

    Returns:
        ``ndarray`` containing the ``k`` selected rows. The rows come from
        ``k`` distinct row indices of ``data``.

    Raises:
        ValueError: If ``k`` is larger than the number of rows in ``data``.
    """
    data = np.asarray(data)
    n_samples = len(data)
    if k > n_samples:
        raise ValueError(
            f"cannot draw {k} distinct centroids from {n_samples} samples"
        )
    # Sampling without replacement guarantees no row index is chosen twice.
    indices = np.random.choice(n_samples, k, replace=False)
    return data[indices]
# Assign every data point to its nearest centroid
def get_clusters(data, centroids):
    """Label each sample with the index of its closest centroid.

    Distances are Euclidean. When a sample is equally close to several
    centroids, the lowest centroid index wins (``np.argmin`` semantics).

    Args:
        data: Array-like of samples, one row per point.
        centroids: Array-like of centroids, one row per centroid, with the
            same number of columns as ``data``.

    Returns:
        1-D float ``ndarray`` of length ``len(data)`` whose ``i``-th entry is
        the index of the centroid nearest to ``data[i]``. The float dtype is
        kept for compatibility with existing callers.
    """
    data = np.asarray(data, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # One vectorised pass per centroid instead of one Python-level iteration
    # per sample: the number of centroids is tiny compared with the number
    # of samples, so this removes the dominant interpreter overhead.
    distances = np.empty((len(centroids), len(data)))
    for j, centroid in enumerate(centroids):
        distances[j] = np.linalg.norm(data - centroid, axis=1)

    return np.argmin(distances, axis=0).astype(float)
# Update the centroids
def update_centroids(data, cluster_labels, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    A cluster with no assigned samples has no mean; taking one would yield a
    NaN centroid. Such a cluster is instead reseeded with a randomly chosen
    sample from ``data``.

    Args:
        data: Array-like of samples, one row per point.
        cluster_labels: 1-D array-like giving the cluster index of each row
            of ``data``.
        k: Number of clusters.

    Returns:
        ``ndarray`` of shape ``(k, data.shape[1])`` holding the new centroids.

    Raises:
        ValueError: If a cluster is empty and ``data`` has no rows to reseed
            it from.
    """
    data = np.asarray(data, dtype=float)
    cluster_labels = np.asarray(cluster_labels)

    new_centroids = np.zeros((k, data.shape[1]))
    for i in range(k):
        cluster_points = data[cluster_labels == i]
        if len(cluster_points) == 0:
            if len(data) == 0:
                raise ValueError("cannot update centroids from an empty data set")
            # Reseed instead of averaging an empty slice (which would be NaN).
            new_centroids[i] = data[np.random.randint(len(data))]
        else:
            new_centroids[i] = cluster_points.mean(axis=0)
    return new_centroids
# K-Means clustering entry point
def k_means(data, k, T, epsilon):
    """Cluster ``data`` into ``k`` groups with Lloyd's K-Means iteration.

    The centroids are initialised with ``initialize_centroids`` and then
    refined by alternating ``update_centroids`` and ``get_clusters`` until
    either the centroids move by less than ``epsilon`` (Frobenius norm of the
    difference) or ``T`` update steps have been performed. Progress and the
    elapsed wall-clock time are printed to stdout.

    Args:
        data: Samples to cluster, one row per point.
        k: Number of clusters.
        T: Maximum number of centroid updates. With ``T <= 0`` the initial
            centroids and their assignment are returned unchanged.
        epsilon: Convergence threshold on the centroid displacement.

    Returns:
        Tuple ``(cluster_labels, centroids)``. ``cluster_labels`` is always
        the assignment of ``data`` to the returned ``centroids``.
    """
    start = time.time()  # start of the timed section

    centroids = initialize_centroids(data, k)
    # Assign once up front so that labels exist even if no update is run.
    cluster_labels = get_clusters(data, centroids)

    t = 0
    while t < T:  # strictly fewer than T completed updates so far
        new_centroids = update_centroids(data, cluster_labels, k)

        # Guard against NaN centroids (e.g. from a cluster that lost all its
        # points): keep the previous position so the NaN cannot propagate
        # into the distance computation.
        invalid = np.isnan(new_centroids).any(axis=1)
        if invalid.any():
            new_centroids[invalid] = centroids[invalid]

        shift = np.linalg.norm(new_centroids - centroids)
        centroids = new_centroids
        # Re-assign against the centroids that will actually be returned.
        cluster_labels = get_clusters(data, centroids)

        if shift < epsilon:
            break
        print("第", t, "次迭代")
        t += 1

    print("用时:{0}".format(time.time() - start))
    return cluster_labels, centroids
# Visualise the clustering result
def visualize_clusters(data_scaled, cluster_labels):
    """Draw a scatter plot of the clustered samples, one colour per cluster.

    Only the first two columns of ``data_scaled`` are plotted. Clusters that
    are separated along other columns can therefore overlap in the figure.

    Args:
        data_scaled: 2-D array of samples with at least two columns.
        cluster_labels: 1-D array with the cluster label of each row of
            ``data_scaled``.
    """
    unique_clusters = np.unique(cluster_labels)
    colors = plt.cm.jet(np.linspace(0, 1, len(unique_clusters)))
    for i, cluster in enumerate(unique_clusters):
        cluster_data = data_scaled[cluster_labels == cluster]
        # Pass the colour as a single-row 2-D array: a bare RGBA sequence is
        # interpreted as per-point values to colormap whenever the cluster
        # happens to contain exactly four points.
        plt.scatter(
            cluster_data[:, 0],
            cluster_data[:, 1],
            c=colors[i].reshape(1, -1),
            label=f'Cluster {i+1}',
        )
    plt.title('K-Means Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.grid(True)
    plt.show()
# Main program
if __name__ == "__main__":
    # Clustering hyper-parameters
    k = 7  # number of clusters
    T = 10  # maximum number of iterations
    epsilon = 1e-5  # convergence threshold

    # Load the data set
    data_path = r'D:/360zip/机器学习实验课材料/covtype.csv'  # replace with your actual file path
    data = pd.read_csv(data_path)

    # Preprocessing: rescale the features (every column of the CSV is passed
    # to the scaler)
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)

    # Run K-Means clustering
    cluster_labels, centroids = k_means(data_scaled, k=k, T=T, epsilon=epsilon)

    # Visualise the clustering result
    visualize_clusters(data_scaled, cluster_labels=cluster_labels)
聚类之后,发现可视化的结果并不是自己想要的:我希望图中每种颜色(每个簇)都各自占据一块区域,但实际运行结果并非如此。
这个任务到底应该用聚类还是分类?
如果数据中已经包含各类树木名称(即类别标签)这一维度,就应该使用分类(有监督学习);只有在没有类别标签时才使用聚类。