# クラスタリング
#  教師なし学習のタスク
# k-means 階層的クラスタ

# k-means クラスタリング
from sklearn.datasets import load_iris

# Load the iris dataset; the returned object is dict-like (it exposes keys()).
iris = load_iris()
iris.keys()

# Keys reported by the call above:
#dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

# Work with the first 100 samples only, restricted to two features
# (column 0 and column 2 of the data matrix).
feature_columns = [0, 2]
X = iris.data[:100, feature_columns]

# Scatter the raw, unlabeled points: first selected feature on the x axis,
# second selected feature on the y axis.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=X[:, 1])

# Out: <matplotlib.collections.PathCollection at 0x14b4180a0b8>

# Run the clustering.
from sklearn.cluster import KMeans

# Create the k-means estimator: 3 clusters, random centroid initialisation,
# n_init=10, and a fixed random_state so repeated runs give the same labels.
km = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    random_state=0,
)

# Fit on X and obtain the cluster index assigned to each sample.
y_km = km.fit_predict(X)
y_km
# Output recorded for the expression above:
# array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2,
#        1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2,
#        1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1])

'\narray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n       0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2,\n       1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2,\n       1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1])\n'

# Exploring the k-means labels ahead of plotting.
# ---- The plotting code below performs the same selections inline, so
#      nothing in this section is actually required.

# Boolean mask that is True wherever the predicted cluster is 0.
is_cluster_0 = y_km == 0
is_cluster_0
# array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
#         True,  True,  True,  True,  True,  True,  True,  True,  True,
#         True,  True,  True,  True,  True,  True,  True,  True,  True,
#         True,  True,  True,  True,  True,  True,  True,  True,  True,
#         True,  True,  True,  True,  True,  True,  True,  True,  True,
#         True,  True,  True,  True,  True, False, False, False, False,
#        False, False, False, False, False, False, False, False, False,
#        False, False, False, False, False, False, False, False, False,
#        False, False, False, False, False, False, False, False, False,
#        False, False, False, False, False, False, False, False, False,
#        False, False, False, False, False, False, False, False, False,
#        False])

# Rows of X whose predicted cluster is 0 (only the first five, for brevity).
X[is_cluster_0][:5]
# array([[5.1, 1.4],
#        [4.9, 1.4],
#        [4.7, 1.3],
#        [4.6, 1.5],
#        [5. , 1.4]])   (continues)

# Split the selected rows into x-axis and y-axis values.
X[is_cluster_0, 0][:5]  # array([5.1, 4.9, 4.7, 4.6, 5. ])
X[is_cluster_0, 1][:5]  # array([1.4, 1.4, 1.3, 1.5, 1.4])

# Out: array([1.4, 1.4, 1.3, 1.5, 1.4])

# Visualize the k-means result: one scatter series per predicted cluster,
# plus a red cross at the mean of each cluster's points.
import numpy as np
fig, ax = plt.subplots()

# Plot clusters 0, 1 and 2, each with its own marker shape.
for cluster_id, marker in enumerate(('s', 'o', '^')):
    in_cluster = y_km == cluster_id
    ax.scatter(X[in_cluster, 0], X[in_cluster, 1], s=50, edgecolor='black',
               marker=marker, label='cluster_{}'.format(cluster_id))

# X holds columns 0 and 2 of the iris data matrix, i.e. sepal length and
# petal length, so the axes are labelled with those features.
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Petal Length')
# Automatic 'best' placement is only implemented for Axes legends, so the
# legend is attached to the axes rather than to the figure.
ax.legend(loc='best')

# Mark each cluster's centre (mean of its member points) with a red cross.
for cluster_id in range(3):
    in_cluster = y_km == cluster_id
    ax.plot(np.mean(X[in_cluster, 0]), np.mean(X[in_cluster, 1]),
            marker='x', markersize=20, color='red')

plt.show()

# Hierarchical clustering (agglomerative / divisive)

# Create an agglomerative hierarchical clustering estimator.
from sklearn.cluster import AgglomerativeClustering
# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed to `metric`), whereas omitting it works everywhere.
ac = AgglomerativeClustering(n_clusters=3, linkage='complete')
# repr recorded with the scikit-learn version originally used:
# AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
#             connectivity=None, linkage='complete', memory=None,
#             n_clusters=3, pooling_func='deprecated')

# Fit on X and obtain the cluster label of each sample.
labels = ac.fit_predict(X)
labels
# Output recorded for the expression above:
# array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2,
#        0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 2, 2,
#        0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0], dtype=int64)

# Out:
# array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2,
#        0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 2, 2,
#        0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0], dtype=int64)

# Prepare a linkage matrix so the fitted hierarchy can be drawn as a dendrogram.
import numpy as np
from scipy.cluster.hierarchy import dendrogram

# Child-node pairs taken from the fitted model; one row per merge.
children = ac.children_
n_merges = children.shape[0]

# "Distance" column. NOTE: this is just the running merge index 0, 1, 2, ...
# and not a measured distance between clusters.
distance = np.arange(n_merges)

# "Number of observations" column. NOTE: likewise a plain sequence 2, 3, 4, ...
no_of_observations = np.arange(2, n_merges + 2)

# Join [child_a, child_b, distance, count] column-wise and convert to float.
distance_column = distance[:, None]
count_column = no_of_observations[:, None]
linkage_matrix = np.hstack((children, distance_column, count_column)).astype(float)
linkage_matrix[:5]
# Output recorded for the expression above:
# array([[ 0., 17.,  0.,  2.],
#        [ 1., 37.,  1.,  3.],
#        [12., 45.,  2.,  4.],
#        [ 4., 49.,  3.,  5.],
#        [ 6., 47.,  4.,  6.]])

# Out:
# array([[ 0., 17.,  0.,  2.],
#        [ 1., 37.,  1.,  3.],
#        [12., 45.,  2.,  4.],
#        [ 4., 49.,  3.,  5.],
#        [ 6., 47.,  4.,  6.]])

# Draw the dendrogram on a wide, high-resolution figure.
fig, ax = plt.subplots(figsize=(15, 3), dpi=300)
dendrogram(
    linkage_matrix,
    labels=np.arange(100),   # leaf labels 0..99, one per sample
    leaf_font_size=8,
    color_threshold=np.inf,
)
plt.show()