In [13]:
# Multiple linear regression on the Boston housing dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell presumably targets an older release -- confirm the pinned version.

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the dataset.
boston = load_boston()
boston.keys()
# The string below is a pasted copy of the value returned by the call above.
'''
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
'''

boston.feature_names
# Pasted copy of the feature-name array, kept for reference.
'''
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
'''

# Split into training and test sets: 30% of the rows are held out,
# and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    boston['data'],
    boston['target'],
    test_size=0.3,
    random_state=0,
)

# Create the model and train it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
lr = LinearRegression().fit(X_train, y_train)

'''
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
'''

# Predict on the held-out rows; the bare name on the last line is the cell's output.
y_predicted = lr.predict(X_test)
y_predicted
Out[13]:
array([24.9357079 , 23.75163164, 29.32638296, 11.97534566, 21.37272478,
       19.19148525, 20.5717479 , 21.21154015, 19.04572003, 20.35463238,
        5.44119126, 16.93688709, 17.15482272,  5.3928209 , 40.20270696,
       32.31327348, 22.46213268, 36.50124666, 31.03737014, 23.17124551,
       24.74815321, 24.49939403, 20.6595791 , 30.4547583 , 22.32487164,
       10.18932894, 17.44286422, 18.26103077, 35.63299326, 20.81960303,
       18.27218007, 17.72047628, 19.33772473, 23.62254823, 28.97766856,
       19.45036239, 11.13170639, 24.81843595, 18.05294835, 15.59712226,
       26.21043403, 20.81140432, 22.17349382, 15.48367365, 22.62261604,
       24.88561528, 19.74754478, 23.0465628 ,  9.84579105, 24.36378793,
       21.47849008, 17.62118176, 24.39160873, 29.95102691, 13.57219422,
       21.53645439, 20.53306273, 15.03433182, 14.3232289 , 22.11929299,
       17.07321915, 21.54141094, 32.96766968, 31.371599  , 17.7860591 ,
       32.75069556, 18.74795323, 19.21428022, 19.41970047, 23.08087809,
       22.87732816, 24.06399098, 30.52824406, 28.71453508, 25.90763165,
        5.17596718, 36.8709072 , 23.76983849, 27.26064379, 19.25849042,
       28.41860517, 19.3008798 , 18.94922353, 38.00154059, 39.44096748,
       23.72297885, 24.83722534, 16.52015743, 25.9970546 , 16.73997072,
       15.48656983, 13.52825536, 24.12884363, 30.76919578, 22.18731163,
       19.8848644 ,  0.42275479, 24.86785849, 16.05692   , 17.42486412,
       25.49798527, 22.35171315, 32.66562689, 22.04428746, 27.29799885,
       23.20302026,  6.86196574, 14.869251  , 22.31804948, 29.18125768,
       33.22568234, 13.24392523, 19.67195771, 20.7502616 , 12.02271319,
       23.50067006,  5.55662571, 19.87634689,  9.27059783, 44.81787339,
       30.56017983, 12.44394048, 17.33192202, 21.48313292, 23.52664913,
       20.49877266, 35.09161099, 13.22639935, 20.70321163, 35.35582833,
       19.45050576, 13.81603561, 14.15654562, 23.03678503, 15.07521258,
       30.9662041 , 25.23236632, 15.43763716, 24.06406534,  9.93080346,
       15.01618901, 21.06098873, 32.87115732, 27.80927747, 25.91293794,
       15.27877362, 30.97489404, 27.81107682, 14.5068157 ,  7.57369946,
       28.3348068 , 25.04341153])
In [29]:
# 可視化する:正解データと予測データを比較する
%matplotlib inline
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_predicted, y_test)
ax.plot((0,50), (0,50), linestyle='dashed', color='red')
ax.set_xlabel('predicted value')
ax.set_ylabel('actual value')
plt.show()

'''
結果:正解データと予測データで
   強い正の相関を確認できた
'''