In [15]:
# chapter 8 機械学習
"""教師あり学習

予測したい変数 :目的変数 正解データ 応答変数 ターゲット変数 従属変数
予測のための変数:説明変数 特徴量 予測変数 独立変数

y = f(x) の場合 x=説明変数 y=目的変数

回帰:regression
分類:classification

重回帰 multiple linear regression
ロジスティック回帰 logistic regression
k近傍法 k-nearest neighbors
決定木 Decision Tree
サポートベクターマシン
ランダムフォレスト
勾配Boosting
"""

"""教師なし学習
主成分分析
マーケットバスケット分析
"""

"""強化学習
探索と知識利用のバランスを取るのがポイント
動的計画法
モンテカルロ法 
TD学習
"""
Out[15]:
'強化学習\n探索と知識利用のバランスを取るのがポイント\n動的計画法\nモンテカルロ法\u3000\nTD学習\n'
In [16]:
# モジュール
import numpy as np
import scipy as sp
import pandas as pd
import sklearn

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline

%precision 3
Out[16]:
'%.3f'
In [17]:
# 重回帰 multiple linear regression
import requests
import zipfile
import io

# サンプルデータをrequests.get().content (ダウンロード) する
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
res = requests.get(url).content
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
auto.columns = ['symboling','nomalalined-losses','make','fuel-type','aspiration','num-of-doors',
               'body-style','drive-sheels','enfine-location','wheel-base','length','width','height',
                'corb-weight','engine-type','num-of-cylinders','engin-size','fuel-system','bore',
                'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
auto.head(5)
Out[17]:
symboling nomalalined-losses make fuel-type aspiration num-of-doors body-style drive-sheels enfine-location wheel-base ... engin-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns

In [18]:
# 自動車の価格
auto['price'].head(5)
Out[18]:
0    13495
1    16500
2    16500
3    13950
4    17450
Name: price, dtype: object
In [19]:
# horsepower width height の3つを説明変数として自動車の価格を求めてみよう

# 欠損値の除去
data = auto[['price', 'horsepower', 'width', 'height']]
data.isin(['?']).sum()
"""
price         4
horsepower    2
width         0
height        0
dtype: int64
"""

# ?をNaNに置換して、行を削除する
data = data.replace('?', np.nan).dropna()
print(data.shape) # (199, 4)
data .head()
(199, 4)
Out[19]:
price horsepower width height
0 13495 111 64.1 48.8
1 16500 111 64.1 48.8
2 16500 154 65.5 52.4
3 13950 102 66.2 54.3
4 17450 115 66.4 54.3
In [20]:
# データ型の確認
print(data.dtypes)
"""
objectとfloat64が混ざっている
"""
price          object
horsepower     object
width         float64
height        float64
dtype: object
Out[20]:
'\nobjectとfloat64が混ざっている\n'
In [21]:
# 数値に変換する assign()
data = data.assign(price=pd.to_numeric(data['price']))
data = data.assign(horsepower=pd.to_numeric(data['horsepower']))
data.dtypes
Out[21]:
price           int64
horsepower      int64
width         float64
height        float64
dtype: object
In [22]:
# 相関の確認 corr()
data.corr()
Out[22]:
price horsepower width height
price 1.000000 0.810533 0.753871 0.134990
horsepower 0.810533 1.000000 0.615315 -0.087407
width 0.753871 0.615315 1.000000 0.309223
height 0.134990 -0.087407 0.309223 1.000000
In [23]:
"""
(1)目的変数priceは説明変数horsepower, widthと相関が強い
(2)説明変数horsepowerと説明変数widthの相関が0.6とやや強い
   多重共線性 multi-collineartyを生じる可能性がある
     →回帰係数の分散が大きくなり係数の優位性が失われる現象
       →対策:どちらかだけを代表として使用する
"""
Out[23]:
'\n(1)目的変数priceは説明変数horsepower, widthと相関が強い\n(2)説明変数horsepowerと説明変数widthの相関が0.6とやや強い\n\u3000\u3000\u3000多重共線性\u3000multi-collineartyを生じる可能性がある\n   \u3000\u3000→回帰係数の分散が大きくなり係数の優位性が失われる現象\n     \u3000\u3000→対策:どちらかだけを代表として使用する\n'
In [24]:
# sklearn重回帰モデルを使用する
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# 説明変数X モデル変数yの定義
X = data.drop('price', axis=1) # price列を落とす
y = data['price']

# データを分割する
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# 機械学習モデルのインスタンス作成と学習
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# 決定係数 score
print('score test : {} '.format(model_lr.score(X_test, y_test)))
print('score train: {} '.format(model_lr.score(X_train, y_train)))

# 回帰係数coef_と切片intercept_
print('coef_ : \n{}'.format(pd.Series(model_lr.coef_, index=X_test.columns)))
print('intercept_ : \n{}'.format(model_lr.intercept_))

"""
score testとscore train のscoreが近いことから過学習にはなっていないことがわかる
"""
score test : 0.7370688738125768 
score train: 0.7333575683901379 
coef_ : 
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
intercept_ : 
-128409.04630338572
Out[24]:
'\nscore testとscore train\u3000のscoreが近いことから過学習にはなっていないことがわかる\n'
In [25]:
# 予測 predict
y = model_lr.predict(X_test)
plt.scatter(X_test['horsepower'], y)
Out[25]:
<matplotlib.collections.PathCollection at 0x253ce104fd0>
In [26]:
# 変数増加法

# 変数減少法

# ステップワイズ法
# 変数増加法 forward selection method 最初に全変数のうち判別に寄与する割合が最大のものを取り込む。 それ以後は、すでに取り込まれた変数との共有部分を除いて、単独で判別に寄与する割合が最大の変数を取り込んでいく。 その変数を取り込んで判別関数を計算した時、判別効率の増加が最大になることを意味している。 判別効率の増加があらかじめ決めておいた値、すなわち取り込み基準未満になった時に変数の取り込みを終了する。 # 変数減少法 backward selection method 最初に全変数を取り込み、以後は取り込まれた変数のうち単独で判別に寄与している割合が最小のものを追い出していく。 これは、その変数を追い出して判別関数を計算した時、判別効率の減少が最小になることを意味している。 この際、判別効率の減少があらかじめ決めておいた値、すなわち追い出し基準以上になった時に変数の追い出しを終了する。 # ステップワイズ法 変数増減法(stepwise forward selection method) 増加法と減少法を組み合わせた手法。 まず増加法と同様に取り込み基準に従って変数を取り込む。 そして1つ変数を取り込むたびに、すでに取り込んだものの中で減少法と同様の追い出し基準を満足するものがないか調べ、 あればそれを追い出してから、さらに変数の取り込みを続ける。 この手法では取り込むべき変数も追い出すべき変数もなくなった時に変数の選択を終了する。 この手法は判別に寄与する割合が大きい変数を少数選択するという特徴を持ち、 逐次変数選択法の中で最も多用されている。 変数減増法(stepwise backward selection method) 増加法と減少法を組み合わせた手法。 変数増減法とは逆に、まず変数減少法に従って全ての変数を取り込み、 追い出し基準を満足するものを追い出す。 そして1つ変数を追い出すたびに、すでに追い出したものの中で増加法と同様の取り込み基準を満足するものがないか調べ、あればそれを取り込み、さらに変数の追い出しを続ける。 変数増減法と同じく、取り込むべき変数も追い出すべき変数もなくなった時に変数の選択を終了する。 この手法は判別に寄与する割合が少ない変数を少数除外するという特徴を持ち、 逐次変数選択法の中で効率的な変数の組み合わせが得られる可能性が最も高い。
In [27]:
# ロジスティック回帰
"""
目的変数が数値ではなく、買う買わないなどのカテゴリ分けの場合を対象とする。
カテゴリ変数という
"""
# サンプルデータをrequests.get().contentする
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
res = requests.get(url).content
In [28]:
# DataFrame化する
df = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None) # header部分も1データなので Noneにすること
df.head()
Out[28]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [29]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
              'occupation', 'relationship', 'race', 'sex', 'capital-gain',
              'capital-loss', 'hours-per-week', 
              'native-country', 'flg-50K']
df
Out[29]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country flg-50K
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K
10 37 Private 280464 Some-college 10 Married-civ-spouse Exec-managerial Husband Black Male 0 0 80 United-States >50K
11 30 State-gov 141297 Bachelors 13 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 India >50K
12 23 Private 122272 Bachelors 13 Never-married Adm-clerical Own-child White Female 0 0 30 United-States <=50K
13 32 Private 205019 Assoc-acdm 12 Never-married Sales Not-in-family Black Male 0 0 50 United-States <=50K
14 40 Private 121772 Assoc-voc 11 Married-civ-spouse Craft-repair Husband Asian-Pac-Islander Male 0 0 40 ? >50K
15 34 Private 245487 7th-8th 4 Married-civ-spouse Transport-moving Husband Amer-Indian-Eskimo Male 0 0 45 Mexico <=50K
16 25 Self-emp-not-inc 176756 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 35 United-States <=50K
17 32 Private 186824 HS-grad 9 Never-married Machine-op-inspct Unmarried White Male 0 0 40 United-States <=50K
18 38 Private 28887 11th 7 Married-civ-spouse Sales Husband White Male 0 0 50 United-States <=50K
19 43 Self-emp-not-inc 292175 Masters 14 Divorced Exec-managerial Unmarried White Female 0 0 45 United-States >50K
20 40 Private 193524 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
21 54 Private 302146 HS-grad 9 Separated Other-service Unmarried Black Female 0 0 20 United-States <=50K
22 35 Federal-gov 76845 9th 5 Married-civ-spouse Farming-fishing Husband Black Male 0 0 40 United-States <=50K
23 43 Private 117037 11th 7 Married-civ-spouse Transport-moving Husband White Male 0 2042 40 United-States <=50K
24 59 Private 109015 HS-grad 9 Divorced Tech-support Unmarried White Female 0 0 40 United-States <=50K
25 56 Local-gov 216851 Bachelors 13 Married-civ-spouse Tech-support Husband White Male 0 0 40 United-States >50K
26 19 Private 168294 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
28 39 Private 367260 HS-grad 9 Divorced Exec-managerial Not-in-family White Male 0 0 80 United-States <=50K
29 49 Private 193366 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32531 30 ? 33811 Bachelors 13 Never-married ? Not-in-family Asian-Pac-Islander Female 0 0 99 United-States <=50K
32532 34 Private 204461 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
32533 54 Private 337992 Bachelors 13 Married-civ-spouse Exec-managerial Husband Asian-Pac-Islander Male 0 0 50 Japan >50K
32534 37 Private 179137 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 39 United-States <=50K
32535 22 Private 325033 12th 8 Never-married Protective-serv Own-child Black Male 0 0 35 United-States <=50K
32536 34 Private 160216 Bachelors 13 Never-married Exec-managerial Not-in-family White Female 0 0 55 United-States >50K
32537 30 Private 345898 HS-grad 9 Never-married Craft-repair Not-in-family Black Male 0 0 46 United-States <=50K
32538 38 Private 139180 Bachelors 13 Divorced Prof-specialty Unmarried Black Female 15020 0 45 United-States >50K
32539 71 ? 287372 Doctorate 16 Married-civ-spouse ? Husband White Male 0 0 10 United-States >50K
32540 45 State-gov 252208 HS-grad 9 Separated Adm-clerical Own-child White Female 0 0 40 United-States <=50K
32541 41 ? 202822 HS-grad 9 Separated ? Not-in-family Black Female 0 0 32 United-States <=50K
32542 72 ? 129912 HS-grad 9 Married-civ-spouse ? Husband White Male 0 0 25 United-States <=50K
32543 45 Local-gov 119199 Assoc-acdm 12 Divorced Prof-specialty Unmarried White Female 0 0 48 United-States <=50K
32544 31 Private 199655 Masters 14 Divorced Other-service Not-in-family Other Female 0 0 30 United-States <=50K
32545 39 Local-gov 111499 Assoc-acdm 12 Married-civ-spouse Adm-clerical Wife White Female 0 0 20 United-States >50K
32546 37 Private 198216 Assoc-acdm 12 Divorced Tech-support Not-in-family White Female 0 0 40 United-States <=50K
32547 43 Private 260761 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 Mexico <=50K
32548 65 Self-emp-not-inc 99359 Prof-school 15 Never-married Prof-specialty Not-in-family White Male 1086 0 60 United-States <=50K
32549 43 State-gov 255835 Some-college 10 Divorced Adm-clerical Other-relative White Female 0 0 40 United-States <=50K
32550 43 Self-emp-not-inc 27242 Some-college 10 Married-civ-spouse Craft-repair Husband White Male 0 0 50 United-States <=50K
32551 32 Private 34066 10th 6 Married-civ-spouse Handlers-cleaners Husband Amer-Indian-Eskimo Male 0 0 40 United-States <=50K
32552 43 Private 84661 Assoc-voc 11 Married-civ-spouse Sales Husband White Male 0 0 45 United-States <=50K
32553 32 Private 116138 Masters 14 Never-married Tech-support Not-in-family Asian-Pac-Islander Male 0 0 11 Taiwan <=50K
32554 53 Private 321865 Masters 14 Married-civ-spouse Exec-managerial Husband White Male 0 0 40 United-States >50K
32555 22 Private 310152 Some-college 10 Never-married Protective-serv Not-in-family White Male 0 0 40 United-States <=50K
32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K
32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K
32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K
32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K

32561 rows × 15 columns

In [30]:
# 50K以上になるかどうかを予測するように学習する
In [31]:
# データの欠損を調べる

# 全体の大きさ
print('全体の大きさ', df.shape)

# 欠損値の有無
print('欠損値の数', df.isnull().sum())
全体の大きさ (32561, 15)
欠損値の数 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
flg-50K           0
dtype: int64
In [32]:
# ?の数
df.isin(['?']).sum()
"""
なぜか出てこない・・・
"""
Out[32]:
'\nなぜか出てこない・・・\n'
In [33]:
# ?をNaNに置換して、行を削除する
df = df.replace('?', np.nan).dropna()
print(df.shape) # (199, 4)
df
(32561, 15)
Out[33]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country flg-50K
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K
10 37 Private 280464 Some-college 10 Married-civ-spouse Exec-managerial Husband Black Male 0 0 80 United-States >50K
11 30 State-gov 141297 Bachelors 13 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 India >50K
12 23 Private 122272 Bachelors 13 Never-married Adm-clerical Own-child White Female 0 0 30 United-States <=50K
13 32 Private 205019 Assoc-acdm 12 Never-married Sales Not-in-family Black Male 0 0 50 United-States <=50K
14 40 Private 121772 Assoc-voc 11 Married-civ-spouse Craft-repair Husband Asian-Pac-Islander Male 0 0 40 ? >50K
15 34 Private 245487 7th-8th 4 Married-civ-spouse Transport-moving Husband Amer-Indian-Eskimo Male 0 0 45 Mexico <=50K
16 25 Self-emp-not-inc 176756 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 35 United-States <=50K
17 32 Private 186824 HS-grad 9 Never-married Machine-op-inspct Unmarried White Male 0 0 40 United-States <=50K
18 38 Private 28887 11th 7 Married-civ-spouse Sales Husband White Male 0 0 50 United-States <=50K
19 43 Self-emp-not-inc 292175 Masters 14 Divorced Exec-managerial Unmarried White Female 0 0 45 United-States >50K
20 40 Private 193524 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
21 54 Private 302146 HS-grad 9 Separated Other-service Unmarried Black Female 0 0 20 United-States <=50K
22 35 Federal-gov 76845 9th 5 Married-civ-spouse Farming-fishing Husband Black Male 0 0 40 United-States <=50K
23 43 Private 117037 11th 7 Married-civ-spouse Transport-moving Husband White Male 0 2042 40 United-States <=50K
24 59 Private 109015 HS-grad 9 Divorced Tech-support Unmarried White Female 0 0 40 United-States <=50K
25 56 Local-gov 216851 Bachelors 13 Married-civ-spouse Tech-support Husband White Male 0 0 40 United-States >50K
26 19 Private 168294 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
28 39 Private 367260 HS-grad 9 Divorced Exec-managerial Not-in-family White Male 0 0 80 United-States <=50K
29 49 Private 193366 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32531 30 ? 33811 Bachelors 13 Never-married ? Not-in-family Asian-Pac-Islander Female 0 0 99 United-States <=50K
32532 34 Private 204461 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
32533 54 Private 337992 Bachelors 13 Married-civ-spouse Exec-managerial Husband Asian-Pac-Islander Male 0 0 50 Japan >50K
32534 37 Private 179137 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 39 United-States <=50K
32535 22 Private 325033 12th 8 Never-married Protective-serv Own-child Black Male 0 0 35 United-States <=50K
32536 34 Private 160216 Bachelors 13 Never-married Exec-managerial Not-in-family White Female 0 0 55 United-States >50K
32537 30 Private 345898 HS-grad 9 Never-married Craft-repair Not-in-family Black Male 0 0 46 United-States <=50K
32538 38 Private 139180 Bachelors 13 Divorced Prof-specialty Unmarried Black Female 15020 0 45 United-States >50K
32539 71 ? 287372 Doctorate 16 Married-civ-spouse ? Husband White Male 0 0 10 United-States >50K
32540 45 State-gov 252208 HS-grad 9 Separated Adm-clerical Own-child White Female 0 0 40 United-States <=50K
32541 41 ? 202822 HS-grad 9 Separated ? Not-in-family Black Female 0 0 32 United-States <=50K
32542 72 ? 129912 HS-grad 9 Married-civ-spouse ? Husband White Male 0 0 25 United-States <=50K
32543 45 Local-gov 119199 Assoc-acdm 12 Divorced Prof-specialty Unmarried White Female 0 0 48 United-States <=50K
32544 31 Private 199655 Masters 14 Divorced Other-service Not-in-family Other Female 0 0 30 United-States <=50K
32545 39 Local-gov 111499 Assoc-acdm 12 Married-civ-spouse Adm-clerical Wife White Female 0 0 20 United-States >50K
32546 37 Private 198216 Assoc-acdm 12 Divorced Tech-support Not-in-family White Female 0 0 40 United-States <=50K
32547 43 Private 260761 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 Mexico <=50K
32548 65 Self-emp-not-inc 99359 Prof-school 15 Never-married Prof-specialty Not-in-family White Male 1086 0 60 United-States <=50K
32549 43 State-gov 255835 Some-college 10 Divorced Adm-clerical Other-relative White Female 0 0 40 United-States <=50K
32550 43 Self-emp-not-inc 27242 Some-college 10 Married-civ-spouse Craft-repair Husband White Male 0 0 50 United-States <=50K
32551 32 Private 34066 10th 6 Married-civ-spouse Handlers-cleaners Husband Amer-Indian-Eskimo Male 0 0 40 United-States <=50K
32552 43 Private 84661 Assoc-voc 11 Married-civ-spouse Sales Husband White Male 0 0 45 United-States <=50K
32553 32 Private 116138 Masters 14 Never-married Tech-support Not-in-family Asian-Pac-Islander Male 0 0 11 Taiwan <=50K
32554 53 Private 321865 Masters 14 Married-civ-spouse Exec-managerial Husband White Male 0 0 40 United-States >50K
32555 22 Private 310152 Some-college 10 Never-married Protective-serv Not-in-family White Male 0 0 40 United-States <=50K
32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K
32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K
32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K
32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K

32561 rows × 15 columns

In [34]:
# 50K以下 50K超え
# 集計してみる
df.groupby('flg-50K').size()
"""
flg-50K
 <=50K    24720
 >50K      7841
dtype: int64
"""
Out[34]:
'\nflg-50K\n <=50K    24720\n >50K      7841\ndtype: int64\n'
In [35]:
# fin_flg列を作成して50K以下=0 50K超え=1 とする
df['fin_flg'] = df['flg-50K'].map(lambda x: 1 if x==' >50K' else 0) # ' >50K' は頭にスペースが入っている。
df.groupby('fin_flg').size()
df
Out[35]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country flg-50K fin_flg
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K 0
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K 0
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K 0
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K 0
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K 0
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K 0
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K 0
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K 1
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K 1
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K 1
10 37 Private 280464 Some-college 10 Married-civ-spouse Exec-managerial Husband Black Male 0 0 80 United-States >50K 1
11 30 State-gov 141297 Bachelors 13 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 India >50K 1
12 23 Private 122272 Bachelors 13 Never-married Adm-clerical Own-child White Female 0 0 30 United-States <=50K 0
13 32 Private 205019 Assoc-acdm 12 Never-married Sales Not-in-family Black Male 0 0 50 United-States <=50K 0
14 40 Private 121772 Assoc-voc 11 Married-civ-spouse Craft-repair Husband Asian-Pac-Islander Male 0 0 40 ? >50K 1
15 34 Private 245487 7th-8th 4 Married-civ-spouse Transport-moving Husband Amer-Indian-Eskimo Male 0 0 45 Mexico <=50K 0
16 25 Self-emp-not-inc 176756 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 35 United-States <=50K 0
17 32 Private 186824 HS-grad 9 Never-married Machine-op-inspct Unmarried White Male 0 0 40 United-States <=50K 0
18 38 Private 28887 11th 7 Married-civ-spouse Sales Husband White Male 0 0 50 United-States <=50K 0
19 43 Self-emp-not-inc 292175 Masters 14 Divorced Exec-managerial Unmarried White Female 0 0 45 United-States >50K 1
20 40 Private 193524 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K 1
21 54 Private 302146 HS-grad 9 Separated Other-service Unmarried Black Female 0 0 20 United-States <=50K 0
22 35 Federal-gov 76845 9th 5 Married-civ-spouse Farming-fishing Husband Black Male 0 0 40 United-States <=50K 0
23 43 Private 117037 11th 7 Married-civ-spouse Transport-moving Husband White Male 0 2042 40 United-States <=50K 0
24 59 Private 109015 HS-grad 9 Divorced Tech-support Unmarried White Female 0 0 40 United-States <=50K 0
25 56 Local-gov 216851 Bachelors 13 Married-civ-spouse Tech-support Husband White Male 0 0 40 United-States >50K 1
26 19 Private 168294 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K 0
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K 1
28 39 Private 367260 HS-grad 9 Divorced Exec-managerial Not-in-family White Male 0 0 80 United-States <=50K 0
29 49 Private 193366 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32531 30 ? 33811 Bachelors 13 Never-married ? Not-in-family Asian-Pac-Islander Female 0 0 99 United-States <=50K 0
32532 34 Private 204461 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K 1
32533 54 Private 337992 Bachelors 13 Married-civ-spouse Exec-managerial Husband Asian-Pac-Islander Male 0 0 50 Japan >50K 1
32534 37 Private 179137 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 39 United-States <=50K 0
32535 22 Private 325033 12th 8 Never-married Protective-serv Own-child Black Male 0 0 35 United-States <=50K 0
32536 34 Private 160216 Bachelors 13 Never-married Exec-managerial Not-in-family White Female 0 0 55 United-States >50K 1
32537 30 Private 345898 HS-grad 9 Never-married Craft-repair Not-in-family Black Male 0 0 46 United-States <=50K 0
32538 38 Private 139180 Bachelors 13 Divorced Prof-specialty Unmarried Black Female 15020 0 45 United-States >50K 1
32539 71 ? 287372 Doctorate 16 Married-civ-spouse ? Husband White Male 0 0 10 United-States >50K 1
32540 45 State-gov 252208 HS-grad 9 Separated Adm-clerical Own-child White Female 0 0 40 United-States <=50K 0
32541 41 ? 202822 HS-grad 9 Separated ? Not-in-family Black Female 0 0 32 United-States <=50K 0
32542 72 ? 129912 HS-grad 9 Married-civ-spouse ? Husband White Male 0 0 25 United-States <=50K 0
32543 45 Local-gov 119199 Assoc-acdm 12 Divorced Prof-specialty Unmarried White Female 0 0 48 United-States <=50K 0
32544 31 Private 199655 Masters 14 Divorced Other-service Not-in-family Other Female 0 0 30 United-States <=50K 0
32545 39 Local-gov 111499 Assoc-acdm 12 Married-civ-spouse Adm-clerical Wife White Female 0 0 20 United-States >50K 1
32546 37 Private 198216 Assoc-acdm 12 Divorced Tech-support Not-in-family White Female 0 0 40 United-States <=50K 0
32547 43 Private 260761 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 Mexico <=50K 0
32548 65 Self-emp-not-inc 99359 Prof-school 15 Never-married Prof-specialty Not-in-family White Male 1086 0 60 United-States <=50K 0
32549 43 State-gov 255835 Some-college 10 Divorced Adm-clerical Other-relative White Female 0 0 40 United-States <=50K 0
32550 43 Self-emp-not-inc 27242 Some-college 10 Married-civ-spouse Craft-repair Husband White Male 0 0 50 United-States <=50K 0
32551 32 Private 34066 10th 6 Married-civ-spouse Handlers-cleaners Husband Amer-Indian-Eskimo Male 0 0 40 United-States <=50K 0
32552 43 Private 84661 Assoc-voc 11 Married-civ-spouse Sales Husband White Male 0 0 45 United-States <=50K 0
32553 32 Private 116138 Masters 14 Never-married Tech-support Not-in-family Asian-Pac-Islander Male 0 0 11 Taiwan <=50K 0
32554 53 Private 321865 Masters 14 Married-civ-spouse Exec-managerial Husband White Male 0 0 40 United-States >50K 1
32555 22 Private 310152 Some-college 10 Never-married Protective-serv Not-in-family White Male 0 0 40 United-States <=50K 0
32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K 0
32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K 1
32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K 0
32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K 0
32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K 1

32561 rows × 16 columns

In [36]:
# モデル構築と評価
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# 説明変数と目的変数を定義
X = df[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = df['fin_flg']

# データ分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# ロジスティック回帰モデルのインスタンス作成と訓練
lr = LogisticRegression()
lr.fit(X_train, y_train)
Out[36]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [37]:
print(lr.score(X_test,y_test))
print(lr.score(X_train,y_train))

index_li = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']
print(index_li)
print(lr.coef_)
print('intercept_ : ', lr.intercept_)

# オッズ比
print('オッズ比: ', np.exp(lr.coef_))
"""オッズ比
それぞれの係数が1大尉増加したとき予測確率にどの程度影響があるかを示す指標
(影響なしの場合は1.0)
"""
0.7973097475585038
0.7964373464373464
['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']
[[-4.510e-03 -5.717e-06 -1.082e-03  3.159e-04  7.230e-04]]
intercept_ :  [-0.]
オッズ比:  [[0.996 1.    0.999 1.    1.001]]
Out[37]:
'オッズ比\nそれぞれの係数が1大尉増加したとき予測確率にどの程度影響があるかを示す指標\n(影響なしの場合は1.0)\n'
In [38]:
# スケーリングする StandardScaler
"""
値の大きな説明変数に結果が引っ張られないように標準化する
"""
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 説明変数と目的変数を定義 (以降 ほとんど前回と同じ)
X = df[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = df['fin_flg']

# データ分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# ここで説明変数をスケーリングする
sc = StandardScaler() # インスタンスを作成する
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# ロジスティック回帰モデルのインスタンス作成と訓練
lr = LogisticRegression()
lr.fit(X_train_std, y_train)
Out[38]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [39]:
# 結果
print(lr.score(X_test_std,y_test))
print(lr.score(X_train_std,y_train))
0.8101467968797985
0.8105651105651106