# chapter 8 機械学習
"""教師あり学習
予測したい変数 :目的変数 正解データ 応答変数 ターゲット変数 従属変数
予測のための変数:説明変数 特徴量 予測変数 独立変数
y = f(x) の場合 x=説明変数 y=目的変数
回帰:regression
分類:classification
重回帰 multiple linear regression
ロジスティック回帰 logistic regression
k近傍法 k-nearest neighbors
決定木 Decision Tree
サポートベクターマシン
ランダムフォレスト
勾配Boosting
"""
"""教師なし学習
主成分分析
マーケットバスケット分析
"""
"""強化学習
探索と知識利用のバランスを取るのがポイント
動的計画法
モンテカルロ法
TD学習
"""
'強化学習\n探索と知識利用のバランスを取るのがポイント\n動的計画法\nモンテカルロ法\u3000\nTD学習\n'
# モジュール
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline
%precision 3
'%.3f'
# 重回帰 multiple linear regression
import requests
import zipfile
import io
# Download the sample data (UCI automobile "imports-85" file) over HTTP.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# Bound the wait and fail loudly on an HTTP error status; without the check an
# error page would be decoded and silently parsed as if it were CSV rows.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content
# Read with header=None so the first line is kept as data; the column names
# are assigned explicitly right after.
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
# NOTE(review): several labels look misspelled (e.g. 'nomalalined-losses',
# 'drive-sheels', 'enfine-location', 'corb-weight', 'engin-size'). They are
# kept exactly as-is because columns are addressed by these strings; confirm
# that nothing else depends on them before renaming.
auto.columns = ['symboling','nomalalined-losses','make','fuel-type','aspiration','num-of-doors',
'body-style','drive-sheels','enfine-location','wheel-base','length','width','height',
'corb-weight','engine-type','num-of-cylinders','engin-size','fuel-system','bore',
'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
auto.head(5)
symboling | nomalalined-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-sheels | enfine-location | wheel-base | ... | engin-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
5 rows × 26 columns
# Car price column: show the first five values
auto['price'].head(5)
0 13495 1 16500 2 16500 3 13950 4 17450 Name: price, dtype: object
# Goal: estimate the car price from three explanatory variables
# (horsepower, width, height).
# Step 1: get rid of missing values.
data = auto.loc[:, ['price', 'horsepower', 'width', 'height']]
# Per-column count of '?' placeholders; the recorded result is kept in the
# string below.
data.isin(['?']).sum()
"""
price 4
horsepower 2
width 0
height 0
dtype: int64
"""
# Turn every '?' into NaN, then discard each row that contains a NaN.
data = data.replace('?', np.nan)
data = data.dropna()
print(data.shape) # (199, 4)
data.head()
(199, 4)
price | horsepower | width | height | |
---|---|---|---|---|
0 | 13495 | 111 | 64.1 | 48.8 |
1 | 16500 | 111 | 64.1 | 48.8 |
2 | 16500 | 154 | 65.5 | 52.4 |
3 | 13950 | 102 | 66.2 | 54.3 |
4 | 17450 | 115 | 66.4 | 54.3 |
# Check the dtype of each column
print(data.dtypes)
"""
objectとfloat64が混ざっている
"""
price object horsepower object width float64 height float64 dtype: object
'\nobjectとfloat64が混ざっている\n'
# Convert the two string-typed columns to numbers in a single assign() call,
# then show the resulting dtypes.
data = data.assign(
    price=pd.to_numeric(data['price']),
    horsepower=pd.to_numeric(data['horsepower']),
)
data.dtypes
price int64 horsepower int64 width float64 height float64 dtype: object
# Check the pairwise correlations between the columns with corr()
data.corr()
price | horsepower | width | height | |
---|---|---|---|---|
price | 1.000000 | 0.810533 | 0.753871 | 0.134990 |
horsepower | 0.810533 | 1.000000 | 0.615315 | -0.087407 |
width | 0.753871 | 0.615315 | 1.000000 | 0.309223 |
height | 0.134990 | -0.087407 | 0.309223 | 1.000000 |
"""
(1)目的変数priceは説明変数horsepower, widthと相関が強い
(2)説明変数horsepowerと説明変数widthの相関が0.6とやや強い
多重共線性 multicollinearity を生じる可能性がある
→回帰係数の分散が大きくなり係数の有意性が失われる現象
→対策:どちらかだけを代表として使用する
"""
'\n(1)目的変数priceは説明変数horsepower, widthと相関が強い\n(2)説明変数horsepowerと説明変数widthの相関が0.6とやや強い\n\u3000\u3000\u3000多重共線性\u3000multi-collineartyを生じる可能性がある\n \u3000\u3000→回帰係数の分散が大きくなり係数の優位性が失われる現象\n \u3000\u3000→対策:どちらかだけを代表として使用する\n'
# sklearn重回帰モデルを使用する
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Explanatory variables X = every column except price; target y = price
X = data.loc[:, data.columns != 'price']
y = data['price']
# Split the rows 50/50 into train and test; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
# Create the linear-regression estimator and fit it on the training half
# (fit() returns the estimator itself)
model_lr = LinearRegression().fit(X_train, y_train)
# Coefficient of determination (score) on each half
r2_test = model_lr.score(X_test, y_test)
r2_train = model_lr.score(X_train, y_train)
print('score test : {} '.format(r2_test))
print('score train: {} '.format(r2_train))
# Regression coefficients (coef_), labelled by feature name, and the
# intercept (intercept_)
coef_by_feature = pd.Series(model_lr.coef_, index=X_test.columns)
print('coef_ : \n{}'.format(coef_by_feature))
print('intercept_ : \n{}'.format(model_lr.intercept_))
"""
score testとscore train のscoreが近いことから過学習にはなっていないことがわかる
"""
score test : 0.7370688738125768 score train: 0.7333575683901379 coef_ : horsepower 81.651078 width 1829.174506 height 229.510077 dtype: float64 intercept_ : -128409.04630338572
'\nscore testとscore train\u3000のscoreが近いことから過学習にはなっていないことがわかる\n'
# Predict prices for the held-out rows.
# The predictions get their own name (y_pred) instead of being written to
# `y`, so the target Series is not clobbered and the earlier split/fit cells
# can be re-run safely.
y_pred = model_lr.predict(X_test)
# Scatter plot of the predicted price against horsepower
plt.scatter(X_test['horsepower'], y_pred)
<matplotlib.collections.PathCollection at 0x253ce104fd0>
# 変数増加法
# 変数減少法
# ステップワイズ法
# ロジスティック回帰
"""
目的変数が数値ではなく、買う買わないなどのカテゴリ分けの場合を対象とする。
カテゴリ変数という
"""
# Download the sample data (UCI "adult" file) over HTTP.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# Bound the wait and fail loudly on an HTTP error status; without the check an
# error page would be decoded and silently parsed as if it were CSV rows.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content
# Build the DataFrame. The first line of the file is already a data record,
# so header=None is required to avoid consuming it as column names.
df = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Name the 15 positional columns; the last one ('flg-50K') carries the
# <=50K / >50K label.
df.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'flg-50K',
]
df
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | flg-50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K |
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K |
8 | 31 | Private | 45781 | Masters | 14 | Never-married | Prof-specialty | Not-in-family | White | Female | 14084 | 0 | 50 | United-States | >50K |
9 | 42 | Private | 159449 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 5178 | 0 | 40 | United-States | >50K |
10 | 37 | Private | 280464 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | >50K |
11 | 30 | State-gov | 141297 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | India | >50K |
12 | 23 | Private | 122272 | Bachelors | 13 | Never-married | Adm-clerical | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
13 | 32 | Private | 205019 | Assoc-acdm | 12 | Never-married | Sales | Not-in-family | Black | Male | 0 | 0 | 50 | United-States | <=50K |
14 | 40 | Private | 121772 | Assoc-voc | 11 | Married-civ-spouse | Craft-repair | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | ? | >50K |
15 | 34 | Private | 245487 | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | <=50K |
16 | 25 | Self-emp-not-inc | 176756 | HS-grad | 9 | Never-married | Farming-fishing | Own-child | White | Male | 0 | 0 | 35 | United-States | <=50K |
17 | 32 | Private | 186824 | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | <=50K |
18 | 38 | Private | 28887 | 11th | 7 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
19 | 43 | Self-emp-not-inc | 292175 | Masters | 14 | Divorced | Exec-managerial | Unmarried | White | Female | 0 | 0 | 45 | United-States | >50K |
20 | 40 | Private | 193524 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
21 | 54 | Private | 302146 | HS-grad | 9 | Separated | Other-service | Unmarried | Black | Female | 0 | 0 | 20 | United-States | <=50K |
22 | 35 | Federal-gov | 76845 | 9th | 5 | Married-civ-spouse | Farming-fishing | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
23 | 43 | Private | 117037 | 11th | 7 | Married-civ-spouse | Transport-moving | Husband | White | Male | 0 | 2042 | 40 | United-States | <=50K |
24 | 59 | Private | 109015 | HS-grad | 9 | Divorced | Tech-support | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
25 | 56 | Local-gov | 216851 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
26 | 19 | Private | 168294 | HS-grad | 9 | Never-married | Craft-repair | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
27 | 54 | ? | 180211 | Some-college | 10 | Married-civ-spouse | ? | Husband | Asian-Pac-Islander | Male | 0 | 0 | 60 | South | >50K |
28 | 39 | Private | 367260 | HS-grad | 9 | Divorced | Exec-managerial | Not-in-family | White | Male | 0 | 0 | 80 | United-States | <=50K |
29 | 49 | Private | 193366 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32531 | 30 | ? | 33811 | Bachelors | 13 | Never-married | ? | Not-in-family | Asian-Pac-Islander | Female | 0 | 0 | 99 | United-States | <=50K |
32532 | 34 | Private | 204461 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
32533 | 54 | Private | 337992 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | Asian-Pac-Islander | Male | 0 | 0 | 50 | Japan | >50K |
32534 | 37 | Private | 179137 | Some-college | 10 | Divorced | Adm-clerical | Unmarried | White | Female | 0 | 0 | 39 | United-States | <=50K |
32535 | 22 | Private | 325033 | 12th | 8 | Never-married | Protective-serv | Own-child | Black | Male | 0 | 0 | 35 | United-States | <=50K |
32536 | 34 | Private | 160216 | Bachelors | 13 | Never-married | Exec-managerial | Not-in-family | White | Female | 0 | 0 | 55 | United-States | >50K |
32537 | 30 | Private | 345898 | HS-grad | 9 | Never-married | Craft-repair | Not-in-family | Black | Male | 0 | 0 | 46 | United-States | <=50K |
32538 | 38 | Private | 139180 | Bachelors | 13 | Divorced | Prof-specialty | Unmarried | Black | Female | 15020 | 0 | 45 | United-States | >50K |
32539 | 71 | ? | 287372 | Doctorate | 16 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 10 | United-States | >50K |
32540 | 45 | State-gov | 252208 | HS-grad | 9 | Separated | Adm-clerical | Own-child | White | Female | 0 | 0 | 40 | United-States | <=50K |
32541 | 41 | ? | 202822 | HS-grad | 9 | Separated | ? | Not-in-family | Black | Female | 0 | 0 | 32 | United-States | <=50K |
32542 | 72 | ? | 129912 | HS-grad | 9 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 25 | United-States | <=50K |
32543 | 45 | Local-gov | 119199 | Assoc-acdm | 12 | Divorced | Prof-specialty | Unmarried | White | Female | 0 | 0 | 48 | United-States | <=50K |
32544 | 31 | Private | 199655 | Masters | 14 | Divorced | Other-service | Not-in-family | Other | Female | 0 | 0 | 30 | United-States | <=50K |
32545 | 39 | Local-gov | 111499 | Assoc-acdm | 12 | Married-civ-spouse | Adm-clerical | Wife | White | Female | 0 | 0 | 20 | United-States | >50K |
32546 | 37 | Private | 198216 | Assoc-acdm | 12 | Divorced | Tech-support | Not-in-family | White | Female | 0 | 0 | 40 | United-States | <=50K |
32547 | 43 | Private | 260761 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | Mexico | <=50K |
32548 | 65 | Self-emp-not-inc | 99359 | Prof-school | 15 | Never-married | Prof-specialty | Not-in-family | White | Male | 1086 | 0 | 60 | United-States | <=50K |
32549 | 43 | State-gov | 255835 | Some-college | 10 | Divorced | Adm-clerical | Other-relative | White | Female | 0 | 0 | 40 | United-States | <=50K |
32550 | 43 | Self-emp-not-inc | 27242 | Some-college | 10 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
32551 | 32 | Private | 34066 | 10th | 6 | Married-civ-spouse | Handlers-cleaners | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 40 | United-States | <=50K |
32552 | 43 | Private | 84661 | Assoc-voc | 11 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 45 | United-States | <=50K |
32553 | 32 | Private | 116138 | Masters | 14 | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 11 | Taiwan | <=50K |
32554 | 53 | Private | 321865 | Masters | 14 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32555 | 22 | Private | 310152 | Some-college | 10 | Never-married | Protective-serv | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
# Goal: train a model that predicts whether the label is over 50K
# First inspect the data for missing values
# Overall size of the frame (rows, columns)
print('全体の大きさ', df.shape)
# Number of null entries in each column
print('欠損値の数', df.isnull().sum())
全体の大きさ (32561, 15) 欠損値の数 age 0 workclass 0 fnlwgt 0 education 0 education-num 0 marital-status 0 occupation 0 relationship 0 race 0 sex 0 capital-gain 0 capital-loss 0 hours-per-week 0 native-country 0 flg-50K 0 dtype: int64
# Count the '?' placeholders in each column.
# The frame was read without skipinitialspace, so string cells keep the space
# that follows each comma in the raw file (' ?', just like ' >50K'). Matching
# only '?' therefore finds nothing; accept both spellings.
# The count is the final expression of the cell so that it is actually shown.
df.isin(['?', ' ?']).sum()
'\nなぜか出てこない・・・\n'
# Replace '?' placeholders with NaN and drop every row that contains one.
# Cells hold ' ?' (leading space), so an exact replace('?', ...) matches
# nothing and no row is removed; match the marker with optional surrounding
# whitespace instead. Regex replacement only touches string cells.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True).dropna()
print(df.shape)  # row count should now be below the 32561 rows loaded
df
(32561, 15)
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | flg-50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K |
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K |
8 | 31 | Private | 45781 | Masters | 14 | Never-married | Prof-specialty | Not-in-family | White | Female | 14084 | 0 | 50 | United-States | >50K |
9 | 42 | Private | 159449 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 5178 | 0 | 40 | United-States | >50K |
10 | 37 | Private | 280464 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | >50K |
11 | 30 | State-gov | 141297 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | India | >50K |
12 | 23 | Private | 122272 | Bachelors | 13 | Never-married | Adm-clerical | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
13 | 32 | Private | 205019 | Assoc-acdm | 12 | Never-married | Sales | Not-in-family | Black | Male | 0 | 0 | 50 | United-States | <=50K |
14 | 40 | Private | 121772 | Assoc-voc | 11 | Married-civ-spouse | Craft-repair | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | ? | >50K |
15 | 34 | Private | 245487 | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | <=50K |
16 | 25 | Self-emp-not-inc | 176756 | HS-grad | 9 | Never-married | Farming-fishing | Own-child | White | Male | 0 | 0 | 35 | United-States | <=50K |
17 | 32 | Private | 186824 | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | <=50K |
18 | 38 | Private | 28887 | 11th | 7 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
19 | 43 | Self-emp-not-inc | 292175 | Masters | 14 | Divorced | Exec-managerial | Unmarried | White | Female | 0 | 0 | 45 | United-States | >50K |
20 | 40 | Private | 193524 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
21 | 54 | Private | 302146 | HS-grad | 9 | Separated | Other-service | Unmarried | Black | Female | 0 | 0 | 20 | United-States | <=50K |
22 | 35 | Federal-gov | 76845 | 9th | 5 | Married-civ-spouse | Farming-fishing | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
23 | 43 | Private | 117037 | 11th | 7 | Married-civ-spouse | Transport-moving | Husband | White | Male | 0 | 2042 | 40 | United-States | <=50K |
24 | 59 | Private | 109015 | HS-grad | 9 | Divorced | Tech-support | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
25 | 56 | Local-gov | 216851 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
26 | 19 | Private | 168294 | HS-grad | 9 | Never-married | Craft-repair | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
27 | 54 | ? | 180211 | Some-college | 10 | Married-civ-spouse | ? | Husband | Asian-Pac-Islander | Male | 0 | 0 | 60 | South | >50K |
28 | 39 | Private | 367260 | HS-grad | 9 | Divorced | Exec-managerial | Not-in-family | White | Male | 0 | 0 | 80 | United-States | <=50K |
29 | 49 | Private | 193366 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32531 | 30 | ? | 33811 | Bachelors | 13 | Never-married | ? | Not-in-family | Asian-Pac-Islander | Female | 0 | 0 | 99 | United-States | <=50K |
32532 | 34 | Private | 204461 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
32533 | 54 | Private | 337992 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | Asian-Pac-Islander | Male | 0 | 0 | 50 | Japan | >50K |
32534 | 37 | Private | 179137 | Some-college | 10 | Divorced | Adm-clerical | Unmarried | White | Female | 0 | 0 | 39 | United-States | <=50K |
32535 | 22 | Private | 325033 | 12th | 8 | Never-married | Protective-serv | Own-child | Black | Male | 0 | 0 | 35 | United-States | <=50K |
32536 | 34 | Private | 160216 | Bachelors | 13 | Never-married | Exec-managerial | Not-in-family | White | Female | 0 | 0 | 55 | United-States | >50K |
32537 | 30 | Private | 345898 | HS-grad | 9 | Never-married | Craft-repair | Not-in-family | Black | Male | 0 | 0 | 46 | United-States | <=50K |
32538 | 38 | Private | 139180 | Bachelors | 13 | Divorced | Prof-specialty | Unmarried | Black | Female | 15020 | 0 | 45 | United-States | >50K |
32539 | 71 | ? | 287372 | Doctorate | 16 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 10 | United-States | >50K |
32540 | 45 | State-gov | 252208 | HS-grad | 9 | Separated | Adm-clerical | Own-child | White | Female | 0 | 0 | 40 | United-States | <=50K |
32541 | 41 | ? | 202822 | HS-grad | 9 | Separated | ? | Not-in-family | Black | Female | 0 | 0 | 32 | United-States | <=50K |
32542 | 72 | ? | 129912 | HS-grad | 9 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 25 | United-States | <=50K |
32543 | 45 | Local-gov | 119199 | Assoc-acdm | 12 | Divorced | Prof-specialty | Unmarried | White | Female | 0 | 0 | 48 | United-States | <=50K |
32544 | 31 | Private | 199655 | Masters | 14 | Divorced | Other-service | Not-in-family | Other | Female | 0 | 0 | 30 | United-States | <=50K |
32545 | 39 | Local-gov | 111499 | Assoc-acdm | 12 | Married-civ-spouse | Adm-clerical | Wife | White | Female | 0 | 0 | 20 | United-States | >50K |
32546 | 37 | Private | 198216 | Assoc-acdm | 12 | Divorced | Tech-support | Not-in-family | White | Female | 0 | 0 | 40 | United-States | <=50K |
32547 | 43 | Private | 260761 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | Mexico | <=50K |
32548 | 65 | Self-emp-not-inc | 99359 | Prof-school | 15 | Never-married | Prof-specialty | Not-in-family | White | Male | 1086 | 0 | 60 | United-States | <=50K |
32549 | 43 | State-gov | 255835 | Some-college | 10 | Divorced | Adm-clerical | Other-relative | White | Female | 0 | 0 | 40 | United-States | <=50K |
32550 | 43 | Self-emp-not-inc | 27242 | Some-college | 10 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
32551 | 32 | Private | 34066 | 10th | 6 | Married-civ-spouse | Handlers-cleaners | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 40 | United-States | <=50K |
32552 | 43 | Private | 84661 | Assoc-voc | 11 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 45 | United-States | <=50K |
32553 | 32 | Private | 116138 | Masters | 14 | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 11 | Taiwan | <=50K |
32554 | 53 | Private | 321865 | Masters | 14 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32555 | 22 | Private | 310152 | Some-college | 10 | Never-married | Protective-serv | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
# Labels: 50K or less vs. more than 50K
# Count the rows in each label group
df.groupby('flg-50K').size()
"""
flg-50K
<=50K 24720
>50K 7841
dtype: int64
"""
'\nflg-50K\n <=50K 24720\n >50K 7841\ndtype: int64\n'
# Add the fin_flg column: 1 when the label is over 50K, 0 otherwise.
# The label text carries a leading space, hence the comparison with ' >50K'.
df['fin_flg'] = (df['flg-50K'] == ' >50K').astype('int64')
df.groupby('fin_flg').size()
df
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | flg-50K | fin_flg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K | 0 |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K | 0 |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K | 0 |
5 | 37 | Private | 284582 | Masters | 14 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
6 | 49 | Private | 160187 | 9th | 5 | Married-spouse-absent | Other-service | Not-in-family | Black | Female | 0 | 0 | 16 | Jamaica | <=50K | 0 |
7 | 52 | Self-emp-not-inc | 209642 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 45 | United-States | >50K | 1 |
8 | 31 | Private | 45781 | Masters | 14 | Never-married | Prof-specialty | Not-in-family | White | Female | 14084 | 0 | 50 | United-States | >50K | 1 |
9 | 42 | Private | 159449 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 5178 | 0 | 40 | United-States | >50K | 1 |
10 | 37 | Private | 280464 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | Black | Male | 0 | 0 | 80 | United-States | >50K | 1 |
11 | 30 | State-gov | 141297 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | India | >50K | 1 |
12 | 23 | Private | 122272 | Bachelors | 13 | Never-married | Adm-clerical | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K | 0 |
13 | 32 | Private | 205019 | Assoc-acdm | 12 | Never-married | Sales | Not-in-family | Black | Male | 0 | 0 | 50 | United-States | <=50K | 0 |
14 | 40 | Private | 121772 | Assoc-voc | 11 | Married-civ-spouse | Craft-repair | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | ? | >50K | 1 |
15 | 34 | Private | 245487 | 7th-8th | 4 | Married-civ-spouse | Transport-moving | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 45 | Mexico | <=50K | 0 |
16 | 25 | Self-emp-not-inc | 176756 | HS-grad | 9 | Never-married | Farming-fishing | Own-child | White | Male | 0 | 0 | 35 | United-States | <=50K | 0 |
17 | 32 | Private | 186824 | HS-grad | 9 | Never-married | Machine-op-inspct | Unmarried | White | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
18 | 38 | Private | 28887 | 11th | 7 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K | 0 |
19 | 43 | Self-emp-not-inc | 292175 | Masters | 14 | Divorced | Exec-managerial | Unmarried | White | Female | 0 | 0 | 45 | United-States | >50K | 1 |
20 | 40 | Private | 193524 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
21 | 54 | Private | 302146 | HS-grad | 9 | Separated | Other-service | Unmarried | Black | Female | 0 | 0 | 20 | United-States | <=50K | 0 |
22 | 35 | Federal-gov | 76845 | 9th | 5 | Married-civ-spouse | Farming-fishing | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
23 | 43 | Private | 117037 | 11th | 7 | Married-civ-spouse | Transport-moving | Husband | White | Male | 0 | 2042 | 40 | United-States | <=50K | 0 |
24 | 59 | Private | 109015 | HS-grad | 9 | Divorced | Tech-support | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
25 | 56 | Local-gov | 216851 | Bachelors | 13 | Married-civ-spouse | Tech-support | Husband | White | Male | 0 | 0 | 40 | United-States | >50K | 1 |
26 | 19 | Private | 168294 | HS-grad | 9 | Never-married | Craft-repair | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
27 | 54 | ? | 180211 | Some-college | 10 | Married-civ-spouse | ? | Husband | Asian-Pac-Islander | Male | 0 | 0 | 60 | South | >50K | 1 |
28 | 39 | Private | 367260 | HS-grad | 9 | Divorced | Exec-managerial | Not-in-family | White | Male | 0 | 0 | 80 | United-States | <=50K | 0 |
29 | 49 | Private | 193366 | HS-grad | 9 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32531 | 30 | ? | 33811 | Bachelors | 13 | Never-married | ? | Not-in-family | Asian-Pac-Islander | Female | 0 | 0 | 99 | United-States | <=50K | 0 |
32532 | 34 | Private | 204461 | Doctorate | 16 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 60 | United-States | >50K | 1 |
32533 | 54 | Private | 337992 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | Asian-Pac-Islander | Male | 0 | 0 | 50 | Japan | >50K | 1 |
32534 | 37 | Private | 179137 | Some-college | 10 | Divorced | Adm-clerical | Unmarried | White | Female | 0 | 0 | 39 | United-States | <=50K | 0 |
32535 | 22 | Private | 325033 | 12th | 8 | Never-married | Protective-serv | Own-child | Black | Male | 0 | 0 | 35 | United-States | <=50K | 0 |
32536 | 34 | Private | 160216 | Bachelors | 13 | Never-married | Exec-managerial | Not-in-family | White | Female | 0 | 0 | 55 | United-States | >50K | 1 |
32537 | 30 | Private | 345898 | HS-grad | 9 | Never-married | Craft-repair | Not-in-family | Black | Male | 0 | 0 | 46 | United-States | <=50K | 0 |
32538 | 38 | Private | 139180 | Bachelors | 13 | Divorced | Prof-specialty | Unmarried | Black | Female | 15020 | 0 | 45 | United-States | >50K | 1 |
32539 | 71 | ? | 287372 | Doctorate | 16 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 10 | United-States | >50K | 1 |
32540 | 45 | State-gov | 252208 | HS-grad | 9 | Separated | Adm-clerical | Own-child | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
32541 | 41 | ? | 202822 | HS-grad | 9 | Separated | ? | Not-in-family | Black | Female | 0 | 0 | 32 | United-States | <=50K | 0 |
32542 | 72 | ? | 129912 | HS-grad | 9 | Married-civ-spouse | ? | Husband | White | Male | 0 | 0 | 25 | United-States | <=50K | 0 |
32543 | 45 | Local-gov | 119199 | Assoc-acdm | 12 | Divorced | Prof-specialty | Unmarried | White | Female | 0 | 0 | 48 | United-States | <=50K | 0 |
32544 | 31 | Private | 199655 | Masters | 14 | Divorced | Other-service | Not-in-family | Other | Female | 0 | 0 | 30 | United-States | <=50K | 0 |
32545 | 39 | Local-gov | 111499 | Assoc-acdm | 12 | Married-civ-spouse | Adm-clerical | Wife | White | Female | 0 | 0 | 20 | United-States | >50K | 1 |
32546 | 37 | Private | 198216 | Assoc-acdm | 12 | Divorced | Tech-support | Not-in-family | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
32547 | 43 | Private | 260761 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | Mexico | <=50K | 0 |
32548 | 65 | Self-emp-not-inc | 99359 | Prof-school | 15 | Never-married | Prof-specialty | Not-in-family | White | Male | 1086 | 0 | 60 | United-States | <=50K | 0 |
32549 | 43 | State-gov | 255835 | Some-college | 10 | Divorced | Adm-clerical | Other-relative | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
32550 | 43 | Self-emp-not-inc | 27242 | Some-college | 10 | Married-civ-spouse | Craft-repair | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K | 0 |
32551 | 32 | Private | 34066 | 10th | 6 | Married-civ-spouse | Handlers-cleaners | Husband | Amer-Indian-Eskimo | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
32552 | 43 | Private | 84661 | Assoc-voc | 11 | Married-civ-spouse | Sales | Husband | White | Male | 0 | 0 | 45 | United-States | <=50K | 0 |
32553 | 32 | Private | 116138 | Masters | 14 | Never-married | Tech-support | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 11 | Taiwan | <=50K | 0 |
32554 | 53 | Private | 321865 | Masters | 14 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 40 | United-States | >50K | 1 |
32555 | 22 | Private | 310152 | Some-college | 10 | Never-married | Protective-serv | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K | 0 |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K | 0 |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K | 1 |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K | 0 |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K | 0 |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K | 1 |
32561 rows × 16 columns
# Build a baseline logistic-regression model (scores are printed afterwards)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Explanatory variables: five columns of df; target: the fin_flg column
X = df.loc[:, ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = df.loc[:, 'fin_flg']

# Hold out half of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)

# Create the estimator with its default settings and fit it on the training half
lr = LogisticRegression()
lr.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Mean accuracy on the held-out half first, then on the training half
print(lr.score(X_test,y_test))
print(lr.score(X_train,y_train))
# Take the feature names from the frame the model was trained on instead of
# re-typing them, so the labels cannot drift out of step with lr.coef_
index_li = list(X.columns)
print(index_li)
print(lr.coef_)
print('intercept_ : ', lr.intercept_)
# Odds ratio: exp(coefficient) is the factor by which the odds are multiplied
# when the corresponding explanatory variable increases by one unit
# (1.0 means the variable has no effect)
print('オッズ比: ', np.exp(lr.coef_))
# The string below is deliberately left as the final expression of the cell:
# the notebook echoes it as the cell's output. Only the typo was fixed
# ("大尉" -> "単位", i.e. "one unit increase").
"""オッズ比
それぞれの係数が1単位増加したとき予測確率にどの程度影響があるかを示す指標
(影響なしの場合は1.0)
"""
0.7973097475585038 0.7964373464373464 ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss'] [[-4.510e-03 -5.717e-06 -1.082e-03 3.159e-04 7.230e-04]] intercept_ : [-0.] オッズ比: [[0.996 1. 0.999 1. 1.001]]
'オッズ比\nそれぞれの係数が1大尉増加したとき予測確率にどの程度影響があるかを示す指標\n(影響なしの場合は1.0)\n'
# Scaling with StandardScaler:
# standardize the explanatory variables so that columns holding large values
# do not pull the result toward themselves
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Explanatory variables and target (from here on almost the same as before)
X = df.loc[:, ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = df.loc[:, 'fin_flg']

# Same 50/50 split with the same seed as the unscaled model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)

# Fit the scaler on the training rows only, then reuse that fitted scaler
# to transform the test rows
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Create the logistic-regression estimator and train it on the scaled features
lr = LogisticRegression()
lr.fit(X_train_std, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Results: mean accuracy on the scaled test half, then on the scaled training half
print(
    lr.score(X_test_std, y_test),
    lr.score(X_train_std, y_train),
    sep='\n',
)
0.8101467968797985 0.8105651105651106