# chapter 3 記述統計　と　単回帰分析（推測統計の一つ）

# 演算用モジュール
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# 可視化モジュール
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline

# 機械学習モジュール
from sklearn import linear_model

%precision 3

'%.3f'

pwd

'C:\\Users\\keita\\chapter3'

mkdir chapter3

サブディレクトリまたはファイル chapter3 は既に存在します。

cd chapter3

C:\Users\keita\chapter3\chapter3

ls

 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  12:52    <DIR>          .
2019/12/15  12:52    <DIR>          ..
2019/12/15  12:52             3,206 student.txt
2019/12/15  12:52            56,993 student-mat.csv
2019/12/15  12:52               269 student-merge.R
2019/12/15  12:52            93,220 student-por.csv
               4 個のファイル             153,688 バイト
               2 個のディレクトリ  293,460,262,912 バイトの空き領域

import requests
import zipfile
from io import StringIO
import io

# Download the UCI "Student Performance" archive and unpack it into the
# current working directory.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00356/student.zip'

# The whole body is read through r.content below, so streaming is not needed.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later with an obscure BadZipFile
# when an error page is handed to zipfile.
r.raise_for_status()

# The context manager closes the archive once extraction has finished.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

ls

 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  12:52    <DIR>          .
2019/12/15  12:52    <DIR>          ..
2019/12/15  13:54             3,206 student.txt
2019/12/15  13:54            56,993 student-mat.csv
2019/12/15  13:54               269 student-merge.R
2019/12/15  13:54            93,220 student-por.csv
               4 個のファイル             153,688 バイト
               2 個のディレクトリ  293,460,262,912 バイトの空き領域

# First attempt: read the file with the default (comma) separator.
data = pd.read_csv('student-mat.csv')
data.head()
# The columns are not split correctly; try a semicolon separator next.

data = pd.read_csv('student-mat.csv', sep=';')
data.head()
# This worked.

# Inspect the dataset.
# DataFrame.info is a method and has to be called; it prints its summary
# itself and returns None, so it is not wrapped in print().
data.info()

'''
[395 rows x 33 columns]

'''
data.keys()
'''
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
'''

<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
5       GP   M   16       U     LE3       T     4     3  services     other   
6       GP   M   16       U     LE3       T     2     2     other     other   
7       GP   F   17       U     GT3       A     4     4     other   teacher   
8       GP   M   15       U     LE3       A     3     2  services     other   
9       GP   M   15       U     GT3       T     3     4     other     other   
10      GP   F   15       U     GT3       T     4     4   teacher    health   
11      GP   F   15       U     GT3       T     2     1  services     other   
12      GP   M   15       U     LE3       T     4     4    health  services   
13      GP   M   15       U     GT3       T     4     3   teacher     other   
14      GP   M   15       U     GT3       A     2     2     other     other   
15      GP   F   16       U     GT3       T     4     4    health     other   
16      GP   F   16       U     GT3       T     4     4  services  services   
17      GP   F   16       U     GT3       T     3     3     other     other   
18      GP   M   17       U     GT3       T     3     2  services  services   
19      GP   M   16       U     LE3       T     4     3    health     other   
20      GP   M   15       U     GT3       T     4     3   teacher     other   
21      GP   M   15       U     GT3       T     4     4    health    health   
22      GP   M   16       U     LE3       T     4     2   teacher     other   
23      GP   M   16       U     LE3       T     2     2     other     other   
24      GP   F   15       R     GT3       T     2     4  services    health   
25      GP   F   16       U     GT3       T     2     2  services  services   
26      GP   M   15       U     GT3       T     2     2     other     other   
27      GP   M   15       U     GT3       T     4     2    health  services   
28      GP   M   16       U     LE3       A     3     4  services     other   
29      GP   M   16       U     GT3       T     4     4   teacher   teacher   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
365     MS   M   18       R     GT3       T     1     3   at_home     other   
366     MS   M   18       U     LE3       T     4     4   teacher  services   
367     MS   F   17       R     GT3       T     1     1     other  services   
368     MS   F   18       U     GT3       T     2     3   at_home  services   
369     MS   F   18       R     GT3       T     4     4     other   teacher   
370     MS   F   19       U     LE3       T     3     2  services  services   
371     MS   M   18       R     LE3       T     1     2   at_home  services   
372     MS   F   17       U     GT3       T     2     2     other   at_home   
373     MS   F   17       R     GT3       T     1     2     other     other   
374     MS   F   18       R     LE3       T     4     4     other     other   
375     MS   F   18       R     GT3       T     1     1     other     other   
376     MS   F   20       U     GT3       T     4     2    health     other   
377     MS   F   18       R     LE3       T     4     4   teacher  services   
378     MS   F   18       U     GT3       T     3     3     other     other   
379     MS   F   17       R     GT3       T     3     1   at_home     other   
380     MS   M   18       U     GT3       T     4     4   teacher   teacher   
381     MS   M   18       R     GT3       T     2     1     other     other   
382     MS   M   17       U     GT3       T     2     3     other  services   
383     MS   M   19       R     GT3       T     1     1     other  services   
384     MS   M   18       R     GT3       T     4     2     other     other   
385     MS   F   18       R     GT3       T     2     2   at_home     other   
386     MS   F   18       R     GT3       T     4     4   teacher   at_home   
387     MS   F   19       R     GT3       T     2     3  services     other   
388     MS   F   18       U     LE3       T     3     1   teacher  services   
389     MS   F   18       U     GT3       T     1     1     other     other   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0    ...      4        3      4     1     1      3        6   5   6   6  
1    ...      5        3      3     1     1      3        4   5   5   6  
2    ...      4        3      2     2     3      3       10   7   8  10  
3    ...      3        2      2     1     1      5        2  15  14  15  
4    ...      4        3      2     1     2      5        4   6  10  10  
5    ...      5        4      2     1     2      5       10  15  15  15  
6    ...      4        4      4     1     1      3        0  12  12  11  
7    ...      4        1      4     1     1      1        6   6   5   6  
8    ...      4        2      2     1     1      1        0  16  18  19  
9    ...      5        5      1     1     1      5        0  14  15  15  
10   ...      3        3      3     1     2      2        0  10   8   9  
11   ...      5        2      2     1     1      4        4  10  12  12  
12   ...      4        3      3     1     3      5        2  14  14  14  
13   ...      5        4      3     1     2      3        2  10  10  11  
14   ...      4        5      2     1     1      3        0  14  16  16  
15   ...      4        4      4     1     2      2        4  14  14  14  
16   ...      3        2      3     1     2      2        6  13  14  14  
17   ...      5        3      2     1     1      4        4   8  10  10  
18   ...      5        5      5     2     4      5       16   6   5   5  
19   ...      3        1      3     1     3      5        4   8  10  10  
20   ...      4        4      1     1     1      1        0  13  14  15  
21   ...      5        4      2     1     1      5        0  12  15  15  
22   ...      4        5      1     1     3      5        2  15  15  16  
23   ...      5        4      4     2     4      5        0  13  13  12  
24   ...      4        3      2     1     1      5        2  10   9   8  
25   ...      1        2      2     1     3      5       14   6   9   8  
26   ...      4        2      2     1     2      5        2  12  12  11  
27   ...      2        2      4     2     4      1        4  15  16  15  
28   ...      5        3      3     1     1      5        4  11  11  11  
29   ...      4        4      5     5     5      5       16  10  12  11  
..   ...    ...      ...    ...   ...   ...    ...      ...  ..  ..  ..  
365  ...      3        3      4     2     4      3        4  10  10  10  
366  ...      4        2      2     2     2      5        0  13  13  13  
367  ...      5        2      1     1     2      1        0   7   6   0  
368  ...      5        2      3     1     2      4        0  11  10  10  
369  ...      3        2      2     4     2      5       10  14  12  11  
370  ...      3        2      2     1     1      3        4   7   7   9  
371  ...      4        3      3     2     3      3        3  14  12  12  
372  ...      3        4      3     1     1      3        8  13  11  11  
373  ...      3        5      5     1     3      1       14   6   5   5  
374  ...      5        4      4     1     1      1        0  19  18  19  
375  ...      4        3      2     1     2      4        2   8   8  10  
376  ...      5        4      3     1     1      3        4  15  14  15  
377  ...      5        4      3     3     4      2        4   8   9  10  
378  ...      4        1      3     1     2      1        0  15  15  15  
379  ...      4        5      4     2     3      1       17  10  10  10  
380  ...      3        2      4     1     4      2        4  15  14  14  
381  ...      4        4      3     1     3      5        5   7   6   7  
382  ...      4        4      3     1     1      3        2  11  11  10  
383  ...      4        3      2     1     3      5        0   6   5   0  
384  ...      5        4      3     4     3      3       14   6   5   5  
385  ...      5        3      3     1     3      4        2  10   9  10  
386  ...      4        4      3     2     2      5        7   6   5   6  
387  ...      5        4      2     1     2      5        0   7   5   0  
388  ...      4        3      4     1     1      1        0   7   9   8  
389  ...      1        1      1     1     1      5        0   6   5   0  
390  ...      5        5      4     4     5      4       11   9   9   9  
391  ...      2        4      5     3     4      2        3  14  16  16  
392  ...      5        5      3     3     3      3        3  10   8   7  
393  ...      4        4      1     3     4      5        0  11  12  10  
394  ...      3        2      3     3     3      5        5   8   9   9  

[395 rows x 33 columns]>

"\nIndex(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',\n       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',\n       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',\n       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',\n       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],\n      dtype='object')\n"

# Look up what each column (key) means by printing the dataset description file
path = 'student.txt'
with open(path, encoding='utf-8') as f:
    s = f.read()
print(s)

# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12 guardian - student's guardian (nominal: "mother", "father" or "other")
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)

# these grades are related with the course subject, Math or Portuguese:
31 G1 - first period grade (numeric: from 0 to 20)
31 G2 - second period grade (numeric: from 0 to 20)
32 G3 - final grade (numeric: from 0 to 20, output target)

Additional note: there are several (382) students that belong to both datasets . 
These students can be identified by searching for identical attributes
that characterize each student, as shown in the annexed R file.

# Qualitative (categorical) data
data['sex'].head()

0    F
1    F
2    F
3    F
4    F
Name: sex, dtype: object

# Quantitative (numeric) data
data['absences'].head()

0     6
1     4
2    10
3     2
4     4
Name: absences, dtype: int64

# Mean age for each sex
data['age'].groupby(data['sex']).mean()

sex
F    16.730769
M    16.657754
Name: age, dtype: float64

# chapter 3-3 Descriptive statistics
# Plot the frequency distribution (histogram) of the number of absences
x = data['absences']
plt.hist(x, bins=30)

# The distribution is long-tailed and skewed to the right.

(array([183.,  61.,  43.,  25.,  32.,  15.,  11.,   6.,   8.,   2.,   2.,
          1.,   1.,   0.,   0.,   1.,   1.,   0.,   0.,   0.,   0.,   1.,
          1.,   0.,   0.,   0.,   0.,   0.,   0.,   1.]),
 array([ 0. ,  2.5,  5. ,  7.5, 10. , 12.5, 15. , 17.5, 20. , 22.5, 25. ,
        27.5, 30. , 32.5, 35. , 37.5, 40. , 42.5, 45. , 47.5, 50. , 52.5,
        55. , 57.5, 60. , 62.5, 65. , 67.5, 70. , 72.5, 75. ]),
 <a list of 30 Patch objects>)

# Mean, median and mode of the absence counts, one per line
print(
    data['absences'].mean(),
    data['absences'].median(),
    data['absences'].mode(),
    sep='\n',
)

5.708860759493671
4.0
0    0
dtype: int64

# Variance and standard deviation. pandas defaults to ddof=1, so these are the
# unbiased sample estimates (divide by n-1, not n); pass ddof=0 for the
# population versions.
print(data['absences'].var())
print(data['absences'].std())

64.04954057700951
8.003095687108177

# Summary statistics and percentile values
data.describe()

# Interquartile range: the spread between the 25th and 75th percentiles
print(data['absences'].describe().index)

data['absences'].quantile(0.75) - data['absences'].quantile(0.25)

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

8.0

# Box plots: first look at the G1 values that will be plotted
data['G1'].head()

0     5
1     5
2     7
3    15
4     6
Name: G1, dtype: int64

plt.boxplot(data['G1'])
# From bottom to top: minimum, first quartile, median, third quartile, maximum

{'whiskers': [<matplotlib.lines.Line2D at 0x2700aa95978>,
  <matplotlib.lines.Line2D at 0x2700aa95d68>],
 'caps': [<matplotlib.lines.Line2D at 0x2700aa95e80>,
  <matplotlib.lines.Line2D at 0x2700aa9a4a8>],
 'boxes': [<matplotlib.lines.Line2D at 0x2700aa955c0>],
 'medians': [<matplotlib.lines.Line2D at 0x2700aa9a828>],
 'fliers': [<matplotlib.lines.Line2D at 0x2700aa9aba8>],
 'means': []}

plt.boxplot(data['absences'])
# Outliers are drawn separately and are not used as the maximum (upper whisker).

{'whiskers': [<matplotlib.lines.Line2D at 0x2700a7b5e80>,
  <matplotlib.lines.Line2D at 0x2700a794550>],
 'caps': [<matplotlib.lines.Line2D at 0x2700a7948d0>,
  <matplotlib.lines.Line2D at 0x2700a794c50>],
 'boxes': [<matplotlib.lines.Line2D at 0x2700a7b5cf8>],
 'medians': [<matplotlib.lines.Line2D at 0x2700a794fd0>],
 'fliers': [<matplotlib.lines.Line2D at 0x2700a794f60>],
 'means': []}

# Pass a list of series to draw several box plots side by side
plt.boxplot([data['G1'], data['G2'], data['G3']])
plt.grid(True)

# Coefficient of variation: CV = standard deviation / mean
cv = data['absences'].std() / data['absences'].mean()
cv

1.402

# Standard deviation of every numeric column at once.
# numeric_only=True makes the column selection explicit: older pandas dropped
# string columns such as 'school' silently, while pandas 2.0+ raises TypeError.
data.std(numeric_only=True)

age           1.276043
Medu          1.094735
Fedu          1.088201
traveltime    0.697505
studytime     0.839240
failures      0.743651
famrel        0.896659
freetime      0.998862
goout         1.113278
Dalc          0.890741
Walc          1.287897
health        1.390303
absences      8.003096
G1            3.319195
G2            3.761505
G3            4.581443
dtype: float64

# Coefficient of variation (std / mean) of every numeric column at once.
# numeric_only=True makes the column selection explicit: older pandas dropped
# string columns such as 'school' silently, while pandas 2.0+ raises TypeError.
data.std(numeric_only=True) / data.mean(numeric_only=True)

age           0.076427
Medu          0.398177
Fedu          0.431565
traveltime    0.481668
studytime     0.412313
failures      2.225319
famrel        0.227330
freetime      0.308725
goout         0.358098
Dalc          0.601441
Walc          0.562121
health        0.391147
absences      1.401873
G1            0.304266
G2            0.351086
G3            0.439881
dtype: float64

# Scatter plot and correlation coefficient: examine the relationship between G1 and G3

plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')
plt.xlim(0, 30)
plt.ylim(0, 30)
plt.grid(True)

# The string below is the cell's displayed value. It notes that several rows
# have G3 == 0 even though G1 has a value, and that the reason is unclear.
'''
G1に値があるのでG3がゼロのデータが複数ある。→謎
'''

'\nG1に値があるのでG3がゼロのデータが複数ある。→謎\n'

# Relationship between two variables: covariance
# (the mean of the products of the deviations from each mean).
#
# np.cov returns the 2x2 covariance matrix
#   [[var(G1),     cov(G1, G3)],
#    [cov(G1, G3), var(G3)    ]]
# e.g. array([[11.017, 12.188],
#             [12.188, 20.99 ]])
# It is printed explicitly because a bare expression in the middle of a cell
# is evaluated and then discarded.
print(np.cov(data['G1'], data['G3']))

# The diagonal of the matrix matches the per-column variances.
print(data['G1'].var())
print(data['G3'].var())

11.017053267364899
20.989616397866737

# Correlation coefficient: covariance depends on the scale of the variables,
# so it is normalised by dividing by the standard deviation of each variable.
# scipy.stats is imported explicitly: `import scipy as sp` alone is not
# guaranteed to load the stats submodule on every SciPy version.
from scipy import stats
stats.pearsonr(data['G1'], data['G3'])

# Correlation coefficient: 0.8014679320174141
# p-value                : 9.001430312276602e-90

# Note: a correlation coefficient says nothing about causality.

(0.8014679320174141, 9.001430312276602e-90)

# Correlation matrix: correlation coefficients for every pair of variables (pass them as a list)
np.corrcoef([data['G1'], data['G2'], data['G3']])

array([[1.   , 0.852, 0.801],
       [0.852, 1.   , 0.905],
       [0.801, 0.905, 1.   ]])

# Draw histograms and scatter plots for all the selected variables with sns.pairplot
# Check the relationship between alcohol consumption and grades
sns.pairplot(data[['Dalc', 'Walc', 'G1', 'G3']])

<seaborn.axisgrid.PairGrid at 0x27009460cf8>

# To extract several columns, pass the column names as a list
data[['Dalc','Walc']].head()

# Exercise: show the summary statistics of the Portuguese-course data.
# The path is handed straight to pandas, which opens and closes the file itself
# and decodes it as UTF-8 rather than with the platform locale encoding that a
# bare open() would use.
file = 'student-por.csv'
data = pd.read_csv(file, sep=';')

data.head()

data.describe()

# Exercise: merge the Portuguese-course and Math-course datasets.
# The path is handed straight to pandas, which opens and closes the file itself
# and decodes it as UTF-8 rather than with the platform locale encoding that a
# bare open() would use.
file2 = 'student-mat.csv'
data2 = pd.read_csv(file2, sep=';')

data2.head()

# Merge on the listed key columns; every other column that exists in both
# frames gets the '_por' / '_math' suffix.
merge_keys = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet']

data_m = pd.merge(data, data2, on=merge_keys, suffixes=('_por', '_math'))
data_m.head()

# Exercise: pair plot of the parents' education levels and the final Math grade
sns.pairplot(data_m[['Medu', 'Fedu', 'G3_math']])

<seaborn.axisgrid.PairGrid at 0x2702ec15748>

# chapter 3-4 Simple linear regression
# Take a closer look at the Math G1 and G3 grades.
# The path is handed straight to pandas, which opens and closes the file itself
# and decodes it as UTF-8 rather than with the platform locale encoding that a
# bare open() would use.
file_path = 'student-mat.csv'
data = pd.read_csv(file_path, sep=';')

print(data.keys())
plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')

# Next: build a simple linear regression model that predicts G3 from G1.
# G1 is called the explanatory variable and G3 the target (response) variable.

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

Text(0, 0.5, 'G3')

from sklearn import linear_model

# The explanatory variable must be passed as a 2-D array (one row per sample),
# even when there is only a single feature; the target is a 1-D array.
X = data[['G1']].values
Y = data['G3'].values

# Create the estimator and fit it to the data
reg = linear_model.LinearRegression()
reg.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Show the fitted regression coefficient (slope) and the intercept
print('回帰係数：', reg.coef_)
print('切片：', reg.intercept_)

回帰係数： [1.106]
切片： -1.6528038288004616

# Show the regression result on the scatter plot
plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')

plt.plot(X, reg.predict(X)) # reuse X to draw the fitted line

[<matplotlib.lines.Line2D at 0x27036eae7b8>]

# Coefficient of determination R^2: how well the regression fits the data
print('決定係数：', reg.score(X, Y))

決定係数： 0.64235084605227

# Exercise: repeat the simple regression on the Portuguese-course data.
# The path is handed straight to pandas, which opens and closes the file itself
# and decodes it as UTF-8 rather than with the platform locale encoding that a
# bare open() would use.
file_path = 'student-por.csv'
data = pd.read_csv(file_path, sep=';')

X = data.loc[:, ['G1']].values  # 2-D array: one single-element row per sample
Y = data['G3'].values  # 1-D array of target values

from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Print the slope, intercept and R^2, then plot the data with the fitted line
print(reg.coef_, reg.intercept_, reg.score(X, Y))

plt.plot(X, Y, 'o')
plt.plot(X, reg.predict(X))

[0.973] 0.8203984121064565 0.6829156800171085

[<matplotlib.lines.Line2D at 0x270370d4240>]

# Exercise: download the red-wine quality data and save its summary statistics.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

import requests
import io

# The whole body is read through r.content below, so streaming is not needed.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Stop on an HTTP error instead of parsing an error page as CSV.
r.raise_for_status()

# NOTE: this rebinds `data` (a DataFrame in the earlier cells) to a byte buffer.
data = io.BytesIO(r.content)  # a plain CSV file, not a zip archive
df = pd.read_csv(data, sep=';')
df.head()

df_describe = df.describe()
df_describe.to_csv('df_describe.csv')

 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  18:32    <DIR>          .
2019/12/15  18:32    <DIR>          ..
2019/12/15  18:30             1,653 df_describe
2019/12/15  18:32             1,054 df_describe.csv
2019/12/15  18:30             1,653 df_describe.pickle
2019/12/15  13:54             3,206 student.txt
2019/12/15  13:54            56,993 student-mat.csv
2019/12/15  13:54               269 student-merge.R
2019/12/15  13:54            93,220 student-por.csv
               7 個のファイル             158,048 バイト
               2 個のディレクトリ  293,433,446,400 バイトの空き領域

# Pair plot of every column of the wine data
sns.pairplot(df)

# The string below keeps the earlier student-data pair plot for reference only;
# it is a plain string expression and the call inside it is not executed.
'''
# すべての変数のヒストグラム　散布図を書く sns.pariplot([])
# アルコールと成績の関係性を確認してみる
sns.pairplot(data[['Dalc', 'Walc', 'G1', 'G3']])
'''

<seaborn.axisgrid.PairGrid at 0x270377225c0>

# Display the wine-quality DataFrame
df

	school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0	GP;"F";18;"U";"GT3";"A";4;4;"at_home";"teacher...
1	GP;"F";17;"U";"GT3";"T";1;1;"at_home";"other";...
2	GP;"F";15;"U";"LE3";"T";1;1;"at_home";"other";...
3	GP;"F";15;"U";"GT3";"T";4;2;"health";"services...
4	GP;"F";16;"U";"GT3";"T";3;3;"other";"other";"h...

	age	Medu	Fedu	traveltime	studytime	failures	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2	G3
count	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000	395.000000
mean	16.696203	2.749367	2.521519	1.448101	2.035443	0.334177	3.944304	3.235443	3.108861	1.481013	2.291139	3.554430	5.708861	10.908861	10.713924	10.415190
std	1.276043	1.094735	1.088201	0.697505	0.839240	0.743651	0.896659	0.998862	1.113278	0.890741	1.287897	1.390303	8.003096	3.319195	3.761505	4.581443
min	15.000000	0.000000	0.000000	1.000000	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	3.000000	0.000000	0.000000
25%	16.000000	2.000000	2.000000	1.000000	1.000000	0.000000	4.000000	3.000000	2.000000	1.000000	1.000000	3.000000	0.000000	8.000000	9.000000	8.000000
50%	17.000000	3.000000	2.000000	1.000000	2.000000	0.000000	4.000000	3.000000	3.000000	1.000000	2.000000	4.000000	4.000000	11.000000	11.000000	11.000000
75%	18.000000	4.000000	3.000000	2.000000	2.000000	0.000000	5.000000	4.000000	4.000000	2.000000	3.000000	5.000000	8.000000	13.000000	13.000000	14.000000
max	22.000000	4.000000	4.000000	4.000000	4.000000	3.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	75.000000	19.000000	19.000000	20.000000

	age	Medu	Fedu	traveltime	studytime	failures	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2	G3
count	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000	649.000000
mean	16.744222	2.514638	2.306626	1.568567	1.930663	0.221880	3.930663	3.180277	3.184900	1.502311	2.280431	3.536210	3.659476	11.399076	11.570108	11.906009
std	1.218138	1.134552	1.099931	0.748660	0.829510	0.593235	0.955717	1.051093	1.175766	0.924834	1.284380	1.446259	4.640759	2.745265	2.913639	3.230656
min	15.000000	0.000000	0.000000	1.000000	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000
25%	16.000000	2.000000	1.000000	1.000000	1.000000	0.000000	4.000000	3.000000	2.000000	1.000000	1.000000	2.000000	0.000000	10.000000	10.000000	10.000000
50%	17.000000	2.000000	2.000000	1.000000	2.000000	0.000000	4.000000	3.000000	3.000000	1.000000	2.000000	4.000000	2.000000	11.000000	11.000000	12.000000
75%	18.000000	4.000000	3.000000	2.000000	2.000000	0.000000	5.000000	4.000000	4.000000	2.000000	3.000000	5.000000	6.000000	13.000000	13.000000	14.000000
max	22.000000	4.000000	4.000000	4.000000	4.000000	3.000000	5.000000	5.000000	5.000000	5.000000	5.000000	5.000000	32.000000	19.000000	19.000000	19.000000

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.9968	3.20	0.68	9.8	5
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.9970	3.26	0.65	9.8	5
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.9980	3.16	0.58	9.8	6
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.4	0.700	0.00	1.9	0.076	11.0	34.0	0.99780	3.51	0.56	9.4	5
1	7.8	0.880	0.00	2.6	0.098	25.0	67.0	0.99680	3.20	0.68	9.8	5
2	7.8	0.760	0.04	2.3	0.092	15.0	54.0	0.99700	3.26	0.65	9.8	5
3	11.2	0.280	0.56	1.9	0.075	17.0	60.0	0.99800	3.16	0.58	9.8	6
4	7.4	0.700	0.00	1.9	0.076	11.0	34.0	0.99780	3.51	0.56	9.4	5
5	7.4	0.660	0.00	1.8	0.075	13.0	40.0	0.99780	3.51	0.56	9.4	5
6	7.9	0.600	0.06	1.6	0.069	15.0	59.0	0.99640	3.30	0.46	9.4	5
7	7.3	0.650	0.00	1.2	0.065	15.0	21.0	0.99460	3.39	0.47	10.0	7
8	7.8	0.580	0.02	2.0	0.073	9.0	18.0	0.99680	3.36	0.57	9.5	7
9	7.5	0.500	0.36	6.1	0.071	17.0	102.0	0.99780	3.35	0.80	10.5	5
10	6.7	0.580	0.08	1.8	0.097	15.0	65.0	0.99590	3.28	0.54	9.2	5
11	7.5	0.500	0.36	6.1	0.071	17.0	102.0	0.99780	3.35	0.80	10.5	5
12	5.6	0.615	0.00	1.6	0.089	16.0	59.0	0.99430	3.58	0.52	9.9	5
13	7.8	0.610	0.29	1.6	0.114	9.0	29.0	0.99740	3.26	1.56	9.1	5
14	8.9	0.620	0.18	3.8	0.176	52.0	145.0	0.99860	3.16	0.88	9.2	5
15	8.9	0.620	0.19	3.9	0.170	51.0	148.0	0.99860	3.17	0.93	9.2	5
16	8.5	0.280	0.56	1.8	0.092	35.0	103.0	0.99690	3.30	0.75	10.5	7
17	8.1	0.560	0.28	1.7	0.368	16.0	56.0	0.99680	3.11	1.28	9.3	5
18	7.4	0.590	0.08	4.4	0.086	6.0	29.0	0.99740	3.38	0.50	9.0	4
19	7.9	0.320	0.51	1.8	0.341	17.0	56.0	0.99690	3.04	1.08	9.2	6
20	8.9	0.220	0.48	1.8	0.077	29.0	60.0	0.99680	3.39	0.53	9.4	6
21	7.6	0.390	0.31	2.3	0.082	23.0	71.0	0.99820	3.52	0.65	9.7	5
22	7.9	0.430	0.21	1.6	0.106	10.0	37.0	0.99660	3.17	0.91	9.5	5
23	8.5	0.490	0.11	2.3	0.084	9.0	67.0	0.99680	3.17	0.53	9.4	5
24	6.9	0.400	0.14	2.4	0.085	21.0	40.0	0.99680	3.43	0.63	9.7	6
25	6.3	0.390	0.16	1.4	0.080	11.0	23.0	0.99550	3.34	0.56	9.3	5
26	7.6	0.410	0.24	1.8	0.080	4.0	11.0	0.99620	3.28	0.59	9.5	5
27	7.9	0.430	0.21	1.6	0.106	10.0	37.0	0.99660	3.17	0.91	9.5	5
28	7.1	0.710	0.00	1.9	0.080	14.0	35.0	0.99720	3.47	0.55	9.4	5
29	7.8	0.645	0.00	2.0	0.082	8.0	16.0	0.99640	3.38	0.59	9.8	6
...	...	...	...	...	...	...	...	...	...	...	...	...
1569	6.2	0.510	0.14	1.9	0.056	15.0	34.0	0.99396	3.48	0.57	11.5	6
1570	6.4	0.360	0.53	2.2	0.230	19.0	35.0	0.99340	3.37	0.93	12.4	6
1571	6.4	0.380	0.14	2.2	0.038	15.0	25.0	0.99514	3.44	0.65	11.1	6
1572	7.3	0.690	0.32	2.2	0.069	35.0	104.0	0.99632	3.33	0.51	9.5	5
1573	6.0	0.580	0.20	2.4	0.075	15.0	50.0	0.99467	3.58	0.67	12.5	6
1574	5.6	0.310	0.78	13.9	0.074	23.0	92.0	0.99677	3.39	0.48	10.5	6
1575	7.5	0.520	0.40	2.2	0.060	12.0	20.0	0.99474	3.26	0.64	11.8	6
1576	8.0	0.300	0.63	1.6	0.081	16.0	29.0	0.99588	3.30	0.78	10.8	6
1577	6.2	0.700	0.15	5.1	0.076	13.0	27.0	0.99622	3.54	0.60	11.9	6
1578	6.8	0.670	0.15	1.8	0.118	13.0	20.0	0.99540	3.42	0.67	11.3	6
1579	6.2	0.560	0.09	1.7	0.053	24.0	32.0	0.99402	3.54	0.60	11.3	5
1580	7.4	0.350	0.33	2.4	0.068	9.0	26.0	0.99470	3.36	0.60	11.9	6
1581	6.2	0.560	0.09	1.7	0.053	24.0	32.0	0.99402	3.54	0.60	11.3	5
1582	6.1	0.715	0.10	2.6	0.053	13.0	27.0	0.99362	3.57	0.50	11.9	5
1583	6.2	0.460	0.29	2.1	0.074	32.0	98.0	0.99578	3.33	0.62	9.8	5
1584	6.7	0.320	0.44	2.4	0.061	24.0	34.0	0.99484	3.29	0.80	11.6	7
1585	7.2	0.390	0.44	2.6	0.066	22.0	48.0	0.99494	3.30	0.84	11.5	6
1586	7.5	0.310	0.41	2.4	0.065	34.0	60.0	0.99492	3.34	0.85	11.4	6
1587	5.8	0.610	0.11	1.8	0.066	18.0	28.0	0.99483	3.55	0.66	10.9	6
1588	7.2	0.660	0.33	2.5	0.068	34.0	102.0	0.99414	3.27	0.78	12.8	6
1589	6.6	0.725	0.20	7.8	0.073	29.0	79.0	0.99770	3.29	0.54	9.2	5
1590	6.3	0.550	0.15	1.8	0.077	26.0	35.0	0.99314	3.32	0.82	11.6	6
1591	5.4	0.740	0.09	1.7	0.089	16.0	26.0	0.99402	3.67	0.56	11.6	6
1592	6.3	0.510	0.13	2.3	0.076	29.0	40.0	0.99574	3.42	0.75	11.0	6
1593	6.8	0.620	0.08	1.9	0.068	28.0	38.0	0.99651	3.42	0.82	9.5	6
1594	6.2	0.600	0.08	2.0	0.090	32.0	44.0	0.99490	3.45	0.58	10.5	5
1595	5.9	0.550	0.10	2.2	0.062	39.0	51.0	0.99512	3.52	0.76	11.2	6
1596	6.3	0.510	0.13	2.3	0.076	29.0	40.0	0.99574	3.42	0.75	11.0	6
1597	5.9	0.645	0.12	2.0	0.075	32.0	44.0	0.99547	3.57	0.71	10.2	5
1598	6.0	0.310	0.47	3.6	0.067	18.0	42.0	0.99549	3.39	0.66	11.0	6

	school	sex	age	address	famsize	Pstatus	Medu	Fedu	Mjob	Fjob	...	famrel	freetime	goout	Dalc	Walc	health	absences	G1	G2	G3
0	GP	F	18	U	GT3	A	4	4	at_home	teacher	...	4	3	4	1	1	3	6	5	6	6
1	GP	F	17	U	GT3	T	1	1	at_home	other	...	5	3	3	1	1	3	4	5	5	6
2	GP	F	15	U	LE3	T	1	1	at_home	other	...	4	3	2	2	3	3	10	7	8	10
3	GP	F	15	U	GT3	T	4	2	health	services	...	3	2	2	1	1	5	2	15	14	15
4	GP	F	16	U	GT3	T	3	3	other	other	...	4	3	2	1	2	5	4	6	10	10