In [25]:
# chapter 3 記述統計 と 単回帰分析(推測統計の一つ)
In [61]:
# 演算用モジュール
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# 可視化モジュール
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()
%matplotlib inline

# 機械学習モジュール
from sklearn import linear_model

%precision 3
Out[61]:
'%.3f'
In [27]:
pwd
Out[27]:
'C:\\Users\\keita\\chapter3'
In [28]:
mkdir chapter3
サブディレクトリまたはファイル chapter3 は既に存在します。
In [29]:
cd chapter3
C:\Users\keita\chapter3\chapter3
In [30]:
ls
 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  12:52    <DIR>          .
2019/12/15  12:52    <DIR>          ..
2019/12/15  12:52             3,206 student.txt
2019/12/15  12:52            56,993 student-mat.csv
2019/12/15  12:52               269 student-merge.R
2019/12/15  12:52            93,220 student-por.csv
               4 個のファイル             153,688 バイト
               2 個のディレクトリ  293,460,262,912 バイトの空き領域
In [31]:
import requests
import zipfile
from io import StringIO
import io
In [32]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00356/student.zip'

r = requests.get(url, stream=True)

z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
In [33]:
ls
 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  12:52    <DIR>          .
2019/12/15  12:52    <DIR>          ..
2019/12/15  13:54             3,206 student.txt
2019/12/15  13:54            56,993 student-mat.csv
2019/12/15  13:54               269 student-merge.R
2019/12/15  13:54            93,220 student-por.csv
               4 個のファイル             153,688 バイト
               2 個のディレクトリ  293,460,262,912 バイトの空き領域
In [34]:
data = pd.read_csv('student-mat.csv')
data.head()
# うまく区切れていない。次はセミコロンで分けてみる
Out[34]:
school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0 GP;"F";18;"U";"GT3";"A";4;4;"at_home";"teacher...
1 GP;"F";17;"U";"GT3";"T";1;1;"at_home";"other";...
2 GP;"F";15;"U";"LE3";"T";1;1;"at_home";"other";...
3 GP;"F";15;"U";"GT3";"T";4;2;"health";"services...
4 GP;"F";16;"U";"GT3";"T";3;3;"other";"other";"h...
In [35]:
data = pd.read_csv('student-mat.csv', sep=';')
data.head()
# うまくいった。
Out[35]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5 4 6 10 10

5 rows × 33 columns

In [36]:
# データ情報を分析
print(data.info)

'''
[395 rows x 33 columns]

'''
data.keys()
'''
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
'''
<bound method DataFrame.info of     school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
5       GP   M   16       U     LE3       T     4     3  services     other   
6       GP   M   16       U     LE3       T     2     2     other     other   
7       GP   F   17       U     GT3       A     4     4     other   teacher   
8       GP   M   15       U     LE3       A     3     2  services     other   
9       GP   M   15       U     GT3       T     3     4     other     other   
10      GP   F   15       U     GT3       T     4     4   teacher    health   
11      GP   F   15       U     GT3       T     2     1  services     other   
12      GP   M   15       U     LE3       T     4     4    health  services   
13      GP   M   15       U     GT3       T     4     3   teacher     other   
14      GP   M   15       U     GT3       A     2     2     other     other   
15      GP   F   16       U     GT3       T     4     4    health     other   
16      GP   F   16       U     GT3       T     4     4  services  services   
17      GP   F   16       U     GT3       T     3     3     other     other   
18      GP   M   17       U     GT3       T     3     2  services  services   
19      GP   M   16       U     LE3       T     4     3    health     other   
20      GP   M   15       U     GT3       T     4     3   teacher     other   
21      GP   M   15       U     GT3       T     4     4    health    health   
22      GP   M   16       U     LE3       T     4     2   teacher     other   
23      GP   M   16       U     LE3       T     2     2     other     other   
24      GP   F   15       R     GT3       T     2     4  services    health   
25      GP   F   16       U     GT3       T     2     2  services  services   
26      GP   M   15       U     GT3       T     2     2     other     other   
27      GP   M   15       U     GT3       T     4     2    health  services   
28      GP   M   16       U     LE3       A     3     4  services     other   
29      GP   M   16       U     GT3       T     4     4   teacher   teacher   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
365     MS   M   18       R     GT3       T     1     3   at_home     other   
366     MS   M   18       U     LE3       T     4     4   teacher  services   
367     MS   F   17       R     GT3       T     1     1     other  services   
368     MS   F   18       U     GT3       T     2     3   at_home  services   
369     MS   F   18       R     GT3       T     4     4     other   teacher   
370     MS   F   19       U     LE3       T     3     2  services  services   
371     MS   M   18       R     LE3       T     1     2   at_home  services   
372     MS   F   17       U     GT3       T     2     2     other   at_home   
373     MS   F   17       R     GT3       T     1     2     other     other   
374     MS   F   18       R     LE3       T     4     4     other     other   
375     MS   F   18       R     GT3       T     1     1     other     other   
376     MS   F   20       U     GT3       T     4     2    health     other   
377     MS   F   18       R     LE3       T     4     4   teacher  services   
378     MS   F   18       U     GT3       T     3     3     other     other   
379     MS   F   17       R     GT3       T     3     1   at_home     other   
380     MS   M   18       U     GT3       T     4     4   teacher   teacher   
381     MS   M   18       R     GT3       T     2     1     other     other   
382     MS   M   17       U     GT3       T     2     3     other  services   
383     MS   M   19       R     GT3       T     1     1     other  services   
384     MS   M   18       R     GT3       T     4     2     other     other   
385     MS   F   18       R     GT3       T     2     2   at_home     other   
386     MS   F   18       R     GT3       T     4     4   teacher   at_home   
387     MS   F   19       R     GT3       T     2     3  services     other   
388     MS   F   18       U     LE3       T     3     1   teacher  services   
389     MS   F   18       U     GT3       T     1     1     other     other   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0    ...      4        3      4     1     1      3        6   5   6   6  
1    ...      5        3      3     1     1      3        4   5   5   6  
2    ...      4        3      2     2     3      3       10   7   8  10  
3    ...      3        2      2     1     1      5        2  15  14  15  
4    ...      4        3      2     1     2      5        4   6  10  10  
5    ...      5        4      2     1     2      5       10  15  15  15  
6    ...      4        4      4     1     1      3        0  12  12  11  
7    ...      4        1      4     1     1      1        6   6   5   6  
8    ...      4        2      2     1     1      1        0  16  18  19  
9    ...      5        5      1     1     1      5        0  14  15  15  
10   ...      3        3      3     1     2      2        0  10   8   9  
11   ...      5        2      2     1     1      4        4  10  12  12  
12   ...      4        3      3     1     3      5        2  14  14  14  
13   ...      5        4      3     1     2      3        2  10  10  11  
14   ...      4        5      2     1     1      3        0  14  16  16  
15   ...      4        4      4     1     2      2        4  14  14  14  
16   ...      3        2      3     1     2      2        6  13  14  14  
17   ...      5        3      2     1     1      4        4   8  10  10  
18   ...      5        5      5     2     4      5       16   6   5   5  
19   ...      3        1      3     1     3      5        4   8  10  10  
20   ...      4        4      1     1     1      1        0  13  14  15  
21   ...      5        4      2     1     1      5        0  12  15  15  
22   ...      4        5      1     1     3      5        2  15  15  16  
23   ...      5        4      4     2     4      5        0  13  13  12  
24   ...      4        3      2     1     1      5        2  10   9   8  
25   ...      1        2      2     1     3      5       14   6   9   8  
26   ...      4        2      2     1     2      5        2  12  12  11  
27   ...      2        2      4     2     4      1        4  15  16  15  
28   ...      5        3      3     1     1      5        4  11  11  11  
29   ...      4        4      5     5     5      5       16  10  12  11  
..   ...    ...      ...    ...   ...   ...    ...      ...  ..  ..  ..  
365  ...      3        3      4     2     4      3        4  10  10  10  
366  ...      4        2      2     2     2      5        0  13  13  13  
367  ...      5        2      1     1     2      1        0   7   6   0  
368  ...      5        2      3     1     2      4        0  11  10  10  
369  ...      3        2      2     4     2      5       10  14  12  11  
370  ...      3        2      2     1     1      3        4   7   7   9  
371  ...      4        3      3     2     3      3        3  14  12  12  
372  ...      3        4      3     1     1      3        8  13  11  11  
373  ...      3        5      5     1     3      1       14   6   5   5  
374  ...      5        4      4     1     1      1        0  19  18  19  
375  ...      4        3      2     1     2      4        2   8   8  10  
376  ...      5        4      3     1     1      3        4  15  14  15  
377  ...      5        4      3     3     4      2        4   8   9  10  
378  ...      4        1      3     1     2      1        0  15  15  15  
379  ...      4        5      4     2     3      1       17  10  10  10  
380  ...      3        2      4     1     4      2        4  15  14  14  
381  ...      4        4      3     1     3      5        5   7   6   7  
382  ...      4        4      3     1     1      3        2  11  11  10  
383  ...      4        3      2     1     3      5        0   6   5   0  
384  ...      5        4      3     4     3      3       14   6   5   5  
385  ...      5        3      3     1     3      4        2  10   9  10  
386  ...      4        4      3     2     2      5        7   6   5   6  
387  ...      5        4      2     1     2      5        0   7   5   0  
388  ...      4        3      4     1     1      1        0   7   9   8  
389  ...      1        1      1     1     1      5        0   6   5   0  
390  ...      5        5      4     4     5      4       11   9   9   9  
391  ...      2        4      5     3     4      2        3  14  16  16  
392  ...      5        5      3     3     3      3        3  10   8   7  
393  ...      4        4      1     3     4      5        0  11  12  10  
394  ...      3        2      3     3     3      5        5   8   9   9  

[395 rows x 33 columns]>
Out[36]:
"\nIndex(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',\n       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',\n       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',\n       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',\n       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],\n      dtype='object')\n"
In [37]:
# keysの内容を確認する
path = 'student.txt'
with open(path,encoding='utf-8') as f:
    s = f.read()
    print(s)   
# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12 guardian - student's guardian (nominal: "mother", "father" or "other")
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)

# these grades are related with the course subject, Math or Portuguese:
31 G1 - first period grade (numeric: from 0 to 20)
31 G2 - second period grade (numeric: from 0 to 20)
32 G3 - final grade (numeric: from 0 to 20, output target)

Additional note: there are several (382) students that belong to both datasets . 
These students can be identified by searching for identical attributes
that characterize each student, as shown in the annexed R file.

In [39]:
# 質的データ
data['sex'].head()
Out[39]:
0    F
1    F
2    F
3    F
4    F
Name: sex, dtype: object
In [41]:
# 量的データ
data['absences'].head()
Out[41]:
0     6
1     4
2    10
3     2
4     4
Name: absences, dtype: int64
In [49]:
# 性別を軸にして、年齢の平均値を計算する
data.groupby('sex')['age'].mean()
Out[49]:
sex
F    16.730769
M    16.657754
Name: age, dtype: float64
In [55]:
# chapter 3-3 記述統計
# 欠席日数の度数分布をかいてみる
x = data['absences']
plt.hist(x, bins=30)

# ロングテールで 右に歪みのある分布になっている。
Out[55]:
(array([183.,  61.,  43.,  25.,  32.,  15.,  11.,   6.,   8.,   2.,   2.,
          1.,   1.,   0.,   0.,   1.,   1.,   0.,   0.,   0.,   0.,   1.,
          1.,   0.,   0.,   0.,   0.,   0.,   0.,   1.]),
 array([ 0. ,  2.5,  5. ,  7.5, 10. , 12.5, 15. , 17.5, 20. , 22.5, 25. ,
        27.5, 30. , 32.5, 35. , 37.5, 40. , 42.5, 45. , 47.5, 50. , 52.5,
        55. , 57.5, 60. , 62.5, 65. , 67.5, 70. , 72.5, 75. ]),
 <a list of 30 Patch objects>)
In [62]:
# 平均値 中央値 最頻値
print(data['absences'].mean())
print(data['absences'].median())
print(data['absences'].mode())
5.708860759493671
4.0
0    0
dtype: int64
In [63]:
# 分散 標準偏差  n-1 でなくて n で割り算している。
print(data['absences'].var())
print(data['absences'].std())
64.04954057700951
8.003095687108177
In [66]:
# 要約統計量とパーセンタイル値
data.describe()
Out[66]:
age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3
count 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000 395.000000
mean 16.696203 2.749367 2.521519 1.448101 2.035443 0.334177 3.944304 3.235443 3.108861 1.481013 2.291139 3.554430 5.708861 10.908861 10.713924 10.415190
std 1.276043 1.094735 1.088201 0.697505 0.839240 0.743651 0.896659 0.998862 1.113278 0.890741 1.287897 1.390303 8.003096 3.319195 3.761505 4.581443
min 15.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 3.000000 0.000000 0.000000
25% 16.000000 2.000000 2.000000 1.000000 1.000000 0.000000 4.000000 3.000000 2.000000 1.000000 1.000000 3.000000 0.000000 8.000000 9.000000 8.000000
50% 17.000000 3.000000 2.000000 1.000000 2.000000 0.000000 4.000000 3.000000 3.000000 1.000000 2.000000 4.000000 4.000000 11.000000 11.000000 11.000000
75% 18.000000 4.000000 3.000000 2.000000 2.000000 0.000000 5.000000 4.000000 4.000000 2.000000 3.000000 5.000000 8.000000 13.000000 13.000000 14.000000
max 22.000000 4.000000 4.000000 4.000000 4.000000 3.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 75.000000 19.000000 19.000000 20.000000
In [71]:
# 四分位範囲 25%~75%に入る分
print(data['absences'].describe().keys())

data['absences'].describe()['75%'] - data['absences'].describe()['25%']
Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')
Out[71]:
8.0
In [76]:
# 箱ひげ図
data['G1'].head()
Out[76]:
0     5
1     5
2     7
3    15
4     6
Name: G1, dtype: int64
In [77]:
plt.boxplot(data['G1'])
# 下から 最小値 第1四分位 中央値 第3四分位 最大値
Out[77]:
{'whiskers': [<matplotlib.lines.Line2D at 0x2700aa95978>,
  <matplotlib.lines.Line2D at 0x2700aa95d68>],
 'caps': [<matplotlib.lines.Line2D at 0x2700aa95e80>,
  <matplotlib.lines.Line2D at 0x2700aa9a4a8>],
 'boxes': [<matplotlib.lines.Line2D at 0x2700aa955c0>],
 'medians': [<matplotlib.lines.Line2D at 0x2700aa9a828>],
 'fliers': [<matplotlib.lines.Line2D at 0x2700aa9aba8>],
 'means': []}
In [78]:
plt.boxplot(data['absences'])
# はずれ値は最大値に採用されない。
Out[78]:
{'whiskers': [<matplotlib.lines.Line2D at 0x2700a7b5e80>,
  <matplotlib.lines.Line2D at 0x2700a794550>],
 'caps': [<matplotlib.lines.Line2D at 0x2700a7948d0>,
  <matplotlib.lines.Line2D at 0x2700a794c50>],
 'boxes': [<matplotlib.lines.Line2D at 0x2700a7b5cf8>],
 'medians': [<matplotlib.lines.Line2D at 0x2700a794fd0>],
 'fliers': [<matplotlib.lines.Line2D at 0x2700a794f60>],
 'means': []}
In [82]:
# 並べるときはリストで与える
plt.boxplot([data['G1'], data['G2'], data['G3']])
plt.grid(True)
In [86]:
# 変動係数 CV = σ/mean
cv = data['absences'].std() / data['absences'].mean()
cv
Out[86]:
1.402
In [89]:
# 一気に標準偏差を求める
data.std()
Out[89]:
age           1.276043
Medu          1.094735
Fedu          1.088201
traveltime    0.697505
studytime     0.839240
failures      0.743651
famrel        0.896659
freetime      0.998862
goout         1.113278
Dalc          0.890741
Walc          1.287897
health        1.390303
absences      8.003096
G1            3.319195
G2            3.761505
G3            4.581443
dtype: float64
In [90]:
# 一気に変動係数を求める
data.std() / data.mean()
Out[90]:
age           0.076427
Medu          0.398177
Fedu          0.431565
traveltime    0.481668
studytime     0.412313
failures      2.225319
famrel        0.227330
freetime      0.308725
goout         0.358098
Dalc          0.601441
Walc          0.562121
health        0.391147
absences      1.401873
G1            0.304266
G2            0.351086
G3            0.439881
dtype: float64
In [98]:
# 散布図と相関係数 G1とG3の関係性を調べてみる

plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')
plt.xlim(0, 30)
plt.ylim(0, 30)
plt.grid(True)

'''
G1に値があるのでG3がゼロのデータが複数ある。→謎
'''
Out[98]:
'\nG1に値があるのでG3がゼロのデータが複数ある。→謎\n'
In [103]:
# 2つの変数の関係性 共分散 cov
# 偏差の積の和の平均

np.cov(data['G1'], data['G3'])

'''
G1分散 共分散
共分散 G2分散

array([[11.017, 12.188],
       [12.188, 20.99 ]])
'''
print(data['G1'].var())
print(data['G3'].var())
11.017053267364899
20.989616397866737
In [106]:
# 相関係数:共分散はスケールの影響を受けるので、それをなくす。
# 共分散をそれぞれの変数の標準偏差で割り算する
sp.stats.pearsonr(data['G1'], data['G3'])

# 相関係数:0.8014679320174141
# p値     :9.001430312276602e-90

# 相関係数では因果関係はわからないので注意
Out[106]:
(0.8014679320174141, 9.001430312276602e-90)
In [109]:
# 相関行列 すべての組み合わせで相関関係を算出する リストで入れる
np.corrcoef([data['G1'], data['G2'], data['G3']])
Out[109]:
array([[1.   , 0.852, 0.801],
       [0.852, 1.   , 0.905],
       [0.801, 0.905, 1.   ]])
In [112]:
# すべての変数のヒストグラム 散布図を書く sns.pariplot([])
# アルコールと成績の関係性を確認してみる
sns.pairplot(data[['Dalc', 'Walc', 'G1', 'G3']])
Out[112]:
<seaborn.axisgrid.PairGrid at 0x27009460cf8>
In [118]:
# 任意の列を抽出するときは 列をリストで渡す
data[['Dalc','Walc']].head()
Out[118]:
Dalc Walc
0 1 1
1 1 1
2 2 3
3 1 1
4 1 2
In [126]:
# 練習 要約統計量を表示する
file = 'student-por.csv'
with open(file) as f:
    data = pd.read_csv(f, sep = ';')

data.head()
Out[126]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3 4 0 11 11
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3 2 9 11 11
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3 6 12 13 12
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5 0 14 14 14
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5 0 11 13 13

5 rows × 33 columns

In [127]:
data.describe()
Out[127]:
age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3
count 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000 649.000000
mean 16.744222 2.514638 2.306626 1.568567 1.930663 0.221880 3.930663 3.180277 3.184900 1.502311 2.280431 3.536210 3.659476 11.399076 11.570108 11.906009
std 1.218138 1.134552 1.099931 0.748660 0.829510 0.593235 0.955717 1.051093 1.175766 0.924834 1.284380 1.446259 4.640759 2.745265 2.913639 3.230656
min 15.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 16.000000 2.000000 1.000000 1.000000 1.000000 0.000000 4.000000 3.000000 2.000000 1.000000 1.000000 2.000000 0.000000 10.000000 10.000000 10.000000
50% 17.000000 2.000000 2.000000 1.000000 2.000000 0.000000 4.000000 3.000000 3.000000 1.000000 2.000000 4.000000 2.000000 11.000000 11.000000 12.000000
75% 18.000000 4.000000 3.000000 2.000000 2.000000 0.000000 5.000000 4.000000 4.000000 2.000000 3.000000 5.000000 6.000000 13.000000 13.000000 14.000000
max 22.000000 4.000000 4.000000 4.000000 4.000000 3.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 32.000000 19.000000 19.000000 19.000000
In [130]:
# 練習:マージする
file2 = 'student-mat.csv'
with open(file2) as f:
    data2 = pd.read_csv(f, sep = ';')

data2.head()
Out[130]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5 4 6 10 10

5 rows × 33 columns

In [151]:
# キーを指定してマージする

merge_keys = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet']

data_m = pd.merge(data, data2, on = merge_keys, suffixes=( '_por', '_math'))
data_m.head()
Out[151]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel_math freetime_math goout_math Dalc_math Walc_math health_math absences_math G1_math G2_math G3_math
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5 4 6 10 10

5 rows × 53 columns

In [152]:
# 練習
sns.pairplot(data_m[['Medu', 'Fedu', 'G3_math']])
Out[152]:
<seaborn.axisgrid.PairGrid at 0x2702ec15748>
In [161]:
# chapter 3-4 単回帰分析
# 数学のG1 G3データをよくみてみます。
file_path = 'student-mat.csv'
with open(file_path) as f:
    data = pd.read_csv(f, sep=';')

print(data.keys())
plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')

# このG1の成績を元に G3の成績を予測するモデルを単回帰分析で作成する
# G1は説明変数 G3は目的変数という。
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
Out[161]:
Text(0, 0.5, 'G3')
In [168]:
from sklearn import linear_model

# インスタンスの作成
reg = linear_model.LinearRegression()
In [198]:
X = data.loc[:,['G1']].values # 説明変数は1変数でもベクトルとして入力しなければならない。
Y = data['G3'].values
reg.fit(X, Y)
Out[198]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [201]:
print('回帰係数:', reg.coef_)
print('切片:', reg.intercept_)
回帰係数: [1.106]
切片: -1.6528038288004616
In [203]:
# 回帰結果をグラフに表示する
plt.plot(data['G1'], data['G3'], 'o')
plt.xlabel('G1')
plt.ylabel('G3')

plt.plot(X, reg.predict(X)) # Xを再利用
Out[203]:
[<matplotlib.lines.Line2D at 0x27036eae7b8>]
In [205]:
# 決定係数 R^2 うまく回帰できているか具合
print('決定係数:', reg.score(X, Y))
決定係数: 0.64235084605227
In [220]:
# 練習
file_path = 'student-por.csv'
with open(file_path) as f:
    data = pd.read_csv(f, sep=';')

X = data.loc[:,['G1']].values # ベクトルのベクトル
Y = data['G3'].values #  スカラーのベクトル

from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit(X, Y)
Out[220]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [230]:
print(reg.coef_, reg.intercept_, reg.score(X, Y))

plt.plot(X, Y, 'o')
plt.plot(X, reg.predict(X))
[0.973] 0.8203984121064565 0.6829156800171085
Out[230]:
[<matplotlib.lines.Line2D at 0x270370d4240>]
In [256]:
# 練習
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

import requests
import io

r = requests.get(url, stream=True)

data =io.BytesIO(r.content) # zipファイルではない
df = pd.read_csv(data, sep=';')
df.head()
Out[256]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [268]:
df_describe = df.describe()
df_describe.to_csv('df_describe.csv')
In [270]:
 
 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 64C2-9721 です

 C:\Users\keita\chapter3\chapter3 のディレクトリ

2019/12/15  18:32    <DIR>          .
2019/12/15  18:32    <DIR>          ..
2019/12/15  18:30             1,653 df_describe
2019/12/15  18:32             1,054 df_describe.csv
2019/12/15  18:30             1,653 df_describe.pickle
2019/12/15  13:54             3,206 student.txt
2019/12/15  13:54            56,993 student-mat.csv
2019/12/15  13:54               269 student-merge.R
2019/12/15  13:54            93,220 student-por.csv
               7 個のファイル             158,048 バイト
               2 個のディレクトリ  293,433,446,400 バイトの空き領域
In [278]:
sns.pairplot(df)

'''
# すべての変数のヒストグラム 散布図を書く sns.pariplot([])
# アルコールと成績の関係性を確認してみる
sns.pairplot(data[['Dalc', 'Walc', 'G1', 'G3']])
'''
Out[278]:
<seaborn.axisgrid.PairGrid at 0x270377225c0>