In [176]:
# Pandas
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

%precision 3
Out[176]:
'%.3f'
In [177]:
# Hierarchical index (MultiIndex)
# Generate sample data
np.random.seed(0)
df = DataFrame(np.random.randint(0,10,9).reshape((3,3)),
              index = [
                       ['a', 'a', 'b'],
                       [1, 1, 2]
                       ],
              columns = [
                         ['Hakodate','Sapporo','Hakodate'],
                         ['Blue','Red','Red']
                         ] )
df
Out[177]:
    Hakodate Sapporo Hakodate
        Blue     Red      Red
a 1        5       0        3
  1        3       7        9
b 2        3       5        2
In [178]:
# Name the index levels
df.index.names=['index1','index2']
df
Out[178]:
              Hakodate Sapporo Hakodate
                  Blue     Red      Red
index1 index2
a      1             5       0        3
       1             3       7        9
b      2             3       5        2
In [179]:
# Name the column levels
df.columns.names = ['column1','column2']
df
Out[179]:
column1       Hakodate Sapporo Hakodate
column2           Blue     Red      Red
index1 index2
a      1             5       0        3
       1             3       7        9
b      2             3       5        2
In [180]:
# Narrowing down the columns by top-level label
df['Hakodate']
Out[180]:
column2       Blue  Red
index1 index2
a      1         5    3
       1         3    9
b      2         3    2
In [181]:
# Aggregate over the 'index2' index level
df.sum(level = 'index2', axis=0)
Out[181]:
column1 Hakodate Sapporo Hakodate
column2     Blue     Red      Red
index2
1              8       7       12
2              3       5        2
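Note: newer pandas versions deprecate and (in 2.0) remove sum(level=...); a minimal sketch of the groupby equivalent, assuming the same df as above:
# same aggregation over the index2 level, written with groupby
df.groupby(level='index2').sum()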
In [182]:
# Aggregate over the 'column2' column level
df.sum(level = 'column2', axis=1)
Out[182]:
column2       Blue  Red
index1 index2
a      1         5    3
       1         3   16
b      2         3    7
In [183]:
# Removing index entries
# With a two-level index, pass the full key as a tuple to drop() to identify the row.
df.drop(("a",1))
Out[183]:
column1       Hakodate Sapporo Hakodate
column2           Blue     Red      Red
index1 index2
b      2             3       5        2
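drop() also accepts a level= argument when you want to remove every row under one key of a single level; a sketch on the same df:
# drop all rows whose index1 value is 'a'
df.drop('a', level='index1')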
In [184]:
# Joining data
data1 = {'id':[100,101,102,103],
          'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
          'point':[55,66,77,88]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  point
0  102     55
1  103     66
2  104     77
3  105     88
In [185]:
# inner join: join on the shared key (only matching keys are kept; non-matching keys are discarded)
pd.merge(df1, df2, on = 'id')
Out[185]:
    id  point_x  point_y
0  102       12       55
1  103       13       66
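The _x/_y suffixes that pandas appends to the overlapping point column can be customized; a sketch using the suffixes parameter:
# control the suffixes added to overlapping column names
pd.merge(df1, df2, on='id', suffixes=('_left', '_right'))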
In [186]:
# outer join: keep everything; with no on= given, merge uses every common column (id and point), so no rows match and all 8 remain
pd.merge(df1, df2, how = 'outer')
Out[186]:
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
4  102     55
5  103     66
6  104     77
7  105     88
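Because no on= key was given above, the merge matched on both common columns (id and point). For a genuine full outer join on id alone, a sketch:
# outer join on id only: unmatched rows get NaN in point_x / point_y
pd.merge(df1, df2, on='id', how='outer')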
In [187]:
# The index can also be used as the join key
data1 = {'id':[100,101,102,103],
         'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
         'point':[55,66,77,88],
         'index_num':[0,1,2,3]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)

# Join df1's row index to df2's index_num column
df3 = pd.merge(df1, df2, left_index= True, right_on='index_num')
df3
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  index_num  point
0  102          0     55
1  103          1     66
2  104          2     77
3  105          3     88
Out[187]:
   id_x  point_x  id_y  index_num  point_y
0   100       10   102          0       55
1   101       11   103          1       66
2   102       12   104          2       77
3   103       13   105          3       88
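The same index-to-column join can be written with the join() method; a sketch that first promotes index_num to df2's index (rsuffix disambiguates the overlapping column names):
# join df1's index against df2's index_num via set_index + join
df1.join(df2.set_index('index_num'), rsuffix='_2')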
In [188]:
# left join: keep every row of the left frame
data1 = {'id':[100,101,102,103],
         'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
         'point':[55,66,77,88],
         'index_num':[0,1,2,3]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)

pd.merge(df1, df2, how = 'left')
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  index_num  point
0  102          0     55
1  103          1     66
2  104          2     77
3  105          3     88
Out[188]:
    id  point  index_num
0  100     10        NaN
1  101     11        NaN
2  102     12        NaN
3  103     13        NaN
In [189]:
# Vertical concatenation: concat simply stacks the frames on top of each other
data1 = {'a':[1,2,3,4],
         'b':[11,22,33,44]}
data2 = {'A':[5,6,7,8],
         'B':[55,66,77,88]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

print(df1)
print('============')
print(df2)
print('# concat expects its DataFrames as a list')
pd.concat([df1, df2])
   a   b
0  1  11
1  2  22
2  3  33
3  4  44
============
   A   B
0  5  55
1  6  66
2  7  77
3  8  88
# concat expects its DataFrames as a list
Out[189]:
     A     B    a     b
0  NaN   NaN  1.0  11.0
1  NaN   NaN  2.0  22.0
2  NaN   NaN  3.0  33.0
3  NaN   NaN  4.0  44.0
0  5.0  55.0  NaN   NaN
1  6.0  66.0  NaN   NaN
2  7.0  77.0  NaN   NaN
3  8.0  88.0  NaN   NaN
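concat can also glue frames side by side; a sketch with axis=1, which aligns on the row index:
# horizontal concatenation: columns a, b, A, B in one frame (no NaN here)
pd.concat([df1, df2], axis=1)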
In [190]:
# Manipulating and transforming data

# Sample data
data = {'value1':[1,2,3,4,5],
        'value2':[1,2,3,4,5]}

index = [['a', 'a', 'b', 'b','c'],
         [1, 1, 2, 3, 4]]
    
df = pd.DataFrame(data, index=index)

df.index.names=['index1','index2']
df
Out[190]:
               value1  value2
index1 index2
a      1            1       1
       1            2       2
b      2            3       3
       3            4       4
c      4            5       5
In [191]:
# Pivot operation: stack() pivots the columns into the innermost index level
df2 = df.stack()
print(type(df2))
df2
<class 'pandas.core.series.Series'>
Out[191]:
index1  index2        
a       1       value1    1
                value2    1
                value1    2
                value2    2
b       2       value1    3
                value2    3
        3       value1    4
                value2    4
c       4       value1    5
                value2    5
dtype: int64
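unstack() is the inverse of stack(), but it requires a unique index; on this sample the row key ('a', 1) appears twice, so a sketch on a de-duplicated copy:
# pivot the innermost index level back into columns
df2[~df2.index.duplicated()].unstack()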
In [192]:
# Removing duplicate rows

# Generate sample data containing duplicates

data = {'value1':[1,2,3,4,5,6,3,3],
        'value2':[1,2,3,0,0,0,3,3]}

df = pd.DataFrame(data)
df
Out[192]:
   value1  value2
0       1       1
1       2       2
2       3       3
3       4       0
4       5       0
5       6       0
6       3       3
7       3       3
In [193]:
# Detect duplicated rows with the duplicated() method
df.duplicated()
Out[193]:
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool
In [194]:
# Remove duplicated rows with the drop_duplicates() method
df.drop_duplicates()
Out[194]:
   value1  value2
0       1       1
1       2       2
2       3       3
3       4       0
4       5       0
5       6       0
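drop_duplicates() can also judge duplicates on a subset of columns and keep the last occurrence instead of the first; a sketch:
# deduplicate on value2 only, keeping each value's last occurrence
df.drop_duplicates(subset='value2', keep='last')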
In [195]:
# Mapping: a lookup similar to Excel's VLOOKUP

map_data = {'c1':'cc1',
            'c2':'cc2',}
map_data
Out[195]:
{'c1': 'cc1', 'c2': 'cc2'}
In [196]:
# Mapping: add a new city2 column that corresponds to the DataFrame's city column
# Generate the sample data before the addition
df1_data ={'city':['c1','c1','c2','c3'],
           'name':['A','B','C','D']}

df1 = pd.DataFrame(df1_data)
df1
Out[196]:
  city name
0   c1    A
1   c1    B
2   c2    C
3   c3    D
In [197]:
# Create the lookup table
map_data = {'c1':'cc1',
            'c2':'cc2',}
print(map_data)

# For city value 'c1' the new city2 column gets 'cc1'; keys absent from the map become NaN
df1['city2'] = df1['city'].map(map_data)
df1
{'c1': 'cc1', 'c2': 'cc2'}
Out[197]:
  city name city2
0   c1    A   cc1
1   c1    B   cc1
2   c2    C   cc2
3   c3    D   NaN
In [198]:
# Combining an anonymous function with map(): extract part of a column
print(df1)
print()

# Take the first two characters of df1['city2'] as a new column.
# Note: NaN becomes the string 'na', because str(np.nan)[0:2] == 'na'.
df1['new'] = df1['city2'].map(lambda x: str(x)[0:2])

print(df1)
  city name city2
0   c1    A   cc1
1   c1    B   cc1
2   c2    C   cc2
3   c3    D   NaN

  city name city2 new
0   c1    A   cc1  cc
1   c1    B   cc1  cc
2   c2    C   cc2  cc
3   c3    D   NaN  na
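If the NaN-to-'na' conversion above is unwanted, the .str accessor slices strings while leaving NaN as NaN; a sketch:
# first two characters; NaN stays NaN instead of becoming 'na'
df1['city2'].str[0:2]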
In [199]:
lambda x: str(x)[0:2]
Out[199]:
<function __main__.<lambda>>
In [200]:
# Binning

# Generate sample data
data ={'year': [1955, 1990, 1989, 1992, 2015, 1980, 2009, 2005, 2019, 2020]}
df = pd.DataFrame(data)

# Prepare the bin edges
bins = [1980, 1990, 2000, 2010, 2020]

# Run the binning
cut_data = pd.cut(df.year, bins)
print(cut_data)

"""
1990は[1980,1990]のクラスに入る
1980を超えて1990以下 という意味

なので1980はどのbinにも属さないのでNaNになる
"""
0             NaN
1    (1980, 1990]
2    (1980, 1990]
3    (1990, 2000]
4    (2010, 2020]
5             NaN
6    (2000, 2010]
7    (2000, 2010]
8    (2010, 2020]
9    (2010, 2020]
Name: year, dtype: category
Categories (4, interval[int64]): [(1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]
Out[200]:
'\n1990 falls into the (1980, 1990] class,\ni.e. greater than 1980 and up to and including 1990.\n1980 therefore belongs to no bin and becomes NaN.\n'
In [201]:
# The right=False option flips the intervals to closed-left: 1980 <= x < 1990
# (note that 2020 now falls outside the last bin and becomes NaN)
# Run the binning again
cut_data = pd.cut(df.year, bins, right=False)
print(cut_data)
0             NaN
1    [1990, 2000)
2    [1980, 1990)
3    [1990, 2000)
4    [2010, 2020)
5    [1980, 1990)
6    [2000, 2010)
7    [2000, 2010)
8    [2010, 2020)
9             NaN
Name: year, dtype: category
Categories (4, interval[int64]): [[1980, 1990) < [1990, 2000) < [2000, 2010) < [2010, 2020)]
In [202]:
# The bin counts can be tallied
pd.value_counts(cut_data)
Out[202]:
[2010, 2020)    2
[2000, 2010)    2
[1990, 2000)    2
[1980, 1990)    2
Name: year, dtype: int64
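Newer pandas prefers the Series method over the pd.value_counts function, and sort_index() orders the result by bin rather than by count; a sketch:
# counts per bin, ordered by interval instead of by frequency
cut_data.value_counts().sort_index()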
In [203]:
# The bins can be given names (labels)
group_names = ['1980s','1990s','2000s','2010s' ]
cut_data = pd.cut(df.year, bins, right=False, labels=group_names)
pd.value_counts(cut_data)
Out[203]:
2010s    2
2000s    2
1990s    2
1980s    2
Name: year, dtype: int64
In [204]:
# The number of bins can be specified as an integer (equal-width bins)
pd.cut(df.year, 4)
Out[204]:
0    (1954.935, 1971.25]
1      (1987.5, 2003.75]
2      (1987.5, 2003.75]
3      (1987.5, 2003.75]
4      (2003.75, 2020.0]
5      (1971.25, 1987.5]
6      (2003.75, 2020.0]
7      (2003.75, 2020.0]
8      (2003.75, 2020.0]
9      (2003.75, 2020.0]
Name: year, dtype: category
Categories (4, interval[float64]): [(1954.935, 1971.25] < (1971.25, 1987.5] < (1987.5, 2003.75] < (2003.75, 2020.0]]
In [205]:
# Data can be split at quantiles with qcut
cut_data = pd.qcut(df.year, 2)
print(cut_data)
print()
pd.value_counts(cut_data)
0    (1954.999, 1998.5]
1    (1954.999, 1998.5]
2    (1954.999, 1998.5]
3    (1954.999, 1998.5]
4      (1998.5, 2020.0]
5    (1954.999, 1998.5]
6      (1998.5, 2020.0]
7      (1998.5, 2020.0]
8      (1998.5, 2020.0]
9      (1998.5, 2020.0]
Name: year, dtype: category
Categories (2, interval[float64]): [(1954.999, 1998.5] < (1998.5, 2020.0]]

Out[205]:
(1998.5, 2020.0]      5
(1954.999, 1998.5]    5
Name: year, dtype: int64
In [206]:
# Histogram
df.hist(bins=6)
Out[206]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EBCB56A9B0>]],
      dtype=object)
In [207]:
# Aggregation and group operations

# Generate sample data, including a region column
data = {'id': [100,101,102,103,104,106,108,110],
       'city': ['tokyo', 'osaka', 'kyoto', 'hakodate', 'tokyo', 'tokyo', 'osaka', 'kyoto'],
       'birth_year': [1990, 1989, 1992, 1997, 1982, 1991, 1988, 1990],
       'name': ['hiroshi', 'akiko', 'yuki', 'satoru', 'steeve', 'mituru', 'aoi', 'tarou'],
       'region': ['kanto', 'kansai', 'kansai', 'hokkaido', 'kanto', 'kanto', 'kansai', 'kansai']}
df1 = pd.DataFrame(data)
df1
Out[207]:
   birth_year      city   id     name    region
0        1990     tokyo  100  hiroshi     kanto
1        1989     osaka  101    akiko    kansai
2        1992     kyoto  102     yuki    kansai
3        1997  hakodate  103   satoru  hokkaido
4        1982     tokyo  104   steeve     kanto
5        1991     tokyo  106   mituru     kanto
6        1988     osaka  108      aoi    kansai
7        1990     kyoto  110    tarou    kansai
In [208]:
# Tally the city values
df1.groupby('city').size()
Out[208]:
city
hakodate    1
kyoto       2
osaka       2
tokyo       3
dtype: int64
In [209]:
# Mean birth_year grouped by city
df1.groupby('city')['birth_year'].mean()
Out[209]:
city
hakodate    1997.000000
kyoto       1991.000000
osaka       1988.500000
tokyo       1987.666667
Name: birth_year, dtype: float64
In [210]:
# Grouping by the two keys region and city also works
df1.groupby(['region', 'city'])['birth_year'].mean()
Out[210]:
region    city    
hokkaido  hakodate    1997.000000
kansai    kyoto       1991.000000
          osaka       1988.500000
kanto     tokyo       1987.666667
Name: birth_year, dtype: float64
In [211]:
# To use the result directly as a table, pass as_index=False
df1.groupby(['region', 'city'], as_index=False)['birth_year'].mean()
Out[211]:
     region      city   birth_year
0  hokkaido  hakodate  1997.000000
1    kansai     kyoto  1991.000000
2    kansai     osaka  1988.500000
3     kanto     tokyo  1987.666667
In [212]:
# Using groupby as an iterator

# Take each region in turn:
# group receives the region key, subdf the rows belonging to that region
for group, subdf in df1.groupby('region'):
    print(group)
    print(subdf)
    print()
hokkaido
   birth_year      city   id    name    region
3        1997  hakodate  103  satoru  hokkaido

kansai
   birth_year   city   id   name  region
1        1989  osaka  101  akiko  kansai
2        1992  kyoto  102   yuki  kansai
6        1988  osaka  108    aoi  kansai
7        1990  kyoto  110  tarou  kansai

kanto
   birth_year   city   id     name region
0        1990  tokyo  100  hiroshi  kanto
4        1982  tokyo  104   steeve  kanto
5        1991  tokyo  106   mituru  kanto

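A single group can also be pulled out directly with get_group(), without iterating; a sketch:
# just the kanto rows
df1.groupby('region').get_group('kanto')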
In [213]:
# Run several aggregations at once with the agg() method
# Sample data
data = pd.read_csv('student-mat.csv', sep=';')
data.head()
Out[213]:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  famrel  freetime  goout  Dalc  Walc  health  absences  G1  G2  G3
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...       4         3      4     1     1       3         6   5   6   6
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...       5         3      3     1     1       3         4   5   5   6
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...       4         3      2     2     3       3        10   7   8  10
3     GP   F   15       U     GT3       T     4     2   health  services  ...       3         2      2     1     1       5         2  15  14  15
4     GP   F   16       U     GT3       T     3     3    other     other  ...       4         3      2     1     2       5         4   6  10  10

5 rows × 33 columns

In [214]:
# Apply count, mean, max, min to selected columns
functions = ['count', 'mean', 'max', 'min']
grouped_data = data.groupby(['sex', 'address'])
grouped_data[['age', 'G1']].agg(functions)
Out[214]:
              age                      G1
            count       mean max min count       mean max min
sex address
F   R          44  16.977273  19  15    44  10.295455  19   6
    U         164  16.664634  20  15   164  10.707317  18   4
M   R          44  17.113636  21  15    44  10.659091  18   3
    U         143  16.517483  22  15   143  11.405594  19   5
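agg() also accepts a dict mapping each column to its own function(s); a sketch:
# a different aggregation per column
grouped_data.agg({'age': 'mean', 'G1': ['max', 'min']})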
In [215]:
# Handling missing data and outliers

# Generate sample data
np.random.seed(0)
df = pd.DataFrame(np.random.rand(10, 4))
df
Out[215]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.423655  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.791725  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
5  0.978618  0.799159  0.461479  0.780529
6  0.118274  0.639921  0.143353  0.944669
7  0.521848  0.414662  0.264556  0.774234
8  0.456150  0.568434  0.018790  0.617635
9  0.612096  0.616934  0.943748  0.681820
In [216]:
# Set some cells to NA
from numpy import nan as NA

df.iloc[1,0] = NA
df.iloc[2:3, 2] = NA
df.iloc[5:, 3] = NA
df
Out[216]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1       NaN  0.645894  0.437587  0.891773
2  0.963663  0.383442       NaN  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
5  0.978618  0.799159  0.461479       NaN
6  0.118274  0.639921  0.143353       NaN
7  0.521848  0.414662  0.264556       NaN
8  0.456150  0.568434  0.018790       NaN
9  0.612096  0.616934  0.943748       NaN
In [217]:
# Listwise deletion: dropna() removes every row containing NaN
df.dropna()
Out[217]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
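dropna() can be relaxed with thresh=, which keeps any row having at least that many non-missing values; a sketch:
# keep rows with at least 3 non-NaN values out of the 4 columns
df.dropna(thresh=3)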
In [218]:
# Pairwise deletion: first select the columns you definitely need, then dropna()
df[[0,1]].dropna()
Out[218]:
          0         1
0  0.548814  0.715189
2  0.963663  0.383442
3  0.568045  0.925597
4  0.020218  0.832620
5  0.978618  0.799159
6  0.118274  0.639921
7  0.521848  0.414662
8  0.456150  0.568434
9  0.612096  0.616934
In [219]:
# Fill missing values with fillna()
df.fillna(0).head()
Out[219]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.000000  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.000000  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
In [220]:
# Forward fill (ffill): fill with the previous value
df.fillna(method = 'ffill').head()
Out[220]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.548814  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.437587  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
In [221]:
# Fill with the column means
df.fillna(df.mean()).head()
Out[221]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.531970  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.413497  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
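Missing values in ordered data can also be interpolated instead of filled with a constant; a sketch with the default linear interpolation:
# linear interpolation along the index
df.interpolate().head()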
In [222]:
# Working with time-series data
import pandas_datareader.data as pdr

start_date = '2001/1/2'
end_date = '2016/12/30'
df = pdr.DataReader('DEXJPUS', 'fred', start_date, end_date)
df.head()
Out[222]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-08 115.97
In [223]:
# Select the data for a specific year and month
df['2016-04'].head()
Out[223]:
DEXJPUS
DATE
2016-04-01 112.06
2016-04-04 111.18
2016-04-05 110.26
2016-04-06 109.63
2016-04-07 107.98
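Newer pandas deprecates selecting rows by a partial date string through []; loc is the supported spelling. A sketch:
# same month selection via loc
df.loc['2016-04'].head()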
In [224]:
# Resample to monthly ('M') frequency and take each month's last value
# resample = change the frequency; other rules include 'D' (daily) and 'Y' (yearly)
df.resample('M').last().head()
Out[224]:
DEXJPUS
DATE
2001-01-31 116.39
2001-02-28 117.28
2001-03-31 125.54
2001-04-30 123.57
2001-05-31 118.88
In [225]:
# Gaps appear when upsampling to daily ('D')
df.resample('D').last().head()
Out[225]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-06 NaN
In [226]:
# Fill with the previous day's value (ffill)
df.resample('D').ffill().head()
Out[226]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-06 116.19
In [227]:
# Shift the data one day with shift() and compute the day-over-day ratio
df['ratio'] = df / df.shift(1)

df.head()
Out[227]:
            DEXJPUS     ratio
DATE
2001-01-02   114.73       NaN
2001-01-03   114.26  0.995903
2001-01-04   115.47  1.010590
2001-01-05   116.19  1.006235
2001-01-08   115.97  0.998107
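A common alternative in time-series work is the log return, which is close to pct_change for small moves; a sketch (not assigned to df, so the outputs below are unchanged):
# one-day log return: log(x_t) - log(x_{t-1})
np.log(df['DEXJPUS']).diff().head()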
In [228]:
# Compute the one-day difference with diff()
df['diff'] = df['DEXJPUS'].diff(1)

df.head()
Out[228]:
            DEXJPUS     ratio  diff
DATE
2001-01-02   114.73       NaN   NaN
2001-01-03   114.26  0.995903 -0.47
2001-01-04   115.47  1.010590  1.21
2001-01-05   116.19  1.006235  0.72
2001-01-08   115.97  0.998107 -0.22
In [229]:
# pct_change() also computes the rate of change (the ratio minus 1)
df['pct_change'] = df['DEXJPUS'].pct_change(1)
df.head()
Out[229]:
            DEXJPUS     ratio  diff  pct_change
DATE
2001-01-02   114.73       NaN   NaN         NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097
2001-01-04   115.47  1.010590  1.21    0.010590
2001-01-05   116.19  1.006235  0.72    0.006235
2001-01-08   115.97  0.998107 -0.22   -0.001893
In [230]:
# Trailing 3-day moving average
df['mean_3'] = df['DEXJPUS'].rolling(3).mean()
df.head()
Out[230]:
            DEXJPUS     ratio  diff  pct_change      mean_3
DATE
2001-01-02   114.73       NaN   NaN         NaN         NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097         NaN
2001-01-04   115.47  1.010590  1.21    0.010590  114.820000
2001-01-05   116.19  1.006235  0.72    0.006235  115.306667
2001-01-08   115.97  0.998107 -0.22   -0.001893  115.876667
In [231]:
# Trailing 3-day standard deviation with the std() method
df['std_3'] = df['DEXJPUS'].rolling(3).std() 
df.head()
Out[231]:
            DEXJPUS     ratio  diff  pct_change      mean_3     std_3
DATE
2001-01-02   114.73       NaN   NaN         NaN         NaN       NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097         NaN       NaN
2001-01-04   115.47  1.010590  1.21    0.010590  114.820000  0.610000
2001-01-05   116.19  1.006235  0.72    0.006235  115.306667  0.975312
2001-01-08   115.97  0.998107 -0.22   -0.001893  115.876667  0.368963
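An exponentially weighted mean is a common smoother alternative to a fixed rolling window; a sketch:
# exponentially weighted moving average with span 3
df['DEXJPUS'].ewm(span=3).mean().head()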
In [232]:
# =============== Script End =================