In [176]:
# Pandas
import numpy as np
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

%precision 3
Out[176]:
'%.3f'
In [177]:
# Hierarchical index (MultiIndex)
# Generate sample data
np.random.seed(0)
df = DataFrame(np.random.randint(0,10,9).reshape((3,3)),
              index = [
                       ['a', 'a', 'b'],
                       [1, 1, 2]
                       ],
              columns = [
                         ['Hakodate','Sapporo','Hakodate'],
                         ['Blue','Red','Red']
                         ] )
df
Out[177]:
    Hakodate Sapporo Hakodate
        Blue     Red      Red
a 1        5       0        3
  1        3       7        9
b 2        3       5        2
In [178]:
# Name the index levels
df.index.names=['index1','index2']
df
Out[178]:
              Hakodate Sapporo Hakodate
                  Blue     Red      Red
index1 index2
a      1             5       0        3
       1             3       7        9
b      2             3       5        2
In [179]:
# Name the column levels
df.columns.names = ['column1','column2']
df
Out[179]:
column1       Hakodate Sapporo Hakodate
column2           Blue     Red      Red
index1 index2
a      1             5       0        3
       1             3       7        9
b      2             3       5        2
In [180]:
# Narrowing down the columns by top-level label
df['Hakodate']
Out[180]:
column2       Blue  Red
index1 index2
a      1         5    3
       1         3    9
b      2         3    2
In [181]:
# Aggregate over the 'index2' index level
df.sum(level = 'index2', axis=0)
Out[181]:
column1 Hakodate Sapporo Hakodate
column2     Blue     Red      Red
index2
1              8       7       12
2              3       5        2
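Note: newer pandas versions deprecate and (in 2.0) remove sum(level=...); a minimal sketch of the groupby equivalent, assuming the same df as above:
# same aggregation over the index2 level, written with groupby
df.groupby(level='index2').sum()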
In [182]:
# Aggregate over the 'column2' column level
df.sum(level = 'column2', axis=1)
Out[182]:
column2       Blue  Red
index1 index2
a      1         5    3
       1         3   16
b      2         3    7
In [183]:
# Removing index entries
# With a two-level index, pass the full key as a tuple to drop() to identify the row.
df.drop(("a",1))
Out[183]:
column1       Hakodate Sapporo Hakodate
column2           Blue     Red      Red
index1 index2
b      2             3       5        2
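drop() also accepts a level= argument when you want to remove every row under one key of a single level; a sketch on the same df:
# drop all rows whose index1 value is 'a'
df.drop('a', level='index1')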
In [184]:
# Joining data
data1 = {'id':[100,101,102,103],
          'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
          'point':[55,66,77,88]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  point
0  102     55
1  103     66
2  104     77
3  105     88
In [185]:
# inner join: join on the shared key (only matching keys are kept; non-matching keys are discarded)
pd.merge(df1, df2, on = 'id')
Out[185]:
    id  point_x  point_y
0  102       12       55
1  103       13       66
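The _x/_y suffixes that pandas appends to the overlapping point column can be customized; a sketch using the suffixes parameter:
# control the suffixes added to overlapping column names
pd.merge(df1, df2, on='id', suffixes=('_left', '_right'))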
In [186]:
# outer join: keep everything; with no on= given, merge uses every common column (id and point), so no rows match and all 8 remain
pd.merge(df1, df2, how = 'outer')
Out[186]:
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
4  102     55
5  103     66
6  104     77
7  105     88
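Because no on= key was given above, the merge matched on both common columns (id and point). For a genuine full outer join on id alone, a sketch:
# outer join on id only: unmatched rows get NaN in point_x / point_y
pd.merge(df1, df2, on='id', how='outer')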
In [187]:
# The index can also be used as the join key
data1 = {'id':[100,101,102,103],
         'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
         'point':[55,66,77,88],
         'index_num':[0,1,2,3]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)

# Join df1's row index to df2's index_num column
df3 = pd.merge(df1, df2, left_index= True, right_on='index_num')
df3
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  index_num  point
0  102          0     55
1  103          1     66
2  104          2     77
3  105          3     88
Out[187]:
   id_x  point_x  id_y  index_num  point_y
0   100       10   102          0       55
1   101       11   103          1       66
2   102       12   104          2       77
3   103       13   105          3       88
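The same index-to-column join can be written with the join() method; a sketch that first promotes index_num to df2's index (rsuffix disambiguates the overlapping column names):
# join df1's index against df2's index_num via set_index + join
df1.join(df2.set_index('index_num'), rsuffix='_2')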
In [188]:
# left join: keep every row of the left frame
data1 = {'id':[100,101,102,103],
         'point':[10,11,12,13]}

data2 = {'id':[102,103,104,105],
         'point':[55,66,77,88],
         'index_num':[0,1,2,3]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print('=================')
print(df2)

pd.merge(df1, df2, how = 'left')
    id  point
0  100     10
1  101     11
2  102     12
3  103     13
=================
    id  index_num  point
0  102          0     55
1  103          1     66
2  104          2     77
3  105          3     88
Out[188]:
    id  point  index_num
0  100     10        NaN
1  101     11        NaN
2  102     12        NaN
3  103     13        NaN
In [189]:
# Vertical concatenation: concat simply stacks the frames on top of each other
data1 = {'a':[1,2,3,4],
         'b':[11,22,33,44]}
data2 = {'A':[5,6,7,8],
         'B':[55,66,77,88]}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

print(df1)
print('============')
print(df2)
print('# concat expects its DataFrames as a list')
pd.concat([df1, df2])
   a   b
0  1  11
1  2  22
2  3  33
3  4  44
============
   A   B
0  5  55
1  6  66
2  7  77
3  8  88
# concat expects its DataFrames as a list
Out[189]:
     A     B    a     b
0  NaN   NaN  1.0  11.0
1  NaN   NaN  2.0  22.0
2  NaN   NaN  3.0  33.0
3  NaN   NaN  4.0  44.0
0  5.0  55.0  NaN   NaN
1  6.0  66.0  NaN   NaN
2  7.0  77.0  NaN   NaN
3  8.0  88.0  NaN   NaN
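concat can also glue frames side by side; a sketch with axis=1, which aligns on the row index:
# horizontal concatenation: columns a, b, A, B in one frame (no NaN here)
pd.concat([df1, df2], axis=1)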
In [190]:
# Manipulating and transforming data

# Sample data
data = {'value1':[1,2,3,4,5],
        'value2':[1,2,3,4,5]}

index = [['a', 'a', 'b', 'b','c'],
         [1, 1, 2, 3, 4]]
    
df = pd.DataFrame(data, index=index)

df.index.names=['index1','index2']
df
Out[190]:
               value1  value2
index1 index2
a      1            1       1
       1            2       2
b      2            3       3
       3            4       4
c      4            5       5
In [191]:
# Pivot operation: stack() pivots the columns into the innermost index level
df2 = df.stack()
print(type(df2))
df2
<class 'pandas.core.series.Series'>
Out[191]:
index1  index2        
a       1       value1    1
                value2    1
                value1    2
                value2    2
b       2       value1    3
                value2    3
        3       value1    4
                value2    4
c       4       value1    5
                value2    5
dtype: int64
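unstack() is the inverse of stack(), but it requires a unique index; on this sample the row key ('a', 1) appears twice, so a sketch on a de-duplicated copy:
# pivot the innermost index level back into columns
df2[~df2.index.duplicated()].unstack()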
In [192]:
# Removing duplicate rows

# Generate sample data containing duplicates

data = {'value1':[1,2,3,4,5,6,3,3],
        'value2':[1,2,3,0,0,0,3,3]}

df = pd.DataFrame(data)
df
Out[192]:
   value1  value2
0       1       1
1       2       2
2       3       3
3       4       0
4       5       0
5       6       0
6       3       3
7       3       3
In [193]:
# Detect duplicated rows with the duplicated() method
df.duplicated()
Out[193]:
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool
In [194]:
# Remove duplicated rows with the drop_duplicates() method
df.drop_duplicates()
Out[194]:
   value1  value2
0       1       1
1       2       2
2       3       3
3       4       0
4       5       0
5       6       0
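drop_duplicates() can also judge duplicates on a subset of columns and keep the last occurrence instead of the first; a sketch:
# deduplicate on value2 only, keeping each value's last occurrence
df.drop_duplicates(subset='value2', keep='last')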
In [195]:
# Mapping: a lookup similar to Excel's VLOOKUP

map_data = {'c1':'cc1',
            'c2':'cc2',}
map_data
Out[195]:
{'c1': 'cc1', 'c2': 'cc2'}
In [196]:
# Mapping: add a new city2 column that corresponds to the DataFrame's city column
# Generate the sample data before the addition
df1_data ={'city':['c1','c1','c2','c3'],
           'name':['A','B','C','D']}

df1 = pd.DataFrame(df1_data)
df1
Out[196]:
  city name
0   c1    A
1   c1    B
2   c2    C
3   c3    D
In [197]:
# Create the lookup table
map_data = {'c1':'cc1',
            'c2':'cc2',}
print(map_data)

# For city value 'c1' the new city2 column gets 'cc1'; keys absent from the map become NaN
df1['city2'] = df1['city'].map(map_data)
df1
{'c1': 'cc1', 'c2': 'cc2'}
Out[197]:
  city name city2
0   c1    A   cc1
1   c1    B   cc1
2   c2    C   cc2
3   c3    D   NaN
In [198]:
# Combining an anonymous function with map(): extract part of a column
print(df1)
print()

# Take the first two characters of df1['city2'] as a new column.
# Note: NaN becomes the string 'na', because str(np.nan)[0:2] == 'na'.
df1['new'] = df1['city2'].map(lambda x: str(x)[0:2])

print(df1)
  city name city2
0   c1    A   cc1
1   c1    B   cc1
2   c2    C   cc2
3   c3    D   NaN

  city name city2 new
0   c1    A   cc1  cc
1   c1    B   cc1  cc
2   c2    C   cc2  cc
3   c3    D   NaN  na
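If the NaN-to-'na' conversion above is unwanted, the .str accessor slices strings while leaving NaN as NaN; a sketch:
# first two characters; NaN stays NaN instead of becoming 'na'
df1['city2'].str[0:2]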
In [199]:
lambda x: str(x)[0:2]
Out[199]:
<function __main__.<lambda>>
In [200]:
# Binning

# Generate sample data
data ={'year': [1955, 1990, 1989, 1992, 2015, 1980, 2009, 2005, 2019, 2020]}
df = pd.DataFrame(data)

# Prepare the bin edges
bins = [1980, 1990, 2000, 2010, 2020]

# Run the binning
cut_data = pd.cut(df.year, bins)
print(cut_data)

"""
1990は[1980,1990]のクラスに入る
1980を超えて1990以下 という意味

なので1980はどのbinにも属さないのでNaNになる
"""
0             NaN
1    (1980, 1990]
2    (1980, 1990]
3    (1990, 2000]
4    (2010, 2020]
5             NaN
6    (2000, 2010]
7    (2000, 2010]
8    (2010, 2020]
9    (2010, 2020]
Name: year, dtype: category
Categories (4, interval[int64]): [(1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]
Out[200]:
'\n1990 falls into the (1980, 1990] class,\ni.e. greater than 1980 and up to and including 1990.\n1980 therefore belongs to no bin and becomes NaN.\n'
In [201]:
# The right=False option flips the intervals to closed-left: 1980 <= x < 1990
# (note that 2020 now falls outside the last bin and becomes NaN)
# Run the binning again
cut_data = pd.cut(df.year, bins, right=False)
print(cut_data)
0             NaN
1    [1990, 2000)
2    [1980, 1990)
3    [1990, 2000)
4    [2010, 2020)
5    [1980, 1990)
6    [2000, 2010)
7    [2000, 2010)
8    [2010, 2020)
9             NaN
Name: year, dtype: category
Categories (4, interval[int64]): [[1980, 1990) < [1990, 2000) < [2000, 2010) < [2010, 2020)]
In [202]:
# The bin counts can be tallied
pd.value_counts(cut_data)
Out[202]:
[2010, 2020)    2
[2000, 2010)    2
[1990, 2000)    2
[1980, 1990)    2
Name: year, dtype: int64
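Newer pandas prefers the Series method over the pd.value_counts function, and sort_index() orders the result by bin rather than by count; a sketch:
# counts per bin, ordered by interval instead of by frequency
cut_data.value_counts().sort_index()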
In [203]:
# The bins can be given names (labels)
group_names = ['1980s','1990s','2000s','2010s' ]
cut_data = pd.cut(df.year, bins, right=False, labels=group_names)
pd.value_counts(cut_data)
Out[203]:
2010s    2
2000s    2
1990s    2
1980s    2
Name: year, dtype: int64
In [204]:
# The number of bins can be specified as an integer (equal-width bins)
pd.cut(df.year, 4)
Out[204]:
0    (1954.935, 1971.25]
1      (1987.5, 2003.75]
2      (1987.5, 2003.75]
3      (1987.5, 2003.75]
4      (2003.75, 2020.0]
5      (1971.25, 1987.5]
6      (2003.75, 2020.0]
7      (2003.75, 2020.0]
8      (2003.75, 2020.0]
9      (2003.75, 2020.0]
Name: year, dtype: category
Categories (4, interval[float64]): [(1954.935, 1971.25] < (1971.25, 1987.5] < (1987.5, 2003.75] < (2003.75, 2020.0]]
In [205]:
# Data can be split at quantiles with qcut
cut_data = pd.qcut(df.year, 2)
print(cut_data)
print()
pd.value_counts(cut_data)
0    (1954.999, 1998.5]
1    (1954.999, 1998.5]
2    (1954.999, 1998.5]
3    (1954.999, 1998.5]
4      (1998.5, 2020.0]
5    (1954.999, 1998.5]
6      (1998.5, 2020.0]
7      (1998.5, 2020.0]
8      (1998.5, 2020.0]
9      (1998.5, 2020.0]
Name: year, dtype: category
Categories (2, interval[float64]): [(1954.999, 1998.5] < (1998.5, 2020.0]]

Out[205]:
(1998.5, 2020.0]      5
(1954.999, 1998.5]    5
Name: year, dtype: int64
In [206]:
# Histogram
df.hist(bins=6)
Out[206]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001EBCB56A9B0>]],
      dtype=object)
In [207]:
# Aggregation and group operations

# Generate sample data, including a region column
data = {'id': [100,101,102,103,104,106,108,110],
       'city': ['tokyo', 'osaka', 'kyoto', 'hakodate', 'tokyo', 'tokyo', 'osaka', 'kyoto'],
       'birth_year': [1990, 1989, 1992, 1997, 1982, 1991, 1988, 1990],
       'name': ['hiroshi', 'akiko', 'yuki', 'satoru', 'steeve', 'mituru', 'aoi', 'tarou'],
       'region': ['kanto', 'kansai', 'kansai', 'hokkaido', 'kanto', 'kanto', 'kansai', 'kansai']}
df1 = pd.DataFrame(data)
df1
Out[207]:
   birth_year      city   id     name    region
0        1990     tokyo  100  hiroshi     kanto
1        1989     osaka  101    akiko    kansai
2        1992     kyoto  102     yuki    kansai
3        1997  hakodate  103   satoru  hokkaido
4        1982     tokyo  104   steeve     kanto
5        1991     tokyo  106   mituru     kanto
6        1988     osaka  108      aoi    kansai
7        1990     kyoto  110    tarou    kansai
In [208]:
# Tally the city values
df1.groupby('city').size()
Out[208]:
city
hakodate    1
kyoto       2
osaka       2
tokyo       3
dtype: int64
In [209]:
# Mean birth_year grouped by city
df1.groupby('city')['birth_year'].mean()
Out[209]:
city
hakodate    1997.000000
kyoto       1991.000000
osaka       1988.500000
tokyo       1987.666667
Name: birth_year, dtype: float64
In [210]:
# Grouping by the two keys region and city also works
df1.groupby(['region', 'city'])['birth_year'].mean()
Out[210]:
region    city    
hokkaido  hakodate    1997.000000
kansai    kyoto       1991.000000
          osaka       1988.500000
kanto     tokyo       1987.666667
Name: birth_year, dtype: float64
In [211]:
# To use the result directly as a table, pass as_index=False
df1.groupby(['region', 'city'], as_index=False)['birth_year'].mean()
Out[211]:
     region      city   birth_year
0  hokkaido  hakodate  1997.000000
1    kansai     kyoto  1991.000000
2    kansai     osaka  1988.500000
3     kanto     tokyo  1987.666667
In [212]:
# Using groupby as an iterator

# Take each region in turn:
# group receives the region key, subdf the rows belonging to that region
for group, subdf in df1.groupby('region'):
    print(group)
    print(subdf)
    print()
hokkaido
   birth_year      city   id    name    region
3        1997  hakodate  103  satoru  hokkaido

kansai
   birth_year   city   id   name  region
1        1989  osaka  101  akiko  kansai
2        1992  kyoto  102   yuki  kansai
6        1988  osaka  108    aoi  kansai
7        1990  kyoto  110  tarou  kansai

kanto
   birth_year   city   id     name region
0        1990  tokyo  100  hiroshi  kanto
4        1982  tokyo  104   steeve  kanto
5        1991  tokyo  106   mituru  kanto

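A single group can also be pulled out directly with get_group(), without iterating; a sketch:
# just the kanto rows
df1.groupby('region').get_group('kanto')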
In [213]:
# Run several aggregations at once with the agg() method
# Sample data
data = pd.read_csv('student-mat.csv', sep=';')
data.head()
Out[213]:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  famrel  freetime  goout  Dalc  Walc  health  absences  G1  G2  G3
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...       4         3      4     1     1       3         6   5   6   6
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...       5         3      3     1     1       3         4   5   5   6
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...       4         3      2     2     3       3        10   7   8  10
3     GP   F   15       U     GT3       T     4     2   health  services  ...       3         2      2     1     1       5         2  15  14  15
4     GP   F   16       U     GT3       T     3     3    other     other  ...       4         3      2     1     2       5         4   6  10  10

5 rows × 33 columns

In [214]:
# Apply count, mean, max, min to selected columns
functions = ['count', 'mean', 'max', 'min']
grouped_data = data.groupby(['sex', 'address'])
grouped_data[['age', 'G1']].agg(functions)
Out[214]:
              age                      G1
            count       mean max min count       mean max min
sex address
F   R          44  16.977273  19  15    44  10.295455  19   6
    U         164  16.664634  20  15   164  10.707317  18   4
M   R          44  17.113636  21  15    44  10.659091  18   3
    U         143  16.517483  22  15   143  11.405594  19   5
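agg() also accepts a dict mapping each column to its own function(s); a sketch:
# a different aggregation per column
grouped_data.agg({'age': 'mean', 'G1': ['max', 'min']})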
In [215]:
# Handling missing data and outliers

# Generate sample data
np.random.seed(0)
df = pd.DataFrame(np.random.rand(10, 4))
df
Out[215]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.423655  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.791725  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
5  0.978618  0.799159  0.461479  0.780529
6  0.118274  0.639921  0.143353  0.944669
7  0.521848  0.414662  0.264556  0.774234
8  0.456150  0.568434  0.018790  0.617635
9  0.612096  0.616934  0.943748  0.681820
In [216]:
# Set some cells to NA
from numpy import nan as NA

df.iloc[1,0] = NA
df.iloc[2:3, 2] = NA
df.iloc[5:, 3] = NA
df
Out[216]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1       NaN  0.645894  0.437587  0.891773
2  0.963663  0.383442       NaN  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
5  0.978618  0.799159  0.461479       NaN
6  0.118274  0.639921  0.143353       NaN
7  0.521848  0.414662  0.264556       NaN
8  0.456150  0.568434  0.018790       NaN
9  0.612096  0.616934  0.943748       NaN
In [217]:
# Listwise deletion: dropna() removes every row containing NaN
df.dropna()
Out[217]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
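dropna() can be relaxed with thresh=, which keeps any row having at least that many non-missing values; a sketch:
# keep rows with at least 3 non-NaN values out of the 4 columns
df.dropna(thresh=3)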
In [218]:
# Pairwise deletion: first select the columns you definitely need, then dropna()
df[[0,1]].dropna()
Out[218]:
          0         1
0  0.548814  0.715189
2  0.963663  0.383442
3  0.568045  0.925597
4  0.020218  0.832620
5  0.978618  0.799159
6  0.118274  0.639921
7  0.521848  0.414662
8  0.456150  0.568434
9  0.612096  0.616934
In [219]:
# Fill missing values with fillna()
df.fillna(0).head()
Out[219]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.000000  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.000000  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
In [220]:
# Forward fill (ffill): fill with the previous value
df.fillna(method = 'ffill').head()
Out[220]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.548814  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.437587  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
In [221]:
# Fill with the column means
df.fillna(df.mean()).head()
Out[221]:
          0         1         2         3
0  0.548814  0.715189  0.602763  0.544883
1  0.531970  0.645894  0.437587  0.891773
2  0.963663  0.383442  0.413497  0.528895
3  0.568045  0.925597  0.071036  0.087129
4  0.020218  0.832620  0.778157  0.870012
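Missing values in ordered data can also be interpolated instead of filled with a constant; a sketch with the default linear interpolation:
# linear interpolation along the index
df.interpolate().head()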
In [222]:
# Working with time-series data
import pandas_datareader.data as pdr

start_date = '2001/1/2'
end_date = '2016/12/30'
df = pdr.DataReader('DEXJPUS', 'fred', start_date, end_date)
df.head()
Out[222]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-08 115.97
In [223]:
# Select the data for a specific year and month
df['2016-04'].head()
Out[223]:
DEXJPUS
DATE
2016-04-01 112.06
2016-04-04 111.18
2016-04-05 110.26
2016-04-06 109.63
2016-04-07 107.98
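Newer pandas deprecates selecting rows by a partial date string through []; loc is the supported spelling. A sketch:
# same month selection via loc
df.loc['2016-04'].head()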
In [224]:
# Resample to monthly ('M') frequency and take each month's last value
# resample = change the frequency; other rules include 'D' (daily) and 'Y' (yearly)
df.resample('M').last().head()
Out[224]:
DEXJPUS
DATE
2001-01-31 116.39
2001-02-28 117.28
2001-03-31 125.54
2001-04-30 123.57
2001-05-31 118.88
In [225]:
# Gaps appear when upsampling to daily ('D')
df.resample('D').last().head()
Out[225]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-06 NaN
In [226]:
# Fill with the previous day's value (ffill)
df.resample('D').ffill().head()
Out[226]:
DEXJPUS
DATE
2001-01-02 114.73
2001-01-03 114.26
2001-01-04 115.47
2001-01-05 116.19
2001-01-06 116.19
In [227]:
# Shift the data one day with shift() and compute the day-over-day ratio
df['ratio'] = df / df.shift(1)

df.head()
Out[227]:
            DEXJPUS     ratio
DATE
2001-01-02   114.73       NaN
2001-01-03   114.26  0.995903
2001-01-04   115.47  1.010590
2001-01-05   116.19  1.006235
2001-01-08   115.97  0.998107
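A common alternative in time-series work is the log return, which is close to pct_change for small moves; a sketch (not assigned to df, so the outputs below are unchanged):
# one-day log return: log(x_t) - log(x_{t-1})
np.log(df['DEXJPUS']).diff().head()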
In [228]:
# Compute the one-day difference with diff()
df['diff'] = df['DEXJPUS'].diff(1)

df.head()
Out[228]:
            DEXJPUS     ratio  diff
DATE
2001-01-02   114.73       NaN   NaN
2001-01-03   114.26  0.995903 -0.47
2001-01-04   115.47  1.010590  1.21
2001-01-05   116.19  1.006235  0.72
2001-01-08   115.97  0.998107 -0.22
In [229]:
# pct_change() also computes the rate of change (the ratio minus 1)
df['pct_change'] = df['DEXJPUS'].pct_change(1)
df.head()
Out[229]:
            DEXJPUS     ratio  diff  pct_change
DATE
2001-01-02   114.73       NaN   NaN         NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097
2001-01-04   115.47  1.010590  1.21    0.010590
2001-01-05   116.19  1.006235  0.72    0.006235
2001-01-08   115.97  0.998107 -0.22   -0.001893
In [230]:
# Trailing 3-day moving average
df['mean_3'] = df['DEXJPUS'].rolling(3).mean()
df.head()
Out[230]:
            DEXJPUS     ratio  diff  pct_change      mean_3
DATE
2001-01-02   114.73       NaN   NaN         NaN         NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097         NaN
2001-01-04   115.47  1.010590  1.21    0.010590  114.820000
2001-01-05   116.19  1.006235  0.72    0.006235  115.306667
2001-01-08   115.97  0.998107 -0.22   -0.001893  115.876667
In [231]:
# Trailing 3-day standard deviation with the std() method
df['std_3'] = df['DEXJPUS'].rolling(3).std() 
df.head()
Out[231]:
            DEXJPUS     ratio  diff  pct_change      mean_3     std_3
DATE
2001-01-02   114.73       NaN   NaN         NaN         NaN       NaN
2001-01-03   114.26  0.995903 -0.47   -0.004097         NaN       NaN
2001-01-04   115.47  1.010590  1.21    0.010590  114.820000  0.610000
2001-01-05   116.19  1.006235  0.72    0.006235  115.306667  0.975312
2001-01-08   115.97  0.998107 -0.22   -0.001893  115.876667  0.368963
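An exponentially weighted mean is a common smoother alternative to a fixed rolling window; a sketch:
# exponentially weighted moving average with span 3
df['DEXJPUS'].ewm(span=3).mean().head()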
In [232]:
# =============== Script End =================