In [2]:
import pandas as pd
In [3]:
# Create a Series from a plain Python list
li = [1,2,3,4,5]
ser = pd.Series(li)
ser
Out[3]:
0    1
1    2
2    3
3    4
4    5
dtype: int64
In [4]:
# Create a DataFrame from a nested list (each inner list is one row)
li = [['aaa', 1, True],
      ['bbb', 2, False],
      ['ccc', 3, False]]
df = pd.DataFrame(li)
df
# Caution: mixing strings into a column prevents numeric operations on it
Out[4]:
0 1 2
0 aaa 1 True
1 bbb 2 False
2 ccc 3 False
In [5]:
# Inspect a DataFrame
import numpy as np
import pandas as pd

df =pd.DataFrame(np.arange(100).reshape(25,4))
df.head(3) # first 3 rows (NOTE: result discarded — only the last expression of a cell is displayed)
df.tail(3) # last 3 rows (result also discarded)
df.shape   # (25, 4)  25 rows x 4 columns
Out[5]:
(25, 4)
In [6]:
# Define the index and columns of a DataFrame

df = pd.DataFrame(np.arange(6).reshape(3,2))

df.index = ['001', '002', '003']
df.columns =['Value_1', 'Value_2']
df
Out[6]:
Value_1 Value_2
001 0 1
002 2 3
003 4 5
In [7]:
# Row selection: df[0:2] slices the first two rows by position
df[0:2]
Out[7]:
Value_1 Value_2
001 0 1
002 2 3
In [8]:
# Column selection by label (returns a Series)
df['Value_1']
Out[8]:
001    0
002    2
003    4
Name: Value_1, dtype: int32
In [9]:
# Selecting multiple columns: pass the labels as a list
df[['Value_1','Value_2']]
Out[9]:
Value_1 Value_2
001 0 1
002 2 3
003 4 5
In [10]:
# Create a DataFrame from a dictionary (keys become column names)
dic ={'aaa':[1,2,3,4], 'bbb':[5,6,7,8], 'c':[9,10,11,12]}
df = pd.DataFrame(dic)
df
Out[10]:
aaa bbb c
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
In [11]:
# Location-based selection with loc / iloc
import numpy as np
import pandas as pd

# Generate a 5-row x 4-column DataFrame
# (the original comment said "24 rows x 4 columns", which did not match the code)
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
df.index = ['row1', 'row2', 'row3', 'row4', 'row5']
df.columns =['A', 'B', 'C', 'D', ]
df
Out[11]:
A B C D
row1 0 1 2 3
row2 4 5 6 7
row3 8 9 10 11
row4 12 13 14 15
row5 16 17 18 19
In [12]:
df.loc[:,:] # unambiguous: loc always specifies both index and columns (result discarded)
df.loc[:,'B'] # select column 'B' as a Series
Out[12]:
row1     1
row2     5
row3     9
row4    13
row5    17
Name: B, dtype: int32
In [13]:
# Scalar lookup: row 'row3', column 'B'
df.loc['row3','B']
Out[13]:
9
In [14]:
# Multiple row labels are passed as a list
df.loc[['row1', 'row2'], 'B']
Out[14]:
row1    1
row2    5
Name: B, dtype: int32
In [15]:
df.iloc[0,0] # 0 (integer-position lookup; result discarded)
df.iloc[2:4, :] # rows at positions 2-3, all columns
Out[15]:
A B C D
row3 8 9 10 11
row4 12 13 14 15
In [16]:
# Load data from files.
# FIX: pandas.read_excel no longer accepts an `encoding` argument —
# it was deprecated and then removed (pandas 1.2), so passing it
# raises TypeError on modern pandas; the Excel engine handles
# encoding itself.
df_csv = pd.read_csv('data/sample_data.csv', encoding='utf-8', index_col=0, parse_dates=True)
df_excel = pd.read_excel('data/sample_data.xlsx', index_col=0, parse_dates=True)
df = df_csv
In [17]:
# Extract tables from HTML <table> elements (kept commented out — no URL set)
# url = ""
# tables = pd.read_html(url)
# df_html = tables[0]
In [18]:
# Write data to files
df.to_csv('data/write_data.csv')       # CSV
df.to_excel('data/write_data.xlsx')    # Excel
df.to_pickle('data/write_data.pickle') # pickle
In [19]:
# Read back the saved data
df_csv = pd.read_csv('data/sample_data.csv', encoding='utf-8')
df_pickle = pd.read_pickle('data/write_data.pickle')
In [20]:
# Display the DataFrame restored from pickle
df_pickle
Out[20]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row1 1 1.01 aaa True aaa aaa
row2 2 2.01 bbb False bbb bbb
row3 3 3.01 aaa True NaN 4.01
row4 4 4.01 bbb False NaN 5.01
row5 5 5.01 aaa True NaN 7
row6 6 6.01 bbb False bbb 8
row7 7 7.01 aaa True aaa NaN
row8 8 8.01 bbb False bbb NaN
row9 9 9.01 aaa True aaa FALSE
row10 10 10.01 bbb False bbb TRUE
In [21]:
# Conditional extraction: build a boolean Series from a comparison
df_bool = df_pickle['column_2'] >= 5
df_bool
Out[21]:
rows
row1     False
row2     False
row3     False
row4     False
row5      True
row6      True
row7      True
row8      True
row9      True
row10     True
Name: column_2, dtype: bool
In [22]:
# Boolean indexing keeps only the rows where the condition is True
df_selected = df_pickle[df_pickle['column_2'] > 5]
df_selected
Out[22]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row5 5 5.01 aaa True NaN 7
row6 6 6.01 bbb False bbb 8
row7 7 7.01 aaa True aaa NaN
row8 8 8.01 bbb False bbb NaN
row9 9 9.01 aaa True aaa FALSE
row10 10 10.01 bbb False bbb TRUE
In [23]:
# Conditional extraction with the SQL-like query() method
df_pickle.query('column_2 >= 5 and column_2 <= 9')
Out[23]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row5 5 5.01 aaa True NaN 7
row6 6 6.01 bbb False bbb 8
row7 7 7.01 aaa True aaa NaN
row8 8 8.01 bbb False bbb NaN
In [24]:
# Check each column's dtype with df.dtypes
df_pickle.dtypes
Out[24]:
column_1      int64
column_2    float64
column_3     object
column_4       bool
column_5     object
column_6     object
dtype: object
In [25]:
# Change column types
df = df_pickle # renamed for readability in the cells below
# NOTE(review): this is an alias, not a copy — mutating df also mutates df_pickle

df.loc[:,'date'] = df.loc[:,'column_1']
df['date'] = df['date'].apply(pd.to_datetime) # element-wise; the ints become nanosecond timestamps (see output)
df
Out[25]:
column_1 column_2 column_3 column_4 column_5 column_6 date
rows
row1 1 1.01 aaa True aaa aaa 1970-01-01 00:00:00.000000001
row2 2 2.01 bbb False bbb bbb 1970-01-01 00:00:00.000000002
row3 3 3.01 aaa True NaN 4.01 1970-01-01 00:00:00.000000003
row4 4 4.01 bbb False NaN 5.01 1970-01-01 00:00:00.000000004
row5 5 5.01 aaa True NaN 7 1970-01-01 00:00:00.000000005
row6 6 6.01 bbb False bbb 8 1970-01-01 00:00:00.000000006
row7 7 7.01 aaa True aaa NaN 1970-01-01 00:00:00.000000007
row8 8 8.01 bbb False bbb NaN 1970-01-01 00:00:00.000000008
row9 9 9.01 aaa True aaa FALSE 1970-01-01 00:00:00.000000009
row10 10 10.01 bbb False bbb TRUE 1970-01-01 00:00:00.000000010
In [26]:
# Convert from int to float with astype
df['float'] = df['column_1'].astype(np.float32)
df
Out[26]:
column_1 column_2 column_3 column_4 column_5 column_6 date float
rows
row1 1 1.01 aaa True aaa aaa 1970-01-01 00:00:00.000000001 1.0
row2 2 2.01 bbb False bbb bbb 1970-01-01 00:00:00.000000002 2.0
row3 3 3.01 aaa True NaN 4.01 1970-01-01 00:00:00.000000003 3.0
row4 4 4.01 bbb False NaN 5.01 1970-01-01 00:00:00.000000004 4.0
row5 5 5.01 aaa True NaN 7 1970-01-01 00:00:00.000000005 5.0
row6 6 6.01 bbb False bbb 8 1970-01-01 00:00:00.000000006 6.0
row7 7 7.01 aaa True aaa NaN 1970-01-01 00:00:00.000000007 7.0
row8 8 8.01 bbb False bbb NaN 1970-01-01 00:00:00.000000008 8.0
row9 9 9.01 aaa True aaa FALSE 1970-01-01 00:00:00.000000009 9.0
row10 10 10.01 bbb False bbb TRUE 1970-01-01 00:00:00.000000010 10.0
In [27]:
# Sorting with .sort_values()
df.sort_values(by = 'column_2', ascending=True)  # ascending (result discarded — only the last line is displayed)
df.sort_values(by = 'column_2', ascending=False) # descending
Out[27]:
column_1 column_2 column_3 column_4 column_5 column_6 date float
rows
row10 10 10.01 bbb False bbb TRUE 1970-01-01 00:00:00.000000010 10.0
row9 9 9.01 aaa True aaa FALSE 1970-01-01 00:00:00.000000009 9.0
row8 8 8.01 bbb False bbb NaN 1970-01-01 00:00:00.000000008 8.0
row7 7 7.01 aaa True aaa NaN 1970-01-01 00:00:00.000000007 7.0
row6 6 6.01 bbb False bbb 8 1970-01-01 00:00:00.000000006 6.0
row5 5 5.01 aaa True NaN 7 1970-01-01 00:00:00.000000005 5.0
row4 4 4.01 bbb False NaN 5.01 1970-01-01 00:00:00.000000004 4.0
row3 3 3.01 aaa True NaN 4.01 1970-01-01 00:00:00.000000003 3.0
row2 2 2.01 bbb False bbb bbb 1970-01-01 00:00:00.000000002 2.0
row1 1 1.01 aaa True aaa aaa 1970-01-01 00:00:00.000000001 1.0
In [28]:
# Use a function for conditional branching
def judge(value):
    """Classify a numeric value into a rating label.

    Returns 'good' for value >= 7, 'nomal' for 4 <= value < 7,
    otherwise 'bad'.

    Note: 'nomal' (sic) is kept exactly as-is because later cells and
    the saved outputs (e.g. the J_nomal one-hot column) depend on it.
    """
    if value >= 7:
        return 'good'
    # FIX: use elif — the original bare `if` only worked because the
    # branch above returns; elif makes the three-way split explicit.
    elif value >= 4:
        return 'nomal'
    else:
        return 'bad'
    
# Apply judge() element-wise to column_2 and store the labels in 'judged'
df['judged'] = df['column_2'].apply(judge)
df
Out[28]:
column_1 column_2 column_3 column_4 column_5 column_6 date float judged
rows
row1 1 1.01 aaa True aaa aaa 1970-01-01 00:00:00.000000001 1.0 bad
row2 2 2.01 bbb False bbb bbb 1970-01-01 00:00:00.000000002 2.0 bad
row3 3 3.01 aaa True NaN 4.01 1970-01-01 00:00:00.000000003 3.0 bad
row4 4 4.01 bbb False NaN 5.01 1970-01-01 00:00:00.000000004 4.0 nomal
row5 5 5.01 aaa True NaN 7 1970-01-01 00:00:00.000000005 5.0 nomal
row6 6 6.01 bbb False bbb 8 1970-01-01 00:00:00.000000006 6.0 nomal
row7 7 7.01 aaa True aaa NaN 1970-01-01 00:00:00.000000007 7.0 good
row8 8 8.01 bbb False bbb NaN 1970-01-01 00:00:00.000000008 8.0 good
row9 9 9.01 aaa True aaa FALSE 1970-01-01 00:00:00.000000009 9.0 good
row10 10 10.01 bbb False bbb TRUE 1970-01-01 00:00:00.000000010 10.0 good
In [29]:
# One-hot encode the 'judged' labels ('J' becomes the column-name prefix)
df_moved = pd.get_dummies(df['judged'], prefix='J')
df_moved
Out[29]:
J_bad J_good J_nomal
rows
row1 1 0 0
row2 1 0 0
row3 1 0 0
row4 0 0 1
row5 0 0 1
row6 0 0 1
row7 0 1 0
row8 0 1 0
row9 0 1 0
row10 0 1 0
In [30]:
# Save as pickle for use in a later cell
pd.to_pickle(df_moved, 'data/df_moved.pickle')
In [31]:
# Create time-series data (one row per day, 2019-01-01 .. 2019-01-30)
dates = pd.date_range(start='2019-01-01', end='2019-1-30')
np.random.seed(0) # fix the seed so the random values are reproducible
df =pd.DataFrame(np.random.randint(1,31,30), index=dates, columns=['rand_value'])
df.head()
Out[31]:
rand_value
2019-01-01 13
2019-01-02 16
2019-01-03 22
2019-01-04 1
2019-01-05 4
In [32]:
# Create time-series data 2: 365 days starting 2019-01-01
dates = pd.date_range(start='2019-01-01', periods=365)
np.random.seed(1) # reproducible random values
df = pd.DataFrame(np.random.randint(1, 31, 365), index=dates, columns=['rand_value'])
df.head()
Out[32]:
rand_value
2019-01-01 6
2019-01-02 12
2019-01-03 13
2019-01-04 9
2019-01-05 10
In [33]:
# Compute monthly means with .groupby(pd.Grouper())
# NOTE(review): freq='M' (month-end) is deprecated in pandas >= 2.2 in favor of 'ME' — confirm target version
df.groupby(pd.Grouper(freq='M')).mean().head()
Out[33]:
rand_value
2019-01-31 14.870968
2019-02-28 15.000000
2019-03-31 15.387097
2019-04-30 15.700000
2019-05-31 14.193548
In [34]:
# Compute monthly means 2: .resample()
df['rand_value'].resample('M').mean().head()
# Returns a Series
Out[34]:
2019-01-31    14.870968
2019-02-28    15.000000
2019-03-31    15.387097
2019-04-30    15.700000
2019-05-31    14.193548
Freq: M, Name: rand_value, dtype: float64
In [35]:
# Generate all Saturday dates for the year
pd.date_range(start='2019-01-01', end='2019-12-31', freq='W-SAT')
Out[35]:
DatetimeIndex(['2019-01-05', '2019-01-12', '2019-01-19', '2019-01-26',
               '2019-02-02', '2019-02-09', '2019-02-16', '2019-02-23',
               '2019-03-02', '2019-03-09', '2019-03-16', '2019-03-23',
               '2019-03-30', '2019-04-06', '2019-04-13', '2019-04-20',
               '2019-04-27', '2019-05-04', '2019-05-11', '2019-05-18',
               '2019-05-25', '2019-06-01', '2019-06-08', '2019-06-15',
               '2019-06-22', '2019-06-29', '2019-07-06', '2019-07-13',
               '2019-07-20', '2019-07-27', '2019-08-03', '2019-08-10',
               '2019-08-17', '2019-08-24', '2019-08-31', '2019-09-07',
               '2019-09-14', '2019-09-21', '2019-09-28', '2019-10-05',
               '2019-10-12', '2019-10-19', '2019-10-26', '2019-11-02',
               '2019-11-09', '2019-11-16', '2019-11-23', '2019-11-30',
               '2019-12-07', '2019-12-14', '2019-12-21', '2019-12-28'],
              dtype='datetime64[ns]', freq='W-SAT')
In [36]:
# Aggregate the data into weekly bins ending on Saturday
# NOTE(review): the outer pd.DataFrame(...) wrapper looks redundant — groupby().sum() already returns a DataFrame
df_year = pd.DataFrame(df.groupby(pd.Grouper(freq='W-SAT')).sum(), columns=['rand_value'])
df_year.head()
Out[36]:
rand_value
2019-01-05 50
2019-01-12 67
2019-01-19 124
2019-01-26 127
2019-02-02 127
In [37]:
# Handling missing values
df_nan = pd.read_pickle('data/write_data.pickle')

# Drop rows that contain any missing value
df_droped = df_nan.dropna()
df_droped
Out[37]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row1 1 1.01 aaa True aaa aaa
row2 2 2.01 bbb False bbb bbb
row6 6 6.01 bbb False bbb 8
row9 9 9.01 aaa True aaa FALSE
row10 10 10.01 bbb False bbb TRUE
In [38]:
# Fill missing values with 0
df_fillna = df_nan.fillna(0)
df_fillna
Out[38]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row1 1 1.01 aaa True aaa aaa
row2 2 2.01 bbb False bbb bbb
row3 3 3.01 aaa True 0 4.01
row4 4 4.01 bbb False 0 5.01
row5 5 5.01 aaa True 0 7
row6 6 6.01 bbb False bbb 8
row7 7 7.01 aaa True aaa 0
row8 8 8.01 bbb False bbb 0
row9 9 9.01 aaa True aaa FALSE
row10 10 10.01 bbb False bbb TRUE
In [39]:
#欠損値を1つ手前の値で補完する
df_fill = df_nan.fillna(method='ffill')
df_fill
Out[39]:
column_1 column_2 column_3 column_4 column_5 column_6
rows
row1 1 1.01 aaa True aaa aaa
row2 2 2.01 bbb False bbb bbb
row3 3 3.01 aaa True bbb 4.01
row4 4 4.01 bbb False bbb 5.01
row5 5 5.01 aaa True bbb 7
row6 6 6.01 bbb False bbb 8
row7 7 7.01 aaa True aaa 8
row8 8 8.01 bbb False bbb 8
row9 9 9.01 aaa True aaa FALSE
row10 10 10.01 bbb False bbb TRUE
In [40]:
# Concatenating data
df = pd.read_pickle('data/df_sample.pickle')
df_moved = pd.read_pickle('data/df_moved.pickle')

# Column-wise concatenation (axis=1 joins on the index)
df_merged = pd.concat([df, df_moved], axis=1)
df_merged.head()
C:\Users\keita\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py:107: RuntimeWarning: '<' not supported between instances of 'float' and 'str', sort order is undefined for incomparable objects
  result = result.union(other)
Out[40]:
column_2 column_3 column_4 column_5 column_6 c2*c4 jadged judged J_bad J_good J_nomal
1.0 1.01 aaa True aaa aaa 1.01 bad bad NaN NaN NaN
2.0 2.01 bbb False bbb bbb 0.00 bad bad NaN NaN NaN
3.0 3.01 aaa True NaN 4.01 3.01 bad bad NaN NaN NaN
4.0 4.01 bbb False NaN 5.01 0.00 nomal nomal NaN NaN NaN
5.0 5.01 aaa True NaN 7 5.01 nomal nomal NaN NaN NaN
In [41]:
# Row-wise concatenation
df_1 = df.loc[1:2,:]   # NOTE(review): label-based slice — assumes numeric index labels; verify against the loaded pickle
df_2 = df.loc[9:10,:]

df = pd.concat([df_1, df_2], axis=0, sort=True) # sort=True orders the columns alphabetically
df 
Out[41]:
c2*c4 column_2 column_3 column_4 column_5 column_6 jadged judged
float
1.0 1.01 1.01 aaa True aaa aaa bad bad
2.0 0.00 2.01 bbb False bbb bbb bad bad
9.0 9.01 9.01 aaa True aaa FALSE good good
10.0 0.00 10.01 bbb False bbb TRUE good good
In [42]:
# Compute basic descriptive statistics on a random integer column
np.random.seed(2)  # reproducible random draw
df = pd.DataFrame(np.random.randint(0,11,100), columns=['value'])

# Name the column once instead of repeating the lookup on every line
values = df.loc[:,'value']
print(values.max(),
      values.min(),
      values.mode(),   # most frequent value (a Series — several values can tie)
      values.mean(),
      values.median(),
      values.std(ddof=0), # population standard deviation
      values.std(ddof=1), # sample standard deviation
     )
10 0 0    10
dtype: int32 5.55 6.0 3.1124748994971836 3.128154973039785
In [43]:
# Count the rows where value == 5
df[df.loc[:,'value'] == 5].count()
Out[43]:
value    9
dtype: int64
In [44]:
# Summary statistics in one call
df.describe()
Out[44]:
value
count 100.000000
mean 5.550000
std 3.128155
min 0.000000
25% 3.000000
50% 6.000000
75% 8.000000
max 10.000000
In [45]:
# Correlation coefficients and a scatter-matrix plot
data = np.random.randn(300).reshape(100,3)
df = pd.DataFrame(data)
df.head() # result discarded — only the last expression of a cell is displayed

df.corr() # result also discarded

from pandas.plotting import scatter_matrix
%matplotlib inline

# assign to _ to suppress the axes-array repr
_ = scatter_matrix(df)
In [46]:
# Convert a pandas.DataFrame column to a numpy.array
df[0].values
Out[46]:
array([ 1.02210506e+00,  1.53636029e+00, -1.18854160e+00, -3.18840605e-01,
        1.95254214e-01,  1.20769783e+00,  4.17160217e-01,  1.20769557e-01,
        2.35854982e-01,  1.47630693e+00, -2.57855698e+00,  2.23092726e-01,
        5.48677242e-01,  1.29975337e-01,  1.33354447e-01, -1.63213498e+00,
        1.16837169e+00,  4.08842472e-02, -3.95195402e-01, -1.17113099e+00,
        5.57721015e-01,  2.10202473e-02,  1.96504656e-01,  6.38925860e-01,
       -3.77376397e-02,  9.26374048e-03, -4.32477714e-02,  8.19477959e-01,
       -2.74608267e-01, -1.11840069e+00,  1.02868791e+00, -9.17827897e-02,
        1.00139308e+00,  1.36927887e+00, -3.32376106e-01, -8.98514459e-01,
        6.60488981e-01,  1.30103340e+00,  5.00463181e-01, -3.79377283e-01,
       -5.18945829e-01,  9.09490540e-01, -1.08860575e+00, -2.81766922e-01,
       -8.00976861e-01,  1.37357822e+00, -1.28768987e+00, -9.37719104e-01,
       -7.47252631e-01, -5.76454194e-01, -1.41699281e+00, -1.36057075e-01,
       -8.81027234e-01, -1.77330774e+00,  3.31707535e-02, -1.04990183e-01,
       -1.71142690e+00, -9.12673226e-01, -1.08172822e+00,  1.99335230e+00,
       -3.86961106e-01, -2.09225941e-01, -4.27889084e-01,  6.06603798e-01,
       -3.36370275e-01,  1.80052517e+00,  8.33292653e-01,  1.33480770e+00,
       -1.44644975e+00,  6.66240193e-01, -1.64386955e+00,  6.11191152e-01,
        4.77865842e-01,  9.18109263e-01, -7.38847589e-01,  1.57718123e+00,
        8.67298665e-01, -2.42036059e-01, -3.21914568e-01,  1.01525122e+00,
       -5.69088895e-01,  8.48138843e-01, -2.13080187e-02,  1.73029626e+00,
        3.23823588e-01,  1.20917022e+00, -8.33636664e-01, -1.29003470e+00,
       -3.13110100e-01,  3.57341451e-01, -4.10464725e-01, -1.84265262e-03,
       -6.61755432e-01, -6.22129678e-01, -8.87283517e-01,  1.16993295e+00,
        3.32767248e-01, -4.41954269e-01,  4.31653084e-01, -1.20600785e+00])
In [ ]:
# ---- end of script ----