# chapter 2
# Quick reference for IPython magic commands
%quickref

# numpy拡張マジックコマンド
%precision 3       

# matplotlib拡張マジックコマンド
%matplotlib inline

# 計算モジュール インポート
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# 可視化モジュール インポート
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# Display floats to three decimal places
%precision 3

'%.3f'

# NumPy basics: build a 1-D array and inspect it
import numpy as np
ar = np.array([1, 2, 3, 4, 5])
print(ar)

# Element type, number of dimensions and element count, one per line
print(ar.dtype, ar.ndim, ar.size, sep='\n')

[1 2 3 4 5]
int32
1
5

# Arithmetic on an array is element-wise (mathematical), unlike on a Python list
ar     # array([1, 2, 3, 4, 5])
ar * 2 # array([ 2,  4,  6,  8, 10])
ar / 2 # array([0.5, 1. , 1.5, 2. , 2.5])
ar **2 # array([ 1,  4,  9, 16, 25], dtype=int32)

array([ 1,  4,  9, 16, 25], dtype=int32)

# sort
ar_before = [3,7,2,5,6,7,2]
ar_after = ar_before.sort()

print(ar_before)#[2, 2, 3, 5, 6, 7, 7]
print(ar_after) #None
# Destructive: ar_before itself has been rewritten, and sort() returned None.

# An attempt at sorting in descending order
ar_before = [3,7,2,5,6,7,2]
ar_before[::-1].sort()
print(ar_before)

# This does not work: the slice [::-1] builds a new list, so sort() orders
# that temporary copy and ar_before is left unchanged.

# The list.sort() method is destructive (in-place) -- not my favourite.
ar_before = [3,7,2,5,6,7,2]
ar_before.sort() # sort method, ascending order
print(ar_before)

ar_before = [3,7,2,5,6,7,2]
ar_before.sort(reverse=True)# sort method, reverse=True gives descending order
print(ar_before) 

# The built-in sorted() is non-destructive -- recommended.
ar_before = [3,7,2,5,6,7,2]
ar_after = sorted(ar_before)
print(ar_after)
print(ar_before) # the original list has not been modified

ar_before = [3,7,2,5,6,7,2]
ar_after = sorted(ar_before, reverse=True) # reverse=True gives descending order
print(ar_after)
print(ar_before) # the original list has not been modified

[2, 2, 3, 5, 6, 7, 7]
None
[3, 7, 2, 5, 6, 7, 2]
[2, 2, 3, 5, 6, 7, 7]
[7, 7, 6, 5, 3, 2, 2]
[2, 2, 3, 5, 6, 7, 7]
[3, 7, 2, 5, 6, 7, 2]
[7, 7, 6, 5, 3, 2, 2]
[3, 7, 2, 5, 6, 7, 2]

# Aggregation methods: max, min, sum, cumsum
ar = np.array([1, 2, 3, 4, 5])  # a plain list has none of these methods

running_total = ar.cumsum()
grand_total = ar.sum()
print(ar.max(), ar.min(), grand_total, running_total, sep='\n')

# Cumulative share of the total
running_total / grand_total

5
1
15
[ 1  3  6 10 15]

array([0.067, 0.2  , 0.4  , 0.667, 1.   ])

# Random numbers
import numpy.random as random
random.seed(0)  # fix the seed so the run is reproducible
rand_data = random.random(100)  # 100 uniform samples
print(rand_data)
# Maximum, minimum and mean; mean() follows the array length rather than a
# hard-coded divisor of 100
print(rand_data.max(), rand_data.min(), rand_data.mean())

# Normal distribution with mean 0 and variance 1
dist_normal = random.randn(10)
print(dist_normal)

[0.549 0.715 0.603 0.545 0.424 0.646 0.438 0.892 0.964 0.383 0.792 0.529
 0.568 0.926 0.071 0.087 0.02  0.833 0.778 0.87  0.979 0.799 0.461 0.781
 0.118 0.64  0.143 0.945 0.522 0.415 0.265 0.774 0.456 0.568 0.019 0.618
 0.612 0.617 0.944 0.682 0.36  0.437 0.698 0.06  0.667 0.671 0.21  0.129
 0.315 0.364 0.57  0.439 0.988 0.102 0.209 0.161 0.653 0.253 0.466 0.244
 0.159 0.11  0.656 0.138 0.197 0.369 0.821 0.097 0.838 0.096 0.976 0.469
 0.977 0.605 0.739 0.039 0.283 0.12  0.296 0.119 0.318 0.414 0.064 0.692
 0.567 0.265 0.523 0.094 0.576 0.929 0.319 0.667 0.132 0.716 0.289 0.183
 0.587 0.02  0.829 0.005]
0.9883738380592262 0.004695476192547066 0.4727938395125177
[-1.165  0.901  0.466 -1.536  1.488  1.896  1.179 -0.18  -1.071  1.054]

# Sampling with random.choice(): duplicates are possible (drawn with replacement)
random.choice(rand_data, 5)

# No duplicates: replace=False
random.choice(rand_data, 5, replace=False)

array([0.094, 0.568, 0.071, 0.414, 0.612])

# Matrices
ar = np.arange(9) # array([0, 1, 2, 3, 4, 5, 6, 7, 8])

matrix = np.arange(9).reshape(3, 3)
'''
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
'''
# Extracting elements by slicing and indexing
ar[2:7] # array([2, 3, 4, 5, 6])

matrix[0,1] # 1
matrix[0,:] # array([0, 1, 2])

array([0, 1, 2])

# Matrix operations
matrix1 =np.arange(9).reshape(3,3)
matrix2 =np.arange(9).reshape(3,3)

# Dot product (matrix product)
np.dot(matrix1, matrix2)
'''
array([[ 15,  18,  21],
       [ 42,  54,  66],
       [ 69,  90, 111]]
'''

# Element-wise product
matrix1 * matrix2
'''
array([[ 0,  1,  4],
       [ 9, 16, 25],
       [36, 49, 64]])
'''

'\narray([[ 0,  1,  4],\n       [ 9, 16, 25],\n       [36, 49, 64]])\n'

# Zero matrix
np.zeros((3,4), np.int64) # the shape has to be passed as a tuple

# Matrix of ones
np.ones((3,4), np.int64)

array([[1, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int64)

# Exercise: sum of the integers 0 through 50
ar = np.arange(51)

print(ar.sum())

1275

# Exercise: max, min and sum of 10 samples drawn with randn
data = random.randn(10)
print(data.max(), data.min(), data.sum())

0.4717433848177106 -1.4341543937355707 -3.0313154100501407

# Exercise: 5x5 matrix of ones, multiplied by 3 and squared element-wise
(np.ones((5,5)) * 3 ) ** 2

array([[9., 9., 9., 9., 9.],
       [9., 9., 9., 9., 9.],
       [9., 9., 9., 9., 9.],
       [9., 9., 9., 9., 9.],
       [9., 9., 9., 9., 9.]])

# chapter 2-3 scipy
# 線形代数モジュール
import scipy.linalg as linarg

# 最適化計算用の関数
from scipy.optimize import minimize_scalar

# Determinant and inverse matrix
matrix = np.array([[1,-1,-1],[-1,1,-1],[-1,-1,1]])
'''
array([[ 1, -1, -1],
       [-1,  1, -1],
       [-1, -1,  1]])
'''
# Compute the determinant (det)
linarg.det(matrix) # -4.000

# Compute the inverse matrix (inv)
linarg.inv(matrix)
'''
array([[ 0. , -0.5, -0.5],
       [-0.5, -0. , -0.5],
       [-0.5, -0.5,  0. ]])
'''
# Check: multiply the matrix by its inverse, in both orders.
# (The note inside the string below reads "should be the identity matrix".)
matrix.dot(linarg.inv(matrix))
'''単位行列になるはず
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
'''
linarg.inv(matrix).dot(matrix)
'''
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
'''

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

# Eigenvalues and eigenvectors
eig_value, eig_vector = linarg.eig(matrix)
print('固有値：\n', eig_value)
print('固有ベクトル：\n',eig_vector)

固有値：
 [-1.+0.j  2.+0.j  2.+0.j]
固有ベクトル：
 [[ 0.577 -0.816  0.428]
 [ 0.577  0.408 -0.816]
 [ 0.577  0.408  0.389]]

# 最適化計算　ニュートン法
from scipy.optimize import newton

# Function definition
def func1(x):
    """Evaluate the quadratic x**2 + 2*x + 1 at x."""
    square_term = x**2
    linear_term = 2*x
    return square_term + linear_term + 1

# Solve func1(x) = 0
newton(func1,0) # -1.000; the second argument 0 is the initial guess x0, not the right-hand side of the equation

# Find the minimum
minimize_scalar(func1,method='brent')

'''
     fun: 0.0
    nfev: 9
     nit: 4
 success: True
       x: -1.0000000000000002
'''

     fun: 0.0
    nfev: 9
     nit: 4
 success: True
       x: -1.0000000000000002

# Exercise: determinant, inverse, eigenvalues and eigenvectors of a 3x3 matrix
# Built as an ndarray (as `matrix` is) instead of a nested list in parentheses
matrix_a = np.array([[1, 2, 3], [1, 3, 2], [3, 1, 2]])

linarg.det(matrix_a) # determinant: -12.000
linarg.inv(matrix_a) # inverse matrix
'''
array([[-0.333,  0.083,  0.417],
       [-0.333,  0.583, -0.083],
       [ 0.667, -0.417, -0.083]])
'''
eig_value, eig_vector = linarg.eig(matrix_a)
print(eig_value) # eigenvalues [ 6.   +0.j -1.414+0.j  1.414+0.j]
print(eig_vector)# eigenvectors
'''
[[-0.577 -0.722  0.16 ]
 [-0.577 -0.143 -0.811]
 [-0.577  0.677  0.563]]
 '''

[ 6.   +0.j -1.414+0.j  1.414+0.j]
[[-0.577 -0.722  0.16 ]
 [-0.577 -0.143 -0.811]
 [-0.577  0.677  0.563]]

'\n[[-0.577 -0.722  0.16 ]\n [-0.577 -0.143 -0.811]\n [-0.577  0.677  0.563]]\n '

# Exercise: Newton's method
def func(x):
    """Evaluate the cubic x**3 + 2*x + 1 at x."""
    cubic_term = x**3
    return cubic_term + 2*x + 1

newton(func, x0=0) # -0.453

-0.453

# chapter 2-4 pandas
from pandas import Series, DataFrame

# A Series built from the values 0, 10, ..., 50
data = pd.Series(list(range(0, 60, 10)))
data

0     0
1    10
2    20
3    30
4    40
5    50
dtype: int64

# Attach explicit index labels
index = list('abcdef')
data = pd.Series(list(range(0, 60, 10)), index=index)
data

a     0
b    10
c    20
d    30
e    40
f    50
dtype: int64

# The values and the index labels of the Series
data.values # array([ 0, 10, 20, 30, 40, 50], dtype=int64)
data.index  # Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

# DataFrame

# Define the source dictionary (column name -> list of values)
dic = dict(
    id=[100, 101, 102, 103],
    name=['aaa', 'bbb', 'ccc', 'ddd'],
    age=[5, 3, 7, 8],
)
dic

{'id': [100, 101, 102, 103],
 'name': ['aaa', 'bbb', 'ccc', 'ddd'],
 'age': [5, 3, 7, 8]}

# Convert the dictionary into a DataFrame
df = pd.DataFrame(dic)
df

# Assign index labels afterwards
df.index =['a','b','c','d']
df

# Select a single column
df.name
'''
a    aaa
b    bbb
c    ccc
d    ddd
Name: name, dtype: object
'''
df['name'] # equivalent to df.name

# To select several columns, pass a list of names
df[['name', 'age']]

# Transpose
df.T

# Extracting data with a filter
df

# Compare the column against 5; the result is a boolean Series
df['age'] == 5

a     True
b    False
c    False
d    False
Name: age, dtype: bool

# Use that boolean Series to keep only the rows marked True
df[df['age'] == 5]

# Matching against several values: isin
df['age']

a    5
b    3
c    7
d    8
Name: age, dtype: int64

# True where age is 3 or 7
df['age'].isin([3,7])

a    False
b     True
c     True
d    False
Name: age, dtype: bool

# Use that boolean Series as a row filter
df[df['age'].isin([3,7])]

# Exercise: select the rows with age 5 or more
df['age'] >= 5

a     True
b    False
c     True
d     True
Name: age, dtype: bool

df[df['age'] >= 5]

# Combining and deleting data
# Drop the 'age' column (axis=1)
df.drop(['age'], axis=1)

df # 'age' is still in the original frame; reassign with df = df.drop([...], axis=1) or pass inplace=True to change df itself

# Drop the rows with index labels 'b' and 'c' (axis=0)
df.drop(['b', 'c'], axis = 0)

# Combining data: merge
df1=pd.DataFrame({'id':[100,101,102,103], 'name':['aaa','bbb','ccc','ddd']})
df2=pd.DataFrame({'id':[100,101,102,103], 'age':[5,7,9,3]})
df3=pd.DataFrame({'id':[100,101,102,103], 'sex':['male','male','female','female']})

# merge finds the shared key 'id' automatically
df1_2  = pd.merge(df1, df2)
df1_2_3 = pd.merge(df1_2, df3)
df1_2_3

# Aggregation

# Mean age per sex
df1_2_3.groupby('sex')['age'].mean()

# Maximum age per sex
df1_2_3.groupby('sex')['age'].max()

sex
female    9
male      7
Name: age, dtype: int64

# Sorting

# Prepare the data
# NOTE: plain assignment makes df another name for df1_2_3 (no copy), so the
# index assignment below relabels df1_2_3 as well.
df = df1_2_3
df.index = ['b','a','c','d']
df

# Sort by index
df.sort_index()

# Sort the age column by value
df.age.sort_values()

d    3
b    5
a    7
c    9
Name: age, dtype: int64

# Missing data: detecting NaN
df1=pd.DataFrame({'id':[100,101,102,103], 'data':[435,432,24,64]})
df = df.merge(df1)
df

# Boolean mask marking the cells equal to 'female' (isin does not filter rows)
df.isin(['female'])

# Overwrite the whole 'data' column with NaN
df['data'] = np.nan
df

# Detect NaN with the isnull method
df.isnull()

# Count the NaN values per column
df.isnull().sum()

id      0
name    0
age     0
sex     0
data    4
dtype: int64

# Exercise: build a small table and aggregate it
data = {
    'ID': list('12345'),
    'sex': list('FFMMF'),
    'Money': [1000, 2000, 500, 300, 700],
    'Name': ['Saito', 'Horie', 'Kondo', 'Kawada', 'Matsubara'],
}

df = pd.DataFrame(data)
df

# Mean Money for each sex
money_by_sex = df.groupby(['sex'])['Money']
money_by_sex.mean()

sex
F    1233.333333
M     400.000000
Name: Money, dtype: float64

# Merge on the shared ID column, then print the means of Money, Math and English
data2 = {
    'ID': ['3', '4', '7'],
    'Math': [60, 30, 40],
    'English': [80, 20, 30],
}

df2 = pd.DataFrame(data2)
df2

df = df.merge(df2)
df

for column in ('Money', 'Math', 'English'):
    print(df[column].mean())

400.0
45.0
50.0

# chapter 2-5 matplotlib seaborn
import matplotlib as mpl
import seaborn as sbn

import matplotlib.pyplot as plt
%matplotlib inline

# Create the sample data
import numpy.random as random

random.seed(0)
x = random.randn(30)
y = np.sin(x) + np.random.randn(30)

# Line plot drawn with the 'o' marker only (the connecting line is suppressed)
plt.figure(figsize=(20,6))
plt.plot(x,y, 'o')
plt.grid(True)

# Scatter plot
plt.figure(figsize=(20,6))
plt.scatter(x,y)
plt.title('title', fontsize=24)
plt.xlabel('xlabel', fontsize=18)
plt.ylabel('ylabel', fontsize=18)
plt.grid(True)

# Continuous line graph: cumulative sum of 1000 randn samples
random.seed(1)
x = np.arange(1000)

y = np.random.randn(1000).cumsum()
plt.figure(figsize=(20,6))
plt.plot(x, y, label='label_a')
plt.grid(True)
plt.legend(loc='best')
plt.show()

# Split the figure into panels with subplot
plt.figure(figsize=(20, 6))

# Layout: 2 rows x 1 column

# Panel in row 1
plt.subplot(2,1,1)

# Panel in row 2
plt.subplot(2,1,2)

<matplotlib.axes._subplots.AxesSubplot at 0x1f2f607f0b8>

# 関数グラフ

def func1(x):
    """Return x**2 + 2*x + 1; applied element-wise when x is a NumPy array."""
    square_term = x**2
    linear_term = 2*x
    return square_term + linear_term + 1


# Plot func1 over the integers -10 .. 9
x = np.arange(-10,10)

plt.figure(figsize=(20,6))
plt.plot(x, func1(x))
plt.grid(True)

# Histogram
random.seed(2)

plt.figure(figsize=(20,6))

# hist takes the data, the number of bins, and the value range
data = np.random.randn(10**5) * 10 + 50

plt.hist(data, bins=60, range = (0,100))
plt.grid(True)

# ?plt.hist

# 練習
from scipy.optimize import newton

def func(x):
    """Evaluate the line 5*x + 3 at x."""
    slope, intercept = 5, 3
    return slope*x + intercept

# Plot func on -10 .. 10 in steps of 0.1, then find its root
x = np.arange(-10,10,0.1)
y = func(x)

plt.figure(figsize=(20,6))
plt.plot(x, y, 'o')
plt.xlim(-10, 10)
plt.grid(True)

# Root of func, starting the search from the initial guess 0
newton(func, 0)

-0.600

# Exercise: plot sin(x) and cos(x) on the same axes
x = np.arange(-10, 10, 0.1)

y1 = np.sin(x)
y2 = np.cos(x)

plt.figure(figsize=(20, 6))
plt.xlim(-10, 10)
plt.plot(x, y1, label='sin(x)')
plt.plot(x, y2, label='cos(x)')
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x1f2fba96b38>

# Exercise: histograms of uniform samples of increasing size
import numpy.random as random

# 100 samples
random.seed(0)
x1 = random.uniform(0, 1, 100)

# 1,000 samples
random.seed(1)
x2 = random.uniform(0, 1, 1000)

# 10,000 samples
random.seed(0)
x3 = random.uniform(0, 1, 10000)

# 100,000 samples
random.seed(1)
x4 = random.uniform(0, 1, 100000)

plt.figure(figsize=(20,10))

# One panel per sample, stacked in 4 rows; each uses 20 bins over 0 .. 1
plt.subplot(4,1,1)
plt.hist(x1,bins=20, range = (0, 1))

plt.subplot(4,1,2)
plt.hist(x2,bins=20, range = (0, 1))

plt.subplot(4,1,3)
plt.hist(x3,bins=20, range = (0, 1))

plt.subplot(4,1,4)
plt.hist(x4,bins=20, range = (0, 1))

(array([5100., 5038., 4996., 4987., 5168., 5041., 4854., 4996., 4932.,
        4902., 4987., 5057., 4950., 5042., 4965., 4997., 4946., 4976.,
        5085., 4981.]),
 array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
        0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]),
 <a list of 20 Patch objects>)

# Exercise: Monte Carlo method -- estimate pi from random numbers
import numpy.random as random
import math

# 10,000 random points with both coordinates drawn by uniform(0, 1)
random.seed(0)
x = random.uniform(0, 1, 10000)
y = random.uniform(0, 1, 10000)
plt.plot(x, y, 'o')

# Distance from the origin (Euclidean norm): (x**2 + y**2)**(1/2)

# Vectorised form of the former per-point Python loop:
# li holds one (x, y) pair per row, li_hypot the distance of each point.
li = np.column_stack((x, y))
li_hypot = np.hypot(x, y)

# Count the points whose distance from the origin is below 1
inside_count = np.count_nonzero(li_hypot < 1)
print('円の中に入る個数：', inside_count)
print('割合：', inside_count / len(li_hypot))

円の中に入る個数： 7868
割合： 0.7868

# Collect the coordinates and each point's distance in one table.
# li and li_hypot share the same row order, so the distance is assigned
# directly as a column; no merge on the index (and no temporary key column
# to delete afterwards) is needed.
df = pd.DataFrame(li, columns=['x', 'y'])
df['hypot'] = li_hypot

# Extraction
# Points with distance below 1
df_inner = df[df['hypot'] < 1]
df_inner

# Points with distance 1 or more
df_outer = df[df['hypot'] >= 1]
df_outer

plt.figure(figsize=(10,4))

# Left panel: inner points drawn as 'o'
plt.subplot(1,2,1)
plt.plot(df_inner['x'], df_inner['y'], 'o')

# Right panel: outer points drawn as 'x'
plt.subplot(1,2,2)
plt.plot(df_outer['x'], df_outer['y'], 'x')

[<matplotlib.lines.Line2D at 0x1f2fac4b748>]

# The area of a circle with radius 1 is pi.
# The plot covers only a quarter of that circle, so its area should be pi/4.
# The enclosing square has side 1, so its area is 1.


inside = li_hypot < 1  # True for the points inside the quarter circle
ratio = inside.mean()  # fraction of points inside; stays a NumPy float
print('円の中に入る個数：', np.count_nonzero(inside))
print('割合：', ratio)

# So ratio : 1 should equal pi/4 : 1,
# i.e. pi/4 = ratio  ->  pi = 4 * ratio

pai = 4 * ratio
pai

円の中に入る個数： 7868
割合： 0.7868

3.1472

	id	name	age	sex	data
0	100	aaa	5	male	435
1	101	bbb	7	male	432
2	102	ccc	9	female	24
3	103	ddd	3	female	64

	id	name	age	sex	data
0	False	False	False	False	False
1	False	False	False	False	False
2	False	False	False	True	False
3	False	False	False	True	False

	ID	sex	Money	Name
0	1	F	1000	Saito
1	2	F	2000	Horie
2	3	M	500	Kondo
3	4	M	300	Kawada
4	5	F	700	Matsubara