強化学習の基礎、多腕バンディット問題

面白そうなのでやってみました。

ちなみに、本日のアイキャッチは画像生成AIであるStableDiffusionで「multi arms bandit problem by stable diffusion」というワードから生成した一枚です。

モジュールインポート　クラス作成

# 強化学習　バンディット問題
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

class Bandit():
    def __init__(self, arms=10):
        self.arms = arms
        self.rates = np.random.rand(self.arms)# 勝率
    
    def play(self, arm): # プレイして報酬を得る
        if self.rates[arm] > np.random.rand():
            return 1 # 勝利したときの報酬
        else:
            return 0 # 敗北したときの報酬
        
class NonStatBandit():
    def __init__(self, arms=10):
        self.arms = arms
        self.rates = np.random.rand(self.arms)# 勝率
    
    def play(self, arm): # プレイして報酬を得る
        self.rates += 0.01 * np.random.randn(self.arms) # 報酬にノイズを導入する
        
        if self.rates[arm] > np.random.rand():
            return 1 # 勝利したときの報酬
        else:
            return 0 # 敗北したときの報酬
        
        
class AlphaAgent:
    def __init__(self, epsilon, alpha, actions=10):
        self.epsilon = epsilon
        self.Qs = np.zeros(actions)
        self.alpha = alpha
    
    def get_action(self): 
        # ランダム　または　最大価値　の行動を選択
        if np.random.rand() < self.epsilon: # 貪欲を選択した場合
            return np.random.randint(0, len(self.Qs)) # ランダムに行動を選択する
        else:
            return np.argmax(self.Qs)
        
    def update(self, action, reward):
        # alphaで価値を更新する
        self.Qs[action] += self.alpha * (reward - self.Qs[action])

# 強化学習　バンディット問題

%matplotlib inline

import numpy as np

import matplotlib.pyplot as plt

class Bandit():

def __init__(self, arms=10):

self.arms = arms

self.rates = np.random.rand(self.arms)# 勝率

def play(self, arm): # プレイして報酬を得る

if self.rates[arm] > np.random.rand():

return 1 # 勝利したときの報酬

else:

return 0 # 敗北したときの報酬

class NonStatBandit():

def __init__(self, arms=10):

self.arms = arms

self.rates = np.random.rand(self.arms)# 勝率

def play(self, arm): # プレイして報酬を得る

self.rates += 0.01 * np.random.randn(self.arms) # 報酬にノイズを導入する

if self.rates[arm] > np.random.rand():

return 1 # 勝利したときの報酬

else:

return 0 # 敗北したときの報酬

class AlphaAgent:

def __init__(self, epsilon, alpha, actions=10):

self.epsilon = epsilon

self.Qs = np.zeros(actions)

self.alpha = alpha

def get_action(self):

# ランダム　または　最大価値　の行動を選択

if np.random.rand() < self.epsilon: # 貪欲を選択した場合

return np.random.randint(0, len(self.Qs)) # ランダムに行動を選択する

else:

return np.argmax(self.Qs)

def update(self, action, reward):

# alphaで価値を更新する

self.Qs[action] += self.alpha * (reward - self.Qs[action])

バンディットとエージェントで学習　定常問題

# 定常バンディット問題を解いてみる
np.random.seed(0)
bandit = Bandit(arms=10)

agent = AlphaAgent(epsilon=0.1, alpha=0.8, actions=10)
steps = 1_000

total_reward = 0
total_rewards = []
rates =[]

for step in range(steps):
    action = agent.get_action() #　行動を選択
    reward = bandit.play(action)# 行動し報酬を得る
    agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。
    
    total_reward += reward
    total_rewards.append(total_reward) 
    rates.append(total_reward / (step + 1))
    
plt.bar(np.arange(0,10,1),bandit.rates, color='red', width=0.3, label='rates', align="center")
plt.bar(np.arange(0,10,1) + 0.3, agent.Qs, color='blue', width=0.3, label='Qs', align="center")
plt.xticks(np.arange(0,10,1))
plt.xticks(np.arange(0,10,1))
plt.legend()
plt.show()

print('total_reward : ', total_reward)
plt.plot(total_rewards, label='total_reward')
plt.legend()
plt.show()

print('final_rate : ', rates[-1])
plt.plot(rates, label='rates')
plt.legend()
plt.show()

# 定常バンディット問題を解いてみる

np.random.seed(0)

bandit = Bandit(arms=10)

agent = AlphaAgent(epsilon=0.1, alpha=0.8, actions=10)

steps = 1_000

total_reward = 0

total_rewards = []

rates =[]

for step in range(steps):

action = agent.get_action() #　行動を選択

reward = bandit.play(action)# 行動し報酬を得る

agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

total_reward += reward

total_rewards.append(total_reward)

rates.append(total_reward / (step + 1))

plt.bar(np.arange(0,10,1),bandit.rates, color='red', width=0.3, label='rates', align="center")

plt.bar(np.arange(0,10,1) + 0.3, agent.Qs, color='blue', width=0.3, label='Qs', align="center")

plt.xticks(np.arange(0,10,1))

plt.legend()

plt.show()

print('total_reward : ', total_reward)

plt.plot(total_rewards, label='total_reward')

plt.legend()

plt.show()

print('final_rate : ', rates[-1])

plt.plot(rates, label='rates')

plt.legend()

plt.show()

結果

モデルを２００回繰り返した平均結果

# 定常バンディット問題をruns = 200回ランダムに解いて平均を見る
#np.random.seed(0)
runs = 200    # モデル数
steps = 1_000 # 学習の更新ステップ数

actions = 10  # 選択可能な「行動」の数
epsilon = 0.1 # 貪欲率
alpha = 0.8   # 学習率

# 勝率の結果を保存するテーブルを生成する
all_rates = np.zeros((runs, steps)) # 試行NO. x １試行の中での強化ステップNO. ※引数はタプル型


for run in range(runs): 
    # バンディットとエージェントのインスタンスを生成する
    bandit = Bandit(arms=actions)
    agent = AlphaAgent(epsilon=epsilon, alpha=alpha, actions=actions)
    
    # 報酬の初期化
    total_reward = 0
    total_rewards = []
    rates = []
    
    for step in range(steps):
        action = agent.get_action() #　行動を選択
        reward = bandit.play(action)# 行動し報酬を得る
        agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

        total_reward += reward
        total_rewards.append(total_reward) 
        rates.append(total_reward / (step + 1))
    
    all_rates[run] = rates # 勝率の結果を保存する

# 定常バンディット問題をruns = 200回ランダムに解いて平均を見る

#np.random.seed(0)

runs = 200 # モデル数

steps = 1_000 # 学習の更新ステップ数

actions = 10 # 選択可能な「行動」の数

epsilon = 0.1 # 貪欲率

alpha = 0.8 # 学習率

# 勝率の結果を保存するテーブルを生成する

all_rates = np.zeros((runs, steps)) # 試行NO. x １試行の中での強化ステップNO. ※引数はタプル型

for run in range(runs):

# バンディットとエージェントのインスタンスを生成する

bandit = Bandit(arms=actions)

agent = AlphaAgent(epsilon=epsilon, alpha=alpha, actions=actions)

# 報酬の初期化

total_reward = 0

total_rewards = []

rates = []

for step in range(steps):

action = agent.get_action() #　行動を選択

reward = bandit.play(action)# 行動し報酬を得る

agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

total_reward += reward

total_rewards.append(total_reward)

rates.append(total_reward / (step + 1))

all_rates[run] = rates # 勝率の結果を保存する

# 200モデル分
for run in range(runs):
    plt.plot(all_rates[run])
plt.show()

# 200モデル分の平均
average_rates = all_rates.mean(axis=0)
plt.plot(average_rates)
plt.show()

# 200モデル分

for run in range(runs):

plt.plot(all_rates[run])

plt.show()

# 200モデル分の平均

average_rates = all_rates.mean(axis=0)

plt.plot(average_rates)

plt.show()

バンディットとエージェントで学習　非定常問題

# 非定常バンディット問題を解いてみる
np.random.seed(0)
bandit = NonStatBandit(arms=10)

agent = AlphaAgent(epsilon=0.1, alpha=0.8, actions=10)
steps = 1_000

total_reward = 0
total_rewards = []
rates =[]

for step in range(steps):
    action = agent.get_action() #　行動を選択
    reward = bandit.play(action)# 行動し報酬を得る
    agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。
    
    total_reward += reward
    total_rewards.append(total_reward) 
    rates.append(total_reward / (step + 1))
    
plt.bar(np.arange(0,10,1),bandit.rates, color='red', width=0.3, label='rates', align="center")
plt.bar(np.arange(0,10,1) + 0.3, agent.Qs, color='blue', width=0.3, label='Qs', align="center")
plt.xticks(np.arange(0,10,1))
plt.xticks(np.arange(0,10,1))
plt.legend()
plt.show()

print('total_reward : ', total_reward)
plt.plot(total_rewards, label='total_reward')
plt.legend()
plt.show()

print('final_rate : ', rates[-1])
plt.plot(rates, label='rates')
plt.legend()
plt.show()

# 非定常バンディット問題を解いてみる

np.random.seed(0)

bandit = NonStatBandit(arms=10)

agent = AlphaAgent(epsilon=0.1, alpha=0.8, actions=10)

steps = 1_000

total_reward = 0

total_rewards = []

rates =[]

for step in range(steps):

action = agent.get_action() #　行動を選択

reward = bandit.play(action)# 行動し報酬を得る

agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

total_reward += reward

total_rewards.append(total_reward)

rates.append(total_reward / (step + 1))

plt.bar(np.arange(0,10,1),bandit.rates, color='red', width=0.3, label='rates', align="center")

plt.bar(np.arange(0,10,1) + 0.3, agent.Qs, color='blue', width=0.3, label='Qs', align="center")

plt.xticks(np.arange(0,10,1))

plt.legend()

plt.show()

print('total_reward : ', total_reward)

plt.plot(total_rewards, label='total_reward')

plt.legend()

plt.show()

print('final_rate : ', rates[-1])

plt.plot(rates, label='rates')

plt.legend()

plt.show()

結果

モデルを２００回繰り返した平均結果

# 非定常バンディット問題をruns = 200回ランダムに解いて平均を見る
#np.random.seed(0)
runs = 200    # モデル数
steps = 1_000 # 学習の更新ステップ数

actions = 10  # 選択可能な「行動」の数
epsilon = 0.1 # 貪欲率
alpha = 0.8   # 学習率

# 勝率の結果を保存するテーブルを生成する
all_rates = np.zeros((runs, steps)) # 試行NO. x １試行の中での強化ステップNO. ※引数はタプル型

for run in range(runs): 
    # バンディットとエージェントのインスタンスを生成する
    bandit = NonStatBandit(arms=actions)
    agent = AlphaAgent(epsilon=epsilon, alpha=alpha, actions=actions)
    
    # 報酬の初期化
    total_reward = 0
    total_rewards = []
    rates = []
    
    for step in range(steps):
        action = agent.get_action() #　行動を選択
        reward = bandit.play(action)# 行動し報酬を得る
        agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

        total_reward += reward
        total_rewards.append(total_reward) 
        rates.append(total_reward / (step + 1))
    
    all_rates[run] = rates # 勝率の結果を保存する

# 非定常バンディット問題をruns = 200回ランダムに解いて平均を見る

#np.random.seed(0)

runs = 200 # モデル数

steps = 1_000 # 学習の更新ステップ数

actions = 10 # 選択可能な「行動」の数

epsilon = 0.1 # 貪欲率

alpha = 0.8 # 学習率

# 勝率の結果を保存するテーブルを生成する

all_rates = np.zeros((runs, steps)) # 試行NO. x １試行の中での強化ステップNO. ※引数はタプル型

for run in range(runs):

# バンディットとエージェントのインスタンスを生成する

bandit = NonStatBandit(arms=actions)

agent = AlphaAgent(epsilon=epsilon, alpha=alpha, actions=actions)

# 報酬の初期化

total_reward = 0

total_rewards = []

rates = []

for step in range(steps):

action = agent.get_action() #　行動を選択

reward = bandit.play(action)# 行動し報酬を得る

agent.update(action=action, reward=reward) # 選択した行動と得た報酬から行動価値を更新する。

total_reward += reward

total_rewards.append(total_reward)

rates.append(total_reward / (step + 1))

all_rates[run] = rates # 勝率の結果を保存する

# 200モデル分
for run in range(runs):
    plt.plot(all_rates[run])
plt.show()

# 200モデル分の平均
average_rates = all_rates.mean(axis=0)
plt.plot(average_rates)
plt.show()

# 200モデル分

for run in range(runs):

plt.plot(all_rates[run])

plt.show()

# 200モデル分の平均

average_rates = all_rates.mean(axis=0)

plt.plot(average_rates)

plt.show()

この記事を書いた人
最新の記事

月	火	水	木	金	土	日
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

強化学習：多腕バンディット問題　定常・非定常

強化学習の基礎、多腕バンディット問題

Keita N

最新記事 by Keita N (全て見る)