ddpg_pytorch

DDPGのコード例です。

念のため、PowerShell で次のコマンドを実行しておきましょう(現在のプロセスに限りスクリプトの実行を許可します)。

Set-ExecutionPolicy RemoteSigned -Scope Process -Force

うまく動かないかもしれません。

import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class OUActionNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style update).

    Every call advances an internal state ``x_prev`` by a mean-reverting drift
    towards ``mu`` plus a Gaussian perturbation scaled by ``sigma * sqrt(dt)``.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        """Store the process parameters and initialise the internal state.

        Args:
            mu: Array the process reverts to; its ``shape`` sizes each sample.
            sigma: Scale of the Gaussian perturbation.
            theta: Strength of the pull towards ``mu``.
            dt: Step size of one update.
            x0: Optional starting state; zeros shaped like ``mu`` when None.
        """
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        gaussian = np.random.normal(size=self.mu.shape)
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * gaussian
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev

    def reset(self):
        """Restore the state to ``x0``, or to zeros when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

class ReplayBuffer(object):
    """Fixed-size ring buffer of (state, action, reward, next state, mask) rows."""

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate zero-filled storage for ``max_size`` transitions.

        Args:
            max_size: Number of transitions kept before old rows are overwritten.
            input_shape: Observation size, either an int (``8``) or a
                sequence of dimensions (``[8]``).
            n_actions: Number of action components stored per transition.
        """
        self.mem_size = max_size
        self.mem_cntr = 0
        # Accept both a bare int and a shape sequence so callers may pass
        # either ``8`` or ``[8]``.
        state_shape = tuple(int(d) for d in np.atleast_1d(input_shape))
        self.state_memory = np.zeros((self.mem_size, *state_shape))
        self.new_state_memory = np.zeros((self.mem_size, *state_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition, overwriting the oldest row once full.

        ``done`` is stored inverted (``1 - done``) so the sampled value can be
        multiplied directly into the bootstrapped term of a TD target.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    # Original (misspelled) public name, kept so existing callers still work.
    sotre_transiton = store_transition

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` randomly chosen stored transitions.

        Indices are drawn with replacement from the rows written so far.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` where
            ``terminal`` holds the stored ``1 - done`` values.
        """
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, new_states, terminal
    
class CriticNetwork(nn.Module):
    """Q-network that scores a (state, action) pair with a single value."""

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='temp/ddpg'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            beta: Learning rate of the critic's Adam optimizer.
            input_dims: Observation size, as an int or a shape sequence.
            fc1_dims: Width of the first state layer.
            fc2_dims: Width of the second state layer and of the action layer.
            n_actions: Number of action components.
            name: Prefix of the checkpoint file name.
            chkpt_dir: Directory the checkpoint file lives in.
        """
        super(CriticNetwork, self).__init__()

        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')

        # Accept ``8`` as well as ``[8]`` for the observation size.
        in_features = int(np.prod(input_dims))

        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        # Bound derived from weight.size()[0], i.e. the layer's output width.
        f1 = 1.0 / float(np.sqrt(self.fc1.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2 = 1.0 / float(np.sqrt(self.fc2.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        # The bias is initialised like the weight, matching fc1 and the actor.
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)

        # Small bound so the initial Q estimates stay close to zero.
        f3 = 0.003
        self.q = nn.Linear(self.fc2_dims, 1)
        T.nn.init.uniform_(self.q.weight.data, -f3, f3)
        T.nn.init.uniform_(self.q.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return the Q-value tensor for a batch of states and actions."""
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)

        # The action joins after the second state layer and is summed in.
        action_value = F.relu(self.action_value(action))
        state_action_value = F.relu(T.add(state_value, action_value))
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint(self):
        """Write the state dict to ``checkpoint_file``, creating its directory."""
        print('... saving checkpoint ...')
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``checkpoint_file`` onto this module's device."""
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class ActorNetwork(nn.Module):
    """Deterministic policy network mapping a state to a tanh-bounded action."""

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='tmp/ddpg'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            alpha: Learning rate of the actor's Adam optimizer.
            input_dims: Observation size, as an int or a shape sequence.
            fc1_dims: Width of the first hidden layer.
            fc2_dims: Width of the second hidden layer.
            n_actions: Number of action components produced.
            name: Prefix of the checkpoint file name.
            chkpt_dir: Directory the checkpoint file lives in.
        """
        super(ActorNetwork, self).__init__()
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')

        # Accept ``8`` as well as ``[8]`` for the observation size.
        in_features = int(np.prod(input_dims))

        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        # Bound derived from weight.size()[0], i.e. the layer's output width.
        f1 = 1.0 / float(np.sqrt(self.fc1.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2 = 1.0 / float(np.sqrt(self.fc2.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        # forward() normalises the second layer as well, so it needs its own norm.
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        # Small bound so the initial actions stay close to zero.
        f3 = 0.003
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        T.nn.init.uniform_(self.mu.weight.data, -f3, f3)
        T.nn.init.uniform_(self.mu.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the action for ``state``; every component lies in [-1, 1]."""
        x = self.fc1(state)
        x = self.bn1(x)
        # Without the non-linearities the hidden stack would add no depth.
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = T.tanh(self.mu(x))

        return x

    def save_checkpoint(self):
        """Write the state dict to ``checkpoint_file``, creating its directory."""
        print('... saving checkpoint ...')
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``checkpoint_file`` onto this module's device."""
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class Agent(object):
    """DDPG agent: actor/critic pair, their target copies, replay and noise."""

    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 n_actions=2, max_size=1000000, layer1_size=4000,
                 layer2_size=300, bathc_size=64, batch_size=None):
        """Create the networks, replay buffer and exploration noise.

        Args:
            alpha: Learning rate of the actor.
            beta: Learning rate of the critic.
            input_dims: Observation size, as an int or a shape sequence.
            tau: Soft-update coefficient for the target networks.
            env: Environment handle; accepted for API compatibility, unused here.
            gamma: Discount factor.
            n_actions: Number of action components.
            max_size: Replay buffer capacity.
            layer1_size: Width of the first hidden layer.
                NOTE(review): the default of 4000 looks like a typo for 400;
                left unchanged so existing callers keep their behaviour.
            layer2_size: Width of the second hidden layer.
            bathc_size: Mini-batch size (original, misspelled keyword).
            batch_size: Correctly spelled alternative; wins when provided.
        """
        self.gamma = gamma
        self.tau = tau
        # Collapse ``[8]`` to ``8`` so the buffer and networks get a plain int.
        n_inputs = int(np.prod(input_dims))
        self.memory = ReplayBuffer(max_size, n_inputs, n_actions)
        self.batch_size = bathc_size if batch_size is None else batch_size
        # Misspelled attribute kept in sync for backward compatibility.
        self.bathc_size = self.batch_size

        self.actor = ActorNetwork(alpha, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='Actor')

        self.target_actor = ActorNetwork(alpha, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='TargetActor')

        self.critic = CriticNetwork(beta, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='Critic')

        self.target_critic = CriticNetwork(beta, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='TargetCritic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # tau=1 copies the online weights into the targets verbatim.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return the actor's action for ``observation`` plus exploration noise."""
        self.actor.eval()
        observation = T.tensor(observation, dtype=T.float).to(self.actor.device)
        # Acting needs no gradients.
        with T.no_grad():
            mu = self.actor(observation)
        noise = T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        mu_prime = mu + noise
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        # The buffer is called through its original method name so this works
        # with the buffer as originally published.
        self.memory.sotre_transiton(state, action, reward, new_state, done)

    # Original (misspelled) public name, kept so existing callers still work.
    remenber = remember

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the buffer has received at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        # Column vectors so they broadcast against the (batch, 1) critic output.
        reward = T.tensor(reward, dtype=T.float).to(device).view(-1, 1)
        done = T.tensor(done, dtype=T.float).to(device).view(-1, 1)
        new_state = T.tensor(new_state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)

        # TD target; ``done`` already holds 1 - terminal, so it zeroes the
        # bootstrapped term at episode ends. No gradient flows through it.
        with T.no_grad():
            target_actions = self.target_actor(new_state)
            critic_value_ = self.target_critic(new_state, target_actions)
            target = reward + self.gamma * critic_value_ * done

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_value = self.critic(state, action)
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor ascends the critic's value of its own actions.
        self.actor.train()
        self.actor.optimizer.zero_grad()
        mu = self.actor(state)
        actor_loss = -T.mean(self.critic(state, mu))
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend online weights into the targets: ``tau*online + (1-tau)*target``.

        Args:
            tau: Blend coefficient; ``self.tau`` when None, ``1`` for a hard copy.
        """
        if tau is None:
            tau = self.tau

        pairs = (
            (self.critic, self.target_critic),
            (self.actor, self.target_actor),
        )
        with T.no_grad():
            for online, target in pairs:
                for param, target_param in zip(online.parameters(), target.parameters()):
                    target_param.copy_(tau * param + (1 - tau) * target_param)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic.load_checkpoint()




        #https://www.youtube.com/watch?v=6Yd5WnYls_Y&t=1117s
        #https://youtu.be/6Yd5WnYls_Y?t=2912
        #48:32
"""
以上、用意したクラス
・ノイズ
・リプレイバッファ
・クリティックNN
・アクターNN
・エージェント
"""

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

import os

import torch as T

import torch.nn as nn

import torch.nn.functional as F

import torch.optim as optim

import numpy as np

class OUActionNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style update).

    Every call advances an internal state ``x_prev`` by a mean-reverting drift
    towards ``mu`` plus a Gaussian perturbation scaled by ``sigma * sqrt(dt)``.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        """Store the process parameters and initialise the internal state.

        Args:
            mu: Array the process reverts to; its ``shape`` sizes each sample.
            sigma: Scale of the Gaussian perturbation.
            theta: Strength of the pull towards ``mu``.
            dt: Step size of one update.
            x0: Optional starting state; zeros shaped like ``mu`` when None.
        """
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt, self.x0 = dt, x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        gaussian = np.random.normal(size=self.mu.shape)
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * gaussian
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev

    def reset(self):
        """Restore the state to ``x0``, or to zeros when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

class ReplayBuffer(object):
    """Fixed-size ring buffer of (state, action, reward, next state, mask) rows."""

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate zero-filled storage for ``max_size`` transitions.

        Args:
            max_size: Number of transitions kept before old rows are overwritten.
            input_shape: Observation size, either an int (``8``) or a
                sequence of dimensions (``[8]``).
            n_actions: Number of action components stored per transition.
        """
        self.mem_size = max_size
        self.mem_cntr = 0
        # Accept both a bare int and a shape sequence so callers may pass
        # either ``8`` or ``[8]``.
        state_shape = tuple(int(d) for d in np.atleast_1d(input_shape))
        self.state_memory = np.zeros((self.mem_size, *state_shape))
        self.new_state_memory = np.zeros((self.mem_size, *state_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition, overwriting the oldest row once full.

        ``done`` is stored inverted (``1 - done``) so the sampled value can be
        multiplied directly into the bootstrapped term of a TD target.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    # Original (misspelled) public name, kept so existing callers still work.
    sotre_transiton = store_transition

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` randomly chosen stored transitions.

        Indices are drawn with replacement from the rows written so far.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` where
            ``terminal`` holds the stored ``1 - done`` values.
        """
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, new_states, terminal

class CriticNetwork(nn.Module):
    """Q-network that scores a (state, action) pair with a single value."""

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='temp/ddpg'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            beta: Learning rate of the critic's Adam optimizer.
            input_dims: Observation size, as an int or a shape sequence.
            fc1_dims: Width of the first state layer.
            fc2_dims: Width of the second state layer and of the action layer.
            n_actions: Number of action components.
            name: Prefix of the checkpoint file name.
            chkpt_dir: Directory the checkpoint file lives in.
        """
        super(CriticNetwork, self).__init__()

        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')

        # Accept ``8`` as well as ``[8]`` for the observation size.
        in_features = int(np.prod(input_dims))

        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        # Bound derived from weight.size()[0], i.e. the layer's output width.
        f1 = 1.0 / float(np.sqrt(self.fc1.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2 = 1.0 / float(np.sqrt(self.fc2.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        # The bias is initialised like the weight, matching fc1 and the actor.
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)

        # Small bound so the initial Q estimates stay close to zero.
        f3 = 0.003
        self.q = nn.Linear(self.fc2_dims, 1)
        T.nn.init.uniform_(self.q.weight.data, -f3, f3)
        T.nn.init.uniform_(self.q.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return the Q-value tensor for a batch of states and actions."""
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)

        # The action joins after the second state layer and is summed in.
        action_value = F.relu(self.action_value(action))
        state_action_value = F.relu(T.add(state_value, action_value))
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint(self):
        """Write the state dict to ``checkpoint_file``, creating its directory."""
        print('... saving checkpoint ...')
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``checkpoint_file`` onto this module's device."""
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class ActorNetwork(nn.Module):
    """Deterministic policy network mapping a state to a tanh-bounded action."""

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name, chkpt_dir='tmp/ddpg'):
        """Build the layers, the Adam optimizer and move the module to its device.

        Args:
            alpha: Learning rate of the actor's Adam optimizer.
            input_dims: Observation size, as an int or a shape sequence.
            fc1_dims: Width of the first hidden layer.
            fc2_dims: Width of the second hidden layer.
            n_actions: Number of action components produced.
            name: Prefix of the checkpoint file name.
            chkpt_dir: Directory the checkpoint file lives in.
        """
        super(ActorNetwork, self).__init__()
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.checkpoint_file = os.path.join(chkpt_dir, name+'_ddpg')

        # Accept ``8`` as well as ``[8]`` for the observation size.
        in_features = int(np.prod(input_dims))

        self.fc1 = nn.Linear(in_features, self.fc1_dims)
        # Bound derived from weight.size()[0], i.e. the layer's output width.
        f1 = 1.0 / float(np.sqrt(self.fc1.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2 = 1.0 / float(np.sqrt(self.fc2.weight.data.size()[0]))
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        # forward() normalises the second layer as well, so it needs its own norm.
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        # Small bound so the initial actions stay close to zero.
        f3 = 0.003
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        T.nn.init.uniform_(self.mu.weight.data, -f3, f3)
        T.nn.init.uniform_(self.mu.bias.data, -f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the action for ``state``; every component lies in [-1, 1]."""
        x = self.fc1(state)
        x = self.bn1(x)
        # Without the non-linearities the hidden stack would add no depth.
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = T.tanh(self.mu(x))

        return x

    def save_checkpoint(self):
        """Write the state dict to ``checkpoint_file``, creating its directory."""
        print('... saving checkpoint ...')
        chkpt_dir = os.path.dirname(self.checkpoint_file)
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict from ``checkpoint_file`` onto this module's device."""
        print('... loading checkpoint ...')
        self.load_state_dict(T.load(self.checkpoint_file, map_location=self.device))

class Agent(object):
    """DDPG agent: actor/critic pair, their target copies, replay and noise."""

    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 n_actions=2, max_size=1000000, layer1_size=4000,
                 layer2_size=300, bathc_size=64, batch_size=None):
        """Create the networks, replay buffer and exploration noise.

        Args:
            alpha: Learning rate of the actor.
            beta: Learning rate of the critic.
            input_dims: Observation size, as an int or a shape sequence.
            tau: Soft-update coefficient for the target networks.
            env: Environment handle; accepted for API compatibility, unused here.
            gamma: Discount factor.
            n_actions: Number of action components.
            max_size: Replay buffer capacity.
            layer1_size: Width of the first hidden layer.
                NOTE(review): the default of 4000 looks like a typo for 400;
                left unchanged so existing callers keep their behaviour.
            layer2_size: Width of the second hidden layer.
            bathc_size: Mini-batch size (original, misspelled keyword).
            batch_size: Correctly spelled alternative; wins when provided.
        """
        self.gamma = gamma
        self.tau = tau
        # Collapse ``[8]`` to ``8`` so the buffer and networks get a plain int.
        n_inputs = int(np.prod(input_dims))
        self.memory = ReplayBuffer(max_size, n_inputs, n_actions)
        self.batch_size = bathc_size if batch_size is None else batch_size
        # Misspelled attribute kept in sync for backward compatibility.
        self.bathc_size = self.batch_size

        self.actor = ActorNetwork(alpha, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='Actor')

        self.target_actor = ActorNetwork(alpha, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='TargetActor')

        self.critic = CriticNetwork(beta, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='Critic')

        self.target_critic = CriticNetwork(beta, n_inputs, layer1_size, layer2_size, n_actions=n_actions, name='TargetCritic')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # tau=1 copies the online weights into the targets verbatim.
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        """Return the actor's action for ``observation`` plus exploration noise."""
        self.actor.eval()
        observation = T.tensor(observation, dtype=T.float).to(self.actor.device)
        # Acting needs no gradients.
        with T.no_grad():
            mu = self.actor(observation)
        noise = T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        mu_prime = mu + noise
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        # The buffer is called through its original method name so this works
        # with the buffer as originally published.
        self.memory.sotre_transiton(state, action, reward, new_state, done)

    # Original (misspelled) public name, kept so existing callers still work.
    remenber = remember

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing until the buffer has received at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        # Column vectors so they broadcast against the (batch, 1) critic output.
        reward = T.tensor(reward, dtype=T.float).to(device).view(-1, 1)
        done = T.tensor(done, dtype=T.float).to(device).view(-1, 1)
        new_state = T.tensor(new_state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)

        # TD target; ``done`` already holds 1 - terminal, so it zeroes the
        # bootstrapped term at episode ends. No gradient flows through it.
        with T.no_grad():
            target_actions = self.target_actor(new_state)
            critic_value_ = self.target_critic(new_state, target_actions)
            target = reward + self.gamma * critic_value_ * done

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_value = self.critic(state, action)
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor ascends the critic's value of its own actions.
        self.actor.train()
        self.actor.optimizer.zero_grad()
        mu = self.actor(state)
        actor_loss = -T.mean(self.critic(state, mu))
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Blend online weights into the targets: ``tau*online + (1-tau)*target``.

        Args:
            tau: Blend coefficient; ``self.tau`` when None, ``1`` for a hard copy.
        """
        if tau is None:
            tau = self.tau

        pairs = (
            (self.critic, self.target_critic),
            (self.actor, self.target_actor),
        )
        with T.no_grad():
            for online, target in pairs:
                for param, target_param in zip(online.parameters(), target.parameters()):
                    target_param.copy_(tau * param + (1 - tau) * target_param)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic.load_checkpoint()

#https://www.youtube.com/watch?v=6Yd5WnYls_Y&t=1117s

#https://youtu.be/6Yd5WnYls_Y?t=2912

#48:32

"""

以上、用意したクラス

・ノイズ

・リプレイバッファ

・クリティックNN

・アクターNN

・エージェント

"""

そしてメインスクリプトです。

from ddpg_torch import Agent
import gym
import numpy as np
#from utils import plotLearning

# Training script: runs 1000 episodes of LunarLanderContinuous with the DDPG
# agent, printing the episode score and a 100-episode running mean.
env = gym.make('LunarLanderContinuous-v2')

# Sizes are read from the environment and passed as plain ints.
agent = Agent(alpha=0.000025, beta=0.00025,
              input_dims=env.observation_space.shape[0], tau=0.001, env=env,
              bathc_size=64, layer1_size=400, layer2_size=300,
              n_actions=env.action_space.shape[0])

np.random.seed(0)

score_history = []
for i in range(1000):
    done = False
    score = 0
    obs = env.reset()
    # Newer Gym releases return (observation, info) from reset().
    if isinstance(obs, tuple):
        obs = obs[0]
    while not done:
        act = agent.choose_action(obs)
        step_result = env.step(act)
        # Newer Gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done.
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result
        agent.remenber(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state

    score_history.append(score)
    print('epsisode', i, 'score %.2f' % score, '100 game sverage %.2f' % np.mean(score_history[-100:]))

    # Checkpoint every 25 episodes.
    if i % 25 == 0:
        agent.save_models()

filename = 'lunar-lander.png'
#plotLearning(score_history, filename, window=100)

from ddpg_torch import Agent

import gym

import numpy as np

#from utils import plotLearning

# Training script: runs 1000 episodes of LunarLanderContinuous with the DDPG
# agent, printing the episode score and a 100-episode running mean.
env = gym.make('LunarLanderContinuous-v2')

# Sizes are read from the environment and passed as plain ints.
agent = Agent(alpha=0.000025, beta=0.00025,
              input_dims=env.observation_space.shape[0], tau=0.001, env=env,
              bathc_size=64, layer1_size=400, layer2_size=300,
              n_actions=env.action_space.shape[0])

np.random.seed(0)

score_history = []
for i in range(1000):
    done = False
    score = 0
    obs = env.reset()
    # Newer Gym releases return (observation, info) from reset().
    if isinstance(obs, tuple):
        obs = obs[0]
    while not done:
        act = agent.choose_action(obs)
        step_result = env.step(act)
        # Newer Gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done.
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result
        agent.remenber(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state

    score_history.append(score)
    print('epsisode', i, 'score %.2f' % score, '100 game sverage %.2f' % np.mean(score_history[-100:]))

    # Checkpoint every 25 episodes.
    if i % 25 == 0:
        agent.save_models()

filename = 'lunar-lander.png'
#plotLearning(score_history, filename, window=100)

この記事を書いた人
最新の記事

Keita N

最新記事 by Keita N (全て見る)

2025年8月
月	火	水	木	金	土	日
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31