# maddpg_pytorch
#
# MADDPG implementation in PyTorch. Kept as a memo; note that it does not run correctly yet.

import os
import numpy as np
import torch as T
import torch.nn.functional as F
import torch.optim as optim
from make_env import make_env

""" pip install
pip uninstall torch
pip install torch==1.4.0

pip uninstall numpy
pip install numpy==1.14.5

"""

"""
#env = make_env('simple')
env = make_env('simple_adversary')
observation = env.reset()


print(observation)
print(observation[0])
print(env.action_space)
print(env.action_space[0])
"""

# リプレイバッファのクラスを作成する
class MultiAgentReplayBuffer:
    """Fixed-size ring buffer of joint transitions for several agents.

    Every slot holds the joint (critic) state and next state, one reward and
    one done flag per agent, and each agent's own observation, next
    observation and action vector.
    """

    def __init__(self, max_size, critic_dims, actor_dims,
                 n_actions, n_agents, batch_size):
        """Allocate zero-filled storage for ``max_size`` transitions.

        Args:
            max_size: number of slots in the ring buffer.
            critic_dims: length of the joint state vector.
            actor_dims: per-agent observation lengths, indexed by agent.
            n_actions: length of each agent's action vector.
            n_agents: number of agents.
            batch_size: number of transitions returned by sample_buffer().
        """
        self.mem_cntr = 0  # total number of transitions written so far

        self.mem_size = max_size
        self.critic_dims = critic_dims
        self.actor_dims = actor_dims
        self.n_actions = n_actions
        self.n_agents = n_agents
        self.batch_size = batch_size

        # Joint (critic-side) storage.
        self.state_memory = np.zeros((self.mem_size, critic_dims))
        self.new_state_memory = np.zeros((self.mem_size, critic_dims))
        self.reward_memory = np.zeros((self.mem_size, n_agents))
        # Done flags are stored as booleans so they can be used as a mask.
        self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool)

        # Per-agent (actor-side) storage.
        self.init_actor_memory()

    def init_actor_memory(self):
        """(Re)create the zero-filled per-agent observation/action arrays."""
        self.actor_state_memory = []
        self.actor_new_state_memory = []
        self.actor_action_memory = []

        for i in range(self.n_agents):
            self.actor_state_memory.append(
                np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_new_state_memory.append(
                np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_action_memory.append(
                np.zeros((self.mem_size, self.n_actions)))

    def store_transition(self, raw_obs, state, action, reward,
                         raw_obs_, state_, done):
        """Write one joint transition, overwriting the oldest slot when full.

        Args:
            raw_obs: per-agent observations before the step.
            state: joint state vector before the step.
            action: per-agent action vectors.
            reward: per-agent rewards.
            raw_obs_: per-agent observations after the step.
            state_: joint state vector after the step.
            done: per-agent done flags.
        """
        # The actor arrays are NOT reset on wrap-around: clearing them while
        # the state arrays keep their contents would make sample_buffer()
        # return real states paired with all-zero observations and actions.
        index = self.mem_cntr % self.mem_size

        for agent_idx in range(self.n_agents):
            self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx]
            self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx]
            self.actor_action_memory[agent_idx][index] = action[agent_idx]

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        # Rewards must be stored too, otherwise every sampled reward is 0.
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(actor_states, states, actions, rewards,
            actor_new_states, states_, terminal)`` where the ``actor_*`` and
            ``actions`` entries are lists with one array per agent.
        """
        max_mem = min(self.mem_cntr, self.mem_size)

        # replace=False: a transition appears at most once per batch.
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        states = self.state_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        actor_states = [mem[batch] for mem in self.actor_state_memory]
        actor_new_states = [mem[batch] for mem in self.actor_new_state_memory]
        actions = [mem[batch] for mem in self.actor_action_memory]

        return actor_states, states, actions, rewards, \
            actor_new_states, states_, terminal

    def ready(self):
        """Return True once at least ``batch_size`` transitions were stored."""
        return self.mem_cntr >= self.batch_size

# クリティックのニューラルネットワークを作成する
class CriticNetwork(T.nn.Module):
    """Centralised critic: joint state plus all agents' actions -> one Q-value."""

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims,
                 n_agents, n_actions, chkpt_dir, name):
        """Build the network, its Adam optimizer and the checkpoint path.

        Args:
            beta: learning rate of the Adam optimizer.
            input_dims: length of the joint state vector.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_agents: number of agents whose actions are fed to the critic.
            n_actions: length of each agent's action vector.
            chkpt_dir: directory in which the checkpoint is written.
            name: checkpoint file name inside ``chkpt_dir``.
        """
        super().__init__()

        self.chkpt_dir = chkpt_dir
        self.chkpt_file = os.path.join(chkpt_dir, name)
        # Make sure the checkpoint directory exists. No empty placeholder
        # file is written: a zero-byte file cannot be read back by T.load().
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)

        # The input is the joint state concatenated with every agent's action.
        self.fc1 = T.nn.Linear(input_dims + n_agents * n_actions, fc1_dims)
        self.fc2 = T.nn.Linear(fc1_dims, fc2_dims)
        self.q = T.nn.Linear(fc2_dims, 1)  # single scalar output

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state, action):
        """Return Q(state, action) with one output column per batch row.

        ``state`` and ``action`` are concatenated along dim 1, so both must be
        2-D tensors with the same number of rows.
        """
        x = T.cat((state, action), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def save_checkpoint(self):
        """Write the parameters (state_dict) to ``self.chkpt_file``."""
        directory = os.path.dirname(self.chkpt_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the parameters from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.load_state_dict(T.load(self.chkpt_file,
                                    map_location=self.device))

# アクターのニューラルネットワークを作成する
class ActorNetwork(T.nn.Module):
    """Per-agent policy network: own observation -> softmax over actions."""

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions, chkpt_dir, name):
        """Build the network, its Adam optimizer and the checkpoint path.

        Args:
            alpha: learning rate of the Adam optimizer.
            input_dims: length of this agent's observation vector.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_actions: number of outputs (one per action component).
            chkpt_dir: directory in which the checkpoint is written.
            name: checkpoint file name inside ``chkpt_dir``.
        """
        super().__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)
        # Make sure the checkpoint directory exists. No empty placeholder
        # file is written: a zero-byte file cannot be read back by T.load().
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)

        self.fc1 = T.nn.Linear(input_dims, fc1_dims)
        self.fc2 = T.nn.Linear(fc1_dims, fc2_dims)
        self.pi = T.nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the policy output for a batch of observations.

        The softmax is taken over dim 1, so ``state`` must be 2-D
        (one observation per row); every output row sums to 1.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi

    def save_checkpoint(self):
        """Write the parameters (state_dict) to ``self.chkpt_file``."""
        directory = os.path.dirname(self.chkpt_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the parameters from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.load_state_dict(T.load(self.chkpt_file,
                                    map_location=self.device))

class Agent:
    """One MADDPG agent: actor, critic and a target copy of each."""

    def __init__(self, agent_idx,
                 actor_dims, critic_dims,
                 n_agents, n_actions,
                 fc1=64, fc2=64,
                 alpha=0.01, beta=0.01,
                 gamma=0.95, tau=0.01,
                 chkpt_dir='tmp/maddpg/'):
        """Create the four networks and hard-copy online weights to targets.

        Args:
            agent_idx: index used to build the checkpoint file names.
            actor_dims: length of this agent's observation vector.
            critic_dims: length of the joint state vector.
            n_agents: number of agents in the environment.
            n_actions: length of each agent's action vector.
            fc1: width of the first hidden layer of every network.
            fc2: width of the second hidden layer of every network.
            alpha: actor learning rate.
            beta: critic learning rate.
            gamma: discount factor.
            tau: soft-update coefficient for the target networks.
            chkpt_dir: directory for the checkpoint files.
        """
        self.gamma = gamma
        self.tau = tau
        self.n_agents = n_agents
        self.n_actions = n_actions
        self.agent_name = 'agent_%s' % agent_idx
        self.chkpt_dir = chkpt_dir

        # Online networks.
        self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                  chkpt_dir=self.chkpt_dir, name=self.agent_name+'_actor')
        self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                    chkpt_dir=self.chkpt_dir, name=self.agent_name+'_critic')

        # Target networks.
        self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                         chkpt_dir=self.chkpt_dir, name=self.agent_name+'_target_actor')
        self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                           chkpt_dir=self.chkpt_dir, name=self.agent_name+'_target_critic')

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    @staticmethod
    def _soft_update(target_net, online_net, tau):
        """Set target <- tau * online + (1 - tau) * target, parameter by name."""
        target_params = dict(target_net.named_parameters())
        # no_grad: this is a plain weight copy and must not be recorded by
        # autograd (the parameters themselves require grad).
        with T.no_grad():
            for name, online_param in online_net.named_parameters():
                target_param = target_params[name]
                target_param.copy_(tau * online_param
                                   + (1 - tau) * target_param)

    def update_network_parameters(self, tau=None):
        """Soft-update both target networks towards the online networks.

        Args:
            tau: blend factor; ``None`` means ``self.tau``. ``tau=1`` copies
                the online weights verbatim.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.target_actor, self.actor, tau)
        self._soft_update(self.target_critic, self.critic, tau)

    def choose_action(self, observation):
        """Return the actor output for one observation plus uniform noise.

        Args:
            observation: this agent's observation (1-D array-like).

        Returns:
            1-D numpy array of length ``n_actions``: the actor output with
            noise drawn uniformly from [0, 1) added to every component.
        """
        device = self.actor.device
        # Wrap in a list to add the batch dimension the actor expects.
        state = T.tensor(np.array([observation]), dtype=T.float).to(device)
        with T.no_grad():  # acting needs no gradient tracking
            actions = self.actor.forward(state)
            # Create the noise on the actor's device; T.rand defaults to CPU
            # and would not add to a CUDA tensor.
            noise = T.rand(self.n_actions, device=device)
            action = actions + noise

        return action.cpu().numpy()[0]

    def save_models(self):
        """Save all four networks to their checkpoint files."""
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Load all four networks from their checkpoint files."""
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

# MADDPクラスを作成する
class MADDPG:
    """Owns one Agent per environment agent and trains them jointly."""

    def __init__(self,
                 actor_dims, critic_dims, n_agents, n_actions,
                 fc1=64, fc2=64,
                 alpha=0.01, beta=0.01,
                 gamma=0.99, tau=0.01,
                 scenario='simple_adversary',
                 chkpt_dir='tmp/maddpg/'):
        """Create ``n_agents`` agents that share the given hyper-parameters.

        Args:
            actor_dims: per-agent observation lengths, indexed by agent.
            critic_dims: length of the joint state vector.
            n_agents: number of agents.
            n_actions: length of each agent's action vector.
            fc1: width of the first hidden layer of every network.
            fc2: width of the second hidden layer of every network.
            alpha: actor learning rate.
            beta: critic learning rate.
            gamma: discount factor.
            tau: soft-update coefficient for the target networks.
            scenario: scenario name, used as checkpoint sub-directory.
            chkpt_dir: root directory for checkpoints.
        """
        self.n_agents = n_agents
        self.n_actions = n_actions
        chkpt_dir = os.path.join(chkpt_dir, scenario)
        # The networks open files under this directory, so it must exist.
        os.makedirs(chkpt_dir, exist_ok=True)

        # Forward the caller's hyper-parameters instead of hard-coded values,
        # so that e.g. a non-default gamma or learning rate takes effect.
        self.agents = [
            Agent(agent_idx,
                  actor_dims[agent_idx], critic_dims,
                  n_agents, n_actions,
                  fc1=fc1, fc2=fc2,
                  alpha=alpha, beta=beta,
                  gamma=gamma, tau=tau,
                  chkpt_dir=chkpt_dir)
            for agent_idx in range(self.n_agents)
        ]

    def save_checkpoint(self):
        """Save the networks of every agent."""
        print('==== saving checkpoint ====')
        for agent in self.agents:
            agent.save_models()

    def load_checkpoint(self):
        """Load the networks of every agent."""
        print('==== loading checkpoint ====')
        for agent in self.agents:
            agent.load_models()

    def choose_action(self, raw_obs):
        """Return one action per agent, each from that agent's observation.

        Args:
            raw_obs: per-agent observations, indexed by agent.
        """
        return [agent.choose_action(raw_obs[agent_idx])
                for agent_idx, agent in enumerate(self.agents)]

    def learn(self, memory):
        """Run one critic and one actor update for every agent.

        Does nothing until ``memory.ready()`` is true.

        Args:
            memory: replay buffer providing ``ready()`` and
                ``sample_buffer()`` as MultiAgentReplayBuffer does.
        """
        if not memory.ready():
            return

        actor_states, states, actions, rewards, \
            actor_new_states, states_, dones = memory.sample_buffer()

        # All networks are assumed to live on the same device as agent 0.
        device = self.agents[0].actor.device

        states = T.tensor(states, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        dones = T.tensor(dones, dtype=T.bool).to(device)
        # Per-agent tensors are kept as lists because observation lengths
        # may differ between agents.
        agent_actions = [T.tensor(a, dtype=T.float).to(device)
                         for a in actions]
        agent_obs = [T.tensor(o, dtype=T.float).to(device)
                     for o in actor_states]
        agent_new_obs = [T.tensor(o, dtype=T.float).to(device)
                         for o in actor_new_states]

        # Next joint action from the target actors. Computed once, without a
        # graph: targets are constants for the critic loss, and this keeps
        # later in-place target updates from invalidating anything.
        with T.no_grad():
            new_actions = T.cat(
                [agent.target_actor.forward(agent_new_obs[idx])
                 for idx, agent in enumerate(self.agents)], dim=1)

        # Joint action that was actually taken (from the replay buffer).
        old_actions = T.cat(agent_actions, dim=1)

        for agent_idx, agent in enumerate(self.agents):
            # ---- critic update ----
            with T.no_grad():
                critic_value_ = agent.target_critic(
                    states_, new_actions).flatten()
                # No bootstrapping from states that were terminal for THIS
                # agent (not agent 0's flag for everybody).
                critic_value_[dones[:, agent_idx]] = 0.0
                # target = r + gamma * Q'(s', a')
                target = rewards[:, agent_idx] + agent.gamma * critic_value_

            critic_value = agent.critic(states, old_actions).flatten()
            # mse_loss(input, target): prediction first, constant target second.
            critic_loss = F.mse_loss(critic_value, target)

            agent.critic.optimizer.zero_grad()
            critic_loss.backward()
            agent.critic.optimizer.step()

            # ---- actor update ----
            # Only this agent's action comes from its current policy; the
            # other agents' actions are the sampled ones. A fresh graph per
            # agent means no graph is reused after an optimizer has changed
            # weights in place, which is what made backward() fail before.
            joint_actions = list(agent_actions)
            joint_actions[agent_idx] = agent.actor.forward(
                agent_obs[agent_idx])
            actor_loss = -agent.critic.forward(
                states, T.cat(joint_actions, dim=1)).mean()

            agent.actor.optimizer.zero_grad()
            actor_loss.backward()
            agent.actor.optimizer.step()

            # Soft-update this agent's target networks.
            agent.update_network_parameters()

def obsavation_list_to_state_vector(observation):
    """Join the per-agent observations end to end into one 1-D vector.

    An empty float array is used as the first piece, so an empty
    ``observation`` yields an empty array.
    """
    pieces = [np.array([])]
    pieces.extend(observation)
    return np.concatenate(pieces)
    
# ここからがメインスクリプト
if __name__ == '__main__':

    # Uncomment to let autograd report the operation behind a gradient error.
    #T.autograd.set_detect_anomaly(True)

    # Scenario to train on.
    #scenario = 'simple'
    scenario = 'simple_adversary'

    env = make_env(scenario)
    n_agents = env.n
    print('n_agents : ', n_agents)

    # One observation length per agent (the author noted [8, 10, 10] for
    # simple_adversary).
    actor_dims = [env.observation_space[i].shape[0] for i in range(n_agents)]
    print(f'actor_dims : {actor_dims}')

    # The critic sees every agent's observation, so its state input is the
    # sum of all observation lengths; each actor only sees its own entry of
    # actor_dims.
    critic_dims = sum(actor_dims)

    # Number of actions, taken from agent 0's action space.
    n_actions = env.action_space[0].n

    chkpt_root = 'tmp/maddpg/'
    # MADDPG stores checkpoints under <chkpt_root><scenario>; create that
    # directory first because the networks open files inside it.
    os.makedirs(os.path.join(chkpt_root, scenario), exist_ok=True)

    maddpg_agents = MADDPG(actor_dims, critic_dims,
                           n_agents, n_actions,
                           fc1=64, fc2=64,
                           alpha=0.01, beta=0.01,
                           gamma=0.99, tau=0.01,
                           scenario=scenario,
                           chkpt_dir=chkpt_root)

    memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                                    n_actions, n_agents, batch_size=1024)

    PRINT_INTERVAL = 500  # print the running average every N episodes
    N_GAMES = 30000       # number of episodes
    MAX_STEPS = 25        # maximum environment steps per episode

    total_steps = 0
    # Start below any reachable score so the first average is always saved;
    # a start of 0 never triggers a save when scores are negative.
    best_score = float('-inf')
    # Kept across episodes so avg_score really is a running average.
    score_history = []

    # False = train, True = evaluate a saved model.
    evaluate = False

    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()
        score = 0
        done = [False] * n_agents
        episode_step = 0

        # The episode ends as soon as ANY agent reports done.
        while not any(done):
            if evaluate:
                env.render()

            actions = maddpg_agents.choose_action(obs)

            obs_, reward, done, info = env.step(actions)

            # Joint state vectors for the critic.
            state = obsavation_list_to_state_vector(obs)
            state_ = obsavation_list_to_state_vector(obs_)

            # episode_step counts completed steps, so this step is number
            # episode_step + 1; force termination on step MAX_STEPS.
            if episode_step + 1 >= MAX_STEPS:
                done = [True] * n_agents

            memory.store_transition(obs, state, actions, reward, obs_, state_, done)

            # Train once every 100 environment steps.
            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_

            # The episode score is the sum of all agents' rewards.
            score += sum(reward)

            total_steps += 1
            episode_step += 1

        score_history.append(score)

        # Average over the most recent 100 episodes.
        avg_score = np.mean(score_history[-100:])

        if not evaluate:
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score

        if i % PRINT_INTERVAL == 0 and i > 0:
            print('(episode)', i, 'average_score {:.1f}'.format(avg_score))


print('Script is done')
#1:47:03
# #https://www.youtube.com/watch?v=tZTQ6S9PfkE

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

import os

import numpy as np

import torch as T

import torch.nn.functional as F

import torch.optim as optim

from make_env import make_env

""" pip install

pip uninstall torch

pip install torch==1.4.0

pip uninstall numpy

pip install numpy==1.14.5

"""

#env = make_env('simple')

env = make_env('simple_adversary')

observation = env.reset()

print(observation)

print(observation[0])

print(env.action_space)

print(env.action_space[0])

"""

# リプレイバッファのクラスを作成する

class MultiAgentReplayBuffer:
    """Fixed-size ring buffer of joint transitions for several agents.

    Every slot holds the joint (critic) state and next state, one reward and
    one done flag per agent, and each agent's own observation, next
    observation and action vector.
    """

    def __init__(self, max_size, critic_dims, actor_dims,
                 n_actions, n_agents, batch_size):
        """Allocate zero-filled storage for ``max_size`` transitions.

        Args:
            max_size: number of slots in the ring buffer.
            critic_dims: length of the joint state vector.
            actor_dims: per-agent observation lengths, indexed by agent.
            n_actions: length of each agent's action vector.
            n_agents: number of agents.
            batch_size: number of transitions returned by sample_buffer().
        """
        self.mem_cntr = 0  # total number of transitions written so far

        self.mem_size = max_size
        self.critic_dims = critic_dims
        self.actor_dims = actor_dims
        self.n_actions = n_actions
        self.n_agents = n_agents
        self.batch_size = batch_size

        # Joint (critic-side) storage.
        self.state_memory = np.zeros((self.mem_size, critic_dims))
        self.new_state_memory = np.zeros((self.mem_size, critic_dims))
        self.reward_memory = np.zeros((self.mem_size, n_agents))
        # Done flags are stored as booleans so they can be used as a mask.
        self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool)

        # Per-agent (actor-side) storage.
        self.init_actor_memory()

    def init_actor_memory(self):
        """(Re)create the zero-filled per-agent observation/action arrays."""
        self.actor_state_memory = []
        self.actor_new_state_memory = []
        self.actor_action_memory = []

        for i in range(self.n_agents):
            self.actor_state_memory.append(
                np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_new_state_memory.append(
                np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_action_memory.append(
                np.zeros((self.mem_size, self.n_actions)))

    def store_transition(self, raw_obs, state, action, reward,
                         raw_obs_, state_, done):
        """Write one joint transition, overwriting the oldest slot when full.

        Args:
            raw_obs: per-agent observations before the step.
            state: joint state vector before the step.
            action: per-agent action vectors.
            reward: per-agent rewards.
            raw_obs_: per-agent observations after the step.
            state_: joint state vector after the step.
            done: per-agent done flags.
        """
        # The actor arrays are NOT reset on wrap-around: clearing them while
        # the state arrays keep their contents would make sample_buffer()
        # return real states paired with all-zero observations and actions.
        index = self.mem_cntr % self.mem_size

        for agent_idx in range(self.n_agents):
            self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx]
            self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx]
            self.actor_action_memory[agent_idx][index] = action[agent_idx]

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        # Rewards must be stored too, otherwise every sampled reward is 0.
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(actor_states, states, actions, rewards,
            actor_new_states, states_, terminal)`` where the ``actor_*`` and
            ``actions`` entries are lists with one array per agent.
        """
        max_mem = min(self.mem_cntr, self.mem_size)

        # replace=False: a transition appears at most once per batch.
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        states = self.state_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        actor_states = [mem[batch] for mem in self.actor_state_memory]
        actor_new_states = [mem[batch] for mem in self.actor_new_state_memory]
        actions = [mem[batch] for mem in self.actor_action_memory]

        return actor_states, states, actions, rewards, \
            actor_new_states, states_, terminal

    def ready(self):
        """Return True once at least ``batch_size`` transitions were stored."""
        return self.mem_cntr >= self.batch_size

# クリティックのニューラルネットワークを作成する

class CriticNetwork(T.nn.Module):
    """Centralised critic: joint state plus all agents' actions -> one Q-value."""

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims,
                 n_agents, n_actions, chkpt_dir, name):
        """Build the network, its Adam optimizer and the checkpoint path.

        Args:
            beta: learning rate of the Adam optimizer.
            input_dims: length of the joint state vector.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_agents: number of agents whose actions are fed to the critic.
            n_actions: length of each agent's action vector.
            chkpt_dir: directory in which the checkpoint is written.
            name: checkpoint file name inside ``chkpt_dir``.
        """
        super().__init__()

        self.chkpt_dir = chkpt_dir
        self.chkpt_file = os.path.join(chkpt_dir, name)
        # Make sure the checkpoint directory exists. No empty placeholder
        # file is written: a zero-byte file cannot be read back by T.load().
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)

        # The input is the joint state concatenated with every agent's action.
        self.fc1 = T.nn.Linear(input_dims + n_agents * n_actions, fc1_dims)
        self.fc2 = T.nn.Linear(fc1_dims, fc2_dims)
        self.q = T.nn.Linear(fc2_dims, 1)  # single scalar output

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state, action):
        """Return Q(state, action) with one output column per batch row.

        ``state`` and ``action`` are concatenated along dim 1, so both must be
        2-D tensors with the same number of rows.
        """
        x = T.cat((state, action), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def save_checkpoint(self):
        """Write the parameters (state_dict) to ``self.chkpt_file``."""
        directory = os.path.dirname(self.chkpt_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the parameters from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.load_state_dict(T.load(self.chkpt_file,
                                    map_location=self.device))

# アクターのニューラルネットワークを作成する

class ActorNetwork(T.nn.Module):
    """Per-agent policy network: own observation -> softmax over actions."""

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions, chkpt_dir, name):
        """Build the network, its Adam optimizer and the checkpoint path.

        Args:
            alpha: learning rate of the Adam optimizer.
            input_dims: length of this agent's observation vector.
            fc1_dims: width of the first hidden layer.
            fc2_dims: width of the second hidden layer.
            n_actions: number of outputs (one per action component).
            chkpt_dir: directory in which the checkpoint is written.
            name: checkpoint file name inside ``chkpt_dir``.
        """
        super().__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)
        # Make sure the checkpoint directory exists. No empty placeholder
        # file is written: a zero-byte file cannot be read back by T.load().
        if chkpt_dir:
            os.makedirs(chkpt_dir, exist_ok=True)

        self.fc1 = T.nn.Linear(input_dims, fc1_dims)
        self.fc2 = T.nn.Linear(fc1_dims, fc2_dims)
        self.pi = T.nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the policy output for a batch of observations.

        The softmax is taken over dim 1, so ``state`` must be 2-D
        (one observation per row); every output row sums to 1.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi

    def save_checkpoint(self):
        """Write the parameters (state_dict) to ``self.chkpt_file``."""
        directory = os.path.dirname(self.chkpt_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        """Load the parameters from ``self.chkpt_file`` onto ``self.device``."""
        # map_location lets a checkpoint saved on GPU be restored on CPU.
        self.load_state_dict(T.load(self.chkpt_file,
                                    map_location=self.device))

class Agent:
    """One MADDPG agent: actor, critic and a target copy of each."""

    def __init__(self, agent_idx,
                 actor_dims, critic_dims,
                 n_agents, n_actions,
                 fc1=64, fc2=64,
                 alpha=0.01, beta=0.01,
                 gamma=0.95, tau=0.01,
                 chkpt_dir='tmp/maddpg/'):
        """Create the four networks and hard-copy online weights to targets.

        Args:
            agent_idx: index used to build the checkpoint file names.
            actor_dims: length of this agent's observation vector.
            critic_dims: length of the joint state vector.
            n_agents: number of agents in the environment.
            n_actions: length of each agent's action vector.
            fc1: width of the first hidden layer of every network.
            fc2: width of the second hidden layer of every network.
            alpha: actor learning rate.
            beta: critic learning rate.
            gamma: discount factor.
            tau: soft-update coefficient for the target networks.
            chkpt_dir: directory for the checkpoint files.
        """
        self.gamma = gamma
        self.tau = tau
        self.n_agents = n_agents
        self.n_actions = n_actions
        self.agent_name = 'agent_%s' % agent_idx
        self.chkpt_dir = chkpt_dir

        # Online networks.
        self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                  chkpt_dir=self.chkpt_dir, name=self.agent_name+'_actor')
        self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                    chkpt_dir=self.chkpt_dir, name=self.agent_name+'_critic')

        # Target networks.
        self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                         chkpt_dir=self.chkpt_dir, name=self.agent_name+'_target_actor')
        self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, n_agents, n_actions,
                                           chkpt_dir=self.chkpt_dir, name=self.agent_name+'_target_critic')

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    @staticmethod
    def _soft_update(target_net, online_net, tau):
        """Set target <- tau * online + (1 - tau) * target, parameter by name."""
        target_params = dict(target_net.named_parameters())
        # no_grad: this is a plain weight copy and must not be recorded by
        # autograd (the parameters themselves require grad).
        with T.no_grad():
            for name, online_param in online_net.named_parameters():
                target_param = target_params[name]
                target_param.copy_(tau * online_param
                                   + (1 - tau) * target_param)

    def update_network_parameters(self, tau=None):
        """Soft-update both target networks towards the online networks.

        Args:
            tau: blend factor; ``None`` means ``self.tau``. ``tau=1`` copies
                the online weights verbatim.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.target_actor, self.actor, tau)
        self._soft_update(self.target_critic, self.critic, tau)

    def choose_action(self, observation):
        """Return the actor output for one observation plus uniform noise.

        Args:
            observation: this agent's observation (1-D array-like).

        Returns:
            1-D numpy array of length ``n_actions``: the actor output with
            noise drawn uniformly from [0, 1) added to every component.
        """
        device = self.actor.device
        # Wrap in a list to add the batch dimension the actor expects.
        state = T.tensor(np.array([observation]), dtype=T.float).to(device)
        with T.no_grad():  # acting needs no gradient tracking
            actions = self.actor.forward(state)
            # Create the noise on the actor's device; T.rand defaults to CPU
            # and would not add to a CUDA tensor.
            noise = T.rand(self.n_actions, device=device)
            action = actions + noise

        return action.cpu().numpy()[0]

    def save_models(self):
        """Save all four networks to their checkpoint files."""
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Load all four networks from their checkpoint files."""
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

# MADDPクラスを作成する

class MADDPG:
    """Container for a group of agents trained with MADDPG.

    Builds one ``Agent`` per environment agent and implements the joint
    learning step: every critic is evaluated on the concatenated state and
    the concatenated actions of all agents, while every actor only receives
    its own observation.
    """

    def __init__(self,
                 actor_dims, critic_dims, n_agents, n_actions,
                 fc1=64, fc2=64,
                 alpha=0.01, beta=0.01,
                 gamma=0.99, tau=0.01,
                 scenario='simple_adversary',
                 chkpt_dir='tmp/maddpg/'):
        """Create the agents.

        Args:
            actor_dims: per-agent observation sizes, indexed by agent.
            critic_dims: size of the joint state vector given to the critics.
            n_agents: number of agents to create.
            n_actions: number of actions per agent.
            fc1, fc2: hidden layer sizes forwarded to ``Agent``.
            alpha, beta: learning rates forwarded to ``Agent`` (actor and
                critic respectively, per the notes in the main script).
            gamma: discount factor forwarded to ``Agent``; read back as
                ``agent.gamma`` in ``learn``.
            tau: forwarded to ``Agent`` (presumably the soft-update rate of
                the target networks -- confirm against ``Agent``).
            scenario: scenario name, appended to ``chkpt_dir``.
            chkpt_dir: checkpoint directory prefix.
        """
        self.agents = []
        self.n_agents = n_agents
        self.n_actions = n_actions
        chkpt_dir += scenario

        for agent_idx in range(self.n_agents):
            # Forward the caller's hyper-parameters.  Previously literals
            # were passed here (including gamma=0.95), so the arguments of
            # this constructor were silently ignored.
            self.agents.append(Agent(agent_idx,
                                     actor_dims[agent_idx], critic_dims,
                                     n_agents, n_actions,
                                     fc1=fc1, fc2=fc2,
                                     alpha=alpha, beta=beta,
                                     gamma=gamma, tau=tau,
                                     chkpt_dir=chkpt_dir))

    def save_checkpoint(self):
        """Save the networks of every agent."""
        print('==== saving checkpoint ====')
        for agent in self.agents:
            agent.save_models()

    def load_checkpoint(self):
        """Load the networks of every agent."""
        print('==== loading checkpoint ====')
        for agent in self.agents:
            agent.load_models()

    def choose_action(self, raw_obs):
        """Return one action per agent.

        Args:
            raw_obs: per-agent observations, indexed like ``self.agents``.

        Returns:
            A list with the result of ``agent.choose_action`` for each agent.
        """
        actions = []
        for agent_idx, agent in enumerate(self.agents):
            actions.append(agent.choose_action(raw_obs[agent_idx]))
        return actions

    def learn(self, memory):
        """Run one critic and one actor update for every agent.

        Does nothing while ``memory.ready()`` is false.

        Args:
            memory: replay buffer providing ``ready()`` and
                ``sample_buffer()``.
        """
        if not memory.ready():
            return

        # Sample a batch of transitions from the replay buffer.
        actor_states, states, actions, rewards, \
            actor_new_states, states_, dones = memory.sample_buffer()

        # Put the batch on the same device as the networks.
        device = self.agents[0].actor.device

        states = T.tensor(states, dtype=T.float).to(device)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        dones = T.tensor(dones).to(device)

        target_next_actions = []  # target-actor actions for the next states
        policy_actions = []       # current-actor actions for the states
        buffer_actions = []       # actions actually taken (from the buffer)

        for agent_idx, agent in enumerate(self.agents):
            new_states = T.tensor(actor_new_states[agent_idx],
                                  dtype=T.float).to(device)
            # Target actions only feed the TD target, so no graph is needed.
            with T.no_grad():
                new_pi = agent.target_actor.forward(new_states)
            target_next_actions.append(new_pi)

            mu_states = T.tensor(actor_states[agent_idx],
                                 dtype=T.float).to(device)
            pi = agent.actor.forward(mu_states)
            policy_actions.append(pi)

            buffer_actions.append(actions[agent_idx])

        new_actions = T.cat(target_next_actions, dim=1)
        old_actions = T.cat(buffer_actions, dim=1)

        for agent_idx, agent in enumerate(self.agents):
            # TD target: r + gamma * Q'(s', a'), with Q' zeroed on terminal
            # transitions.  Uses this agent's own done flag (the previous
            # code always used agent 0's).
            with T.no_grad():
                critic_value_ = agent.target_critic(states_,
                                                    new_actions).flatten()
                critic_value_[dones[:, agent_idx]] = 0.0
                target = rewards[:, agent_idx] + agent.gamma * critic_value_

            # Critic update.
            critic_value = agent.critic(states, old_actions).flatten()
            critic_loss = F.mse_loss(critic_value, target)
            agent.critic.optimizer.zero_grad()
            critic_loss.backward()
            agent.critic.optimizer.step()

            # Actor update.  Only this agent's action keeps its graph; the
            # other agents' actions are detached.  Back-propagating through
            # another agent's actor after that actor's optimizer has already
            # stepped touches weights that were modified in place, which is
            # what made backward() fail here before.
            mu = T.cat([pi if idx == agent_idx else pi.detach()
                        for idx, pi in enumerate(policy_actions)], dim=1)
            # Mean over the batch: maximise the average Q of the policy.
            actor_loss = -agent.critic.forward(states, mu).mean()
            agent.actor.optimizer.zero_grad()
            actor_loss.backward()
            agent.actor.optimizer.step()

            # Let the agent refresh its networks (see
            # Agent.update_network_parameters).
            agent.update_network_parameters()

def obsavation_list_to_state_vector(observation):
    """Join per-agent observations into a single 1-D state vector.

    Args:
        observation: iterable of 1-D array-likes, one per agent.

    Returns:
        A 1-D ``numpy.ndarray`` with the observations laid end to end, in
        order.  An empty iterable gives an empty float array.
    """
    # One concatenate call instead of re-allocating the array once per
    # agent.  The leading empty float64 array keeps the result dtype and the
    # empty-input behaviour of the original accumulate-in-a-loop version.
    return np.concatenate([np.array([]), *observation])

# ここからがメインスクリプト

if __name__ == '__main__':
    # Uncomment to make autograd report the forward op behind a backward
    # error.
    # T.autograd.set_detect_anomaly(True)

    # Scenario to run ('simple' is the other one that was tried).
    scenario = 'simple_adversary'
    env = make_env(scenario)

    n_agents = env.n
    print('n_agents : ', n_agents)

    # Each actor is sized for its own agent's observation vector
    # (noted as [8, 10, 10] for simple_adversary).
    actor_dims = [env.observation_space[i].shape[0] for i in range(n_agents)]
    print(f'actor_dims : {actor_dims}')

    # The critic is fed the state vector built by
    # obsavation_list_to_state_vector, i.e. all observations joined, so its
    # input size is the sum of the per-agent observation sizes.
    critic_dims = sum(actor_dims)

    # Number of actions, taken from the first agent's action space.
    n_actions = env.action_space[0].n

    maddpg_agents = MADDPG(actor_dims, critic_dims,
                           n_agents, n_actions,
                           fc1=64, fc2=64,
                           alpha=0.01, beta=0.01,
                           gamma=0.99, tau=0.01,
                           scenario=scenario,
                           chkpt_dir='tmp/maddpg/')

    memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                                    n_actions, n_agents, batch_size=1024)

    PRINT_INTERVAL = 500  # episodes between progress prints
    N_GAMES = 30000       # number of episodes
    MAX_STEPS = 25        # maximum number of steps per episode

    total_steps = 0
    # NOTE(review): checkpoints are only written once the running average
    # exceeds 0; confirm that this baseline suits the scenario's rewards.
    best_score = 0
    # Kept across episodes so the average below really covers the last 100
    # episodes (it used to be reset at the start of every episode).
    score_history = []

    # False = train, True = evaluate with previously saved weights.
    evaluate = False
    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()
        score = 0
        done = [False] * n_agents
        episode_step = 0

        # The episode ends as soon as any agent reports done.
        while not any(done):
            if evaluate:
                env.render()

            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)

            # Joint state vectors for the current and the next step.
            state = obsavation_list_to_state_vector(obs)
            state_ = obsavation_list_to_state_vector(obs_)

            # Force termination on the MAX_STEPS-th step.  episode_step is
            # zero-based here, so the old `> MAX_STEPS` test let episodes
            # run for MAX_STEPS + 2 steps.
            if episode_step + 1 >= MAX_STEPS:
                done = [True] * n_agents

            memory.store_transition(obs, state, actions, reward,
                                    obs_, state_, done)

            # Learn once every 100 environment steps (training only).
            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_
            score += sum(reward)  # episode score sums all agents' rewards
            total_steps += 1
            episode_step += 1

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if not evaluate:
            # Save whenever the running average improves.
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score

        if i % PRINT_INTERVAL == 0 and i > 0:
            print('(episode)', i, f'average_score {avg_score:.1f}')

    print('Script is done')

#1:47:03

# #https://www.youtube.com/watch?v=tZTQ6S9PfkE

メイン

"""
Code for creating a multiagent environment with one of the scenarios listed
in ./scenarios/.
Can be called by using, for example:
    env = make_env('simple_speaker_listener')
After producing the env object, can be used similarly to an OpenAI gym
environment.

A policy using this environment must output actions in the form of a list
for all agents. Each element of the list should be a numpy array,
of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede
communication actions in this array. See environment.py for more details.
"""

def make_env(scenario_name, benchmark=False):
    """Build a MultiAgentEnv for the named scenario.

    The returned object can be used much like a gym environment:
    call env.reset() and env.step(), and env.render() to display it.

    Args:
        scenario_name: name of the scenario module in ./scenarios/ to load
            (without the .py extension).
        benchmark: when true, the scenario's benchmark_data callback is
            also passed to the environment (usually only done during
            evaluation).

    Returns:
        The MultiAgentEnv instance. Useful properties (see environment.py):
            .observation_space : the observation space for each agent
            .action_space      : the action space for each agent
            .n                 : the number of agents
    """
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # Load the scenario script and build its world.
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()

    # Callbacks handed to the environment, in positional order.
    callbacks = [scenario.reset_world, scenario.reward, scenario.observation]
    if benchmark:
        callbacks.append(scenario.benchmark_data)

    return MultiAgentEnv(world, *callbacks)

"""

Code for creating a multiagent environment with one of the scenarios listed

in ./scenarios/.

Can be called by using, for example:

env = make_env('simple_speaker_listener')

After producing the env object, can be used similarly to an OpenAI gym

environment.

A policy using this environment must output actions in the form of a list

for all agents. Each element of the list should be a numpy array,

of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede

communication actions in this array. See environment.py for more details.

"""

def make_env(scenario_name, benchmark=False):
    """Create a MultiAgentEnv object for one of the ./scenarios/ scripts.

    The result can be used similarly to a gym environment by calling
    env.reset() and env.step(); env.render() shows it on screen.

    Input:
        scenario_name : name of the scenario in ./scenarios/ to load
                        (without the .py extension)
        benchmark     : whether benchmarking data should be produced
                        (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space : the observation space for each agent
        .action_space      : the action space for each agent
        .n                 : the number of agents
    """
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # Load the scenario from its script, then create the world.
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()

    # Positional arguments for the environment; the benchmark callback is
    # appended only on request.
    env_args = (world, scenario.reset_world, scenario.reward,
                scenario.observation)
    if benchmark:
        env_args += (scenario.benchmark_data,)

    env = MultiAgentEnv(*env_args)
    return env

この記事を書いた人
最新の記事

Keita N

最新記事 by Keita N (全て見る)

2025年8月
月	火	水	木	金	土	日
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31