From b08d096bfa69e2a7bb8c903c3a4e8fb466f8f11d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 14:51:34 +0800 Subject: [PATCH 1/8] add selfplay test --- openrl/selfplay/opponents/utils.py | 3 + openrl/selfplay/strategies/__init__.py | 41 -- openrl/selfplay/strategies/base_strategy.py | 39 -- openrl/selfplay/strategies/strategies.py | 413 ------------------ tests/test_selfplay/test_selfplay_strategy.py | 91 ---- tests/test_selfplay/test_train_selfplay.py | 120 +++++ 6 files changed, 123 insertions(+), 584 deletions(-) delete mode 100644 openrl/selfplay/strategies/__init__.py delete mode 100644 openrl/selfplay/strategies/base_strategy.py delete mode 100644 openrl/selfplay/strategies/strategies.py delete mode 100644 tests/test_selfplay/test_selfplay_strategy.py create mode 100644 tests/test_selfplay/test_train_selfplay.py diff --git a/openrl/selfplay/opponents/utils.py b/openrl/selfplay/opponents/utils.py index d1d983d5..42ddbb2b 100644 --- a/openrl/selfplay/opponents/utils.py +++ b/openrl/selfplay/opponents/utils.py @@ -28,6 +28,9 @@ def check_opponent_template(opponent_template: Union[str, Path]): + assert isinstance(opponent_template, Path) or isinstance( + opponent_template, str + ), f"opponent_template {opponent_template} must be a Path or str" if isinstance(opponent_template, str): opponent_template = Path(opponent_template) assert ( diff --git a/openrl/selfplay/strategies/__init__.py b/openrl/selfplay/strategies/__init__.py deleted file mode 100644 index 2908f8b4..00000000 --- a/openrl/selfplay/strategies/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""""" -from openrl.selfplay.strategies.strategies import ( - NaiveSelfplayStrategy, - OnlyLatestSelfplayStrategy, - VarExistEnemySelfplayStrategy, - WeightExistEnemySelfplayStrategy, - WeightSelfplayStrategy, - WinRateSelfplayStrategy, -) - - -def make_strategy(strategy_name): - if strategy_name == "Naive": - selfplay_strategy = NaiveSelfplayStrategy - elif strategy_name == "OnlyLatest": - selfplay_strategy = OnlyLatestSelfplayStrategy - elif strategy_name == "Weight": - selfplay_strategy = WeightSelfplayStrategy - elif strategy_name == "WinRate": - selfplay_strategy = WinRateSelfplayStrategy - elif strategy_name == "VarExistEnemy": - selfplay_strategy = VarExistEnemySelfplayStrategy - elif strategy_name == "WeightExistEnemy": - selfplay_strategy = WeightExistEnemySelfplayStrategy - return selfplay_strategy diff --git a/openrl/selfplay/strategies/base_strategy.py b/openrl/selfplay/strategies/base_strategy.py deleted file mode 100644 index 4e280b13..00000000 --- a/openrl/selfplay/strategies/base_strategy.py +++ /dev/null @@ -1,39 +0,0 @@ -from abc import abstractmethod - - -class BaseSelfplayStrategy: - @abstractmethod - def __init__(self, all_args, nenvs, exist_enemy_num): - raise NotImplementedError - - @abstractmethod - def getcnt(self): - raise NotImplementedError - - @abstractmethod - def update_enemy_ids(self, new_enemy_ids): - raise NotImplementedError - - @abstractmethod - def restore(self, model_dir): - raise NotImplementedError - - @abstractmethod - def get_qlist(self): - raise NotImplementedError - - @abstractmethod - def update_weight(self, enemy_loses): - raise NotImplementedError - - @abstractmethod - def update_win_rate(self, dones, enemy_wins): - raise NotImplementedError - - @abstractmethod - def push_newone(self): - raise NotImplementedError - - @abstractmethod - def get_plist(self): - raise NotImplementedError diff --git a/openrl/selfplay/strategies/strategies.py b/openrl/selfplay/strategies/strategies.py deleted file mode 100644 index 28e492ec..00000000 --- a/openrl/selfplay/strategies/strategies.py +++ /dev/null @@ -1,413 +0,0 @@ -import json - -import numpy as np - -from openrl.selfplay.strategies.base_strategy import BaseSelfplayStrategy - - -class SelfplayStrategy(BaseSelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - # qlist和history_cnt的数据结构 - self.all_args = all_args - self.qlist = [] - self.history_cnt = 0 - self.enemy_ids = [0] * nenvs - self.length = nenvs - - def getcnt(self): - return self.history_cnt - - def update_enemy_ids(self, new_enemy_ids): - self.enemy_ids = new_enemy_ids - - def restore(self, model_dir): - with open(model_dir + "/enemy_history_info.json") as f_obj: - enemy_info = json.load(f_obj) - self.qlist = enemy_info["qlist"] - self.history_cnt = enemy_info["history_cnt"] - - def get_qlist(self): - return self.qlist - - def update_weight(self, enemy_loses): - pass - - def update_win_rate(self, dones, enemy_wins): - pass - - def push_newone(self): - pass - - -class RatioSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(RatioSelfplayStrategy, self).__init__(all_args, nenvs) - - def push_newone(self): - self.history_cnt += 1 - - def get_plist(self): - if self.history_cnt == 1: - return [1] - temp_plist = np.logspace( - 0, self.history_cnt - 1, self.history_cnt, endpoint=True, base=1.5 - ) - temp_plist[-1] = sum(temp_plist[:-1]) * 4 - temp_plist /= sum(temp_plist) - return temp_plist - - -class NaiveSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(NaiveSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - - def push_newone(self): - self.history_cnt += 1 - - def get_plist(self): - return [1] * (self.history_cnt - 1) + [4 * (self.history_cnt - 1)] - - def save_new_one(self): - return True - - -class OnlyLatestSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(OnlyLatestSelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - self.play_list = [] - self.max_play_num = all_args.max_play_num - self.least_win_rate = all_args.least_win_rate - - def push_newone(self): - self.play_list.append([]) - self.history_cnt += 1 - - def get_plist(self): - return [0] * (self.history_cnt - 1) + [1] - - def save_new_one(self, least_win_rate): - if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * ( - len(self.play_list[-1]) + 1 - ) and len(self.play_list[-1]) >= (self.max_play_num - 10): - return True - - def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids): - for win_enemy_id in win_enemy_ids: - self.play_list[win_enemy_id].append(1) - for tie_enemy_id in tie_enemy_ids: - self.play_list[tie_enemy_id].append(0) - for lose_enemy_id in lose_enemy_ids: - self.play_list[lose_enemy_id].append(-1) - self.cut_overflow() - - def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses): - win_enemy_ids = np.array(self.enemy_ids)[enemy_wins] - tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties] - lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses] - self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def cut_overflow(self): - for index in range(len(self.play_list)): - if len(self.play_list[index]) > self.max_play_num: - self.play_list[index] = self.play_list[index][ - (-1) * self.max_play_num : - ] - - def get_info_list(self, info_list): - return_info = [] - for info in info_list: - if info == "win": - equal_num = 1 - elif info == "tie": - equal_num = 0 - elif info == "lose": - equal_num = -1 - num_list = [] - for enemy_play_list in self.play_list: - if info == "play": - num_list.append(len(enemy_play_list)) - else: - num_list.append(int(sum(np.array(enemy_play_list) == equal_num))) - return_info.append(num_list) - return tuple(return_info) - - def get_enemy_play_dict(self): - win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list( - ["win", "tie", "lose", "play"] - ) - return { - "win_num_list": list(win_num_list), - "tie_num_list": list(tie_num_list), - "lose_num_list": list(lose_num_list), - "play_num_list": list(play_num_list), - } - - -class WeightSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WeightSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - self.recent_weight = 0.8 - self.recent_num = 3 - self.gama = 1 / (nenvs) - - def push_newone(self): - self.history_cnt += 1 - if self.history_cnt <= self.recent_num: - return - elif self.history_cnt == self.recent_num + 1: - self.qlist = [1] - else: - self.qlist.append(max(self.qlist)) - - def get_plist(self): - temp_plist = np.zeros([self.history_cnt]) - temp_plist[: (-1 * self.recent_num)] = ( - np.exp(self.qlist) / sum(np.exp(self.qlist)) * (1 - self.recent_weight) - ) - temp_plist[(-1 * self.recent_num) :] = self.recent_weight / self.recent_num - return temp_plist - - def update_weight(self, enemy_loses): - if self.history_cnt < self.recent_num + 2: - return - lose_enemy_ids = np.array(self.enemy_ids)[ - enemy_loses - ] # 输了的enemy_ids,进行更新,其中可能有重复的enemy_id - for enemy_id in lose_enemy_ids: - if enemy_id <= len(self.qlist) - 1: - divide_num = ( - len(self.qlist) - * np.exp(self.qlist[enemy_id]) - / sum(np.exp(self.qlist)) - ) - next_weight = self.qlist[enemy_id] - self.gama / divide_num - self.qlist[enemy_id] = next_weight - - -class WinRateSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WinRateSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - self.max_play_num = all_args.max_play_num - self.play_list = ( - [] - ) # 在该list中,每个对手维护一个长度不超过max_play_num的列表,1为该对手获胜, 0为平, -1为我方获胜 - self.recent_list = [] - self.recent_list_max_len = all_args.recent_list_max_len - self.latest_weight = all_args.latest_weight - self.least_win_rate = all_args.least_win_rate - self.stage2_least_win_rate = all_args.least_win_rate - self.stage = 1 - self.newest_pos = all_args.newest_pos - self.newest_weight = all_args.newest_weight - - def push_newone(self): - self.play_list.append([]) - self.history_cnt += 1 - - def get_info_list(self, info_list): - return_info = [] - for info in info_list: - if info == "win": - equal_num = 1 - elif info == "tie": - equal_num = 0 - elif info == "lose": - equal_num = -1 - num_list = [] - for enemy_play_list in self.play_list: - if info == "play": - num_list.append(len(enemy_play_list)) - else: - num_list.append(int(sum(np.array(enemy_play_list) == equal_num))) - return_info.append(num_list) - return tuple(return_info) - - def get_plist(self): - def f_hard(win_rate_list): - p = 1 - return win_rate_list**p - - def f_var(win_rate_list): - return (1 - win_rate_list) * win_rate_list - - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - return f_hard(win_rate_list) - - def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids): - if self.stage == 2: - win_enemy_num = (np.array(win_enemy_ids) != self.newest_pos).sum() - tie_enemy_num = (np.array(tie_enemy_ids) != self.newest_pos).sum() - lose_enemy_num = (np.array(lose_enemy_ids) != self.newest_pos).sum() - self.recent_list += ( - [1] * win_enemy_num + [0] * tie_enemy_num + [-1] * lose_enemy_num - ) - for win_enemy_id in win_enemy_ids: - self.play_list[win_enemy_id].append(1) - for tie_enemy_id in tie_enemy_ids: - self.play_list[tie_enemy_id].append(0) - for lose_enemy_id in lose_enemy_ids: - self.play_list[lose_enemy_id].append(-1) - self.cut_overflow() - - def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses): - win_enemy_ids = np.array(self.enemy_ids)[enemy_wins] - tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties] - lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses] - self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def restore(self, model_dir): - with open(model_dir + "/enemy_history_info.json") as f_obj: - enemy_info = json.load(f_obj) - self.history_cnt = enemy_info["history_cnt"] - self.play_list = enemy_info["play_list"] - - def get_enemy_play_dict(self): - win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list( - ["win", "tie", "lose", "play"] - ) - return { - "win_num_list": list(win_num_list), - "tie_num_list": list(tie_num_list), - "lose_num_list": list(lose_num_list), - "play_num_list": list(play_num_list), - } - - def update_win_info(self, data): - win_enemy_ids, tie_enemy_ids, lose_enemy_ids = ( - data["win_enemy_ids"], - data["tie_enemy_ids"], - data["lose_enemy_ids"], - ) - self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def cut_overflow(self): - for index in range(len(self.play_list)): - if len(self.play_list[index]) > self.max_play_num: - self.play_list[index] = self.play_list[index][ - (-1) * self.max_play_num : - ] - if len(self.recent_list) > self.recent_list_max_len: - self.recent_list = self.recent_list[(-1) * self.recent_list_max_len :] - - def save_new_one(self, least_win_rate): - if self.stage == 1: - if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * ( - len(self.play_list[-1]) + 1 - ) and len(self.play_list[-1]) >= (self.max_play_num - 10): - if self.getcnt() - self.all_args.exist_enemy_num == 1: - return True - self.stage = 2 - print("switch to stage 2") - if self.stage == 2: - if sum(np.array(self.recent_list) == -1) >= self.stage2_least_win_rate * ( - len(self.recent_list) + 1 - ) and len(self.recent_list) >= (self.recent_list_max_len - 10): - self.stage = 1 - self.recent_list = [] - return True - return False - - -class ExistEnemySelfplayStrategy(WinRateSelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(ExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - self.all_args = all_args - self.enemy_ids = [0] * nenvs # 第一个step就会更新,所以初始化无所谓 - # 列表的前exist_enemy_num个为已存在的对手 - if exist_enemy_num > 0: - self.play_list = [[]] * exist_enemy_num - self.history_cnt = exist_enemy_num - self.exist_enemy_num = exist_enemy_num - self.max_enemy_num = all_args.max_enemy_num - - def get_final_plist(self, f_hard, f_var): - raise NotImplementedError - - def get_plist(self): - def f_hard(win_rate_list): - p = 2 - return win_rate_list**p - - def f_var(win_rate_list): - return (1 - win_rate_list) * win_rate_list - - plist = self.get_final_plist(f_hard, f_var) - if self.max_enemy_num != -1: - if self.history_cnt - self.exist_enemy_num > self.max_enemy_num: - mask_index = np.array( - list( - range( - self.exist_enemy_num, self.history_cnt - self.max_enemy_num - ) - ) - ) - zero_vec = np.zeros( - self.history_cnt - self.exist_enemy_num - self.max_enemy_num - ) - plist[mask_index] = zero_vec - - return plist - - -class VarExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(VarExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - - def get_final_plist(self, f_hard, f_var): - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - win_rate_list = f_var(win_rate_list) - - return win_rate_list - - -class WeightExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WeightExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - - def get_final_plist(self, f_hard, f_var): - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - - if self.stage == 1: - win_rate_list = f_hard(win_rate_list)[:-1] - # if self.newest_pos != -1: - # win_rate_list[self.newest_pos] = 0 - win_rate_list = ( - win_rate_list / (sum(win_rate_list) + 1e-8) * (1 - self.latest_weight) - ) - return list(win_rate_list) + [self.latest_weight] - elif self.stage == 2: - win_rate_list = f_hard(win_rate_list) - if self.newest_pos != -1: - win_rate_list[self.newest_pos] = self.newest_weight - index_without_newest = list(range(self.history_cnt)) - index_without_newest.remove(self.newest_pos) - win_rate_list[index_without_newest] /= sum( - win_rate_list[index_without_newest] - ) - win_rate_list[index_without_newest] *= 1 - self.newest_weight - else: - win_rate_list /= sum(win_rate_list) - return win_rate_list diff --git a/tests/test_selfplay/test_selfplay_strategy.py b/tests/test_selfplay/test_selfplay_strategy.py deleted file mode 100644 index 61b04052..00000000 --- a/tests/test_selfplay/test_selfplay_strategy.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""""" -import os -import sys - -import pytest - -from openrl.selfplay.strategies import ( - NaiveSelfplayStrategy, - OnlyLatestSelfplayStrategy, - VarExistEnemySelfplayStrategy, - WeightExistEnemySelfplayStrategy, - WeightSelfplayStrategy, - WinRateSelfplayStrategy, -) - - -@pytest.fixture(scope="module", params=[""]) -def config(request): - from openrl.configs.config import create_config_parser - - cfg_parser = create_config_parser() - cfg = cfg_parser.parse_args(request.param.split()) - return cfg - - -@pytest.mark.unittest -def test_naive_selfplay(config): - strategy = NaiveSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.update_win_rate(dones=True, enemy_wins=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_only_latest_selfplay(config): - strategy = OnlyLatestSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_weight_selfplay(config): - strategy = WeightSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_win_rate_selfplay(config): - strategy = WinRateSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - - -@pytest.mark.unittest -def test_var_exist_enemy_selfplay(config): - strategy = VarExistEnemySelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_weight_exist_enemy_selfplay(config): - strategy = WeightExistEnemySelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -if __name__ == "__main__": - sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py new file mode 100644 index 00000000..9e7b501f --- /dev/null +++ b/tests/test_selfplay/test_train_selfplay.py @@ -0,0 +1,120 @@ +import os +import sys + +import numpy as np +import pytest +import torch + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers import FlattenObservation +from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner +from openrl.modules.common import PPONet as Net +from openrl.runners.common import PPOAgent as Agent +from openrl.selfplay.wrappers.opponent_pool_wrapper import OpponentPoolWrapper +from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper + + +@pytest.fixture( + scope="module", + params=[ + "RandomOpponent", + "LastOpponent", + ], +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) + for i, c in enumerate(cfg.callbacks): + if c["id"] == "SelfplayCallback": + c["args"][ + "opponent_template" + ] = "./examples/selfplay/opponent_templates/tictactoe_opponent" + cfg.callbacks[i] = c + elif c["id"] == "SelfplayAPI": + c["args"]["sample_strategy"] = request.param + cfg.callbacks[i] = c + else: + pass + + return cfg + + +def train(cfg): + # Create environment + env_num = 2 + render_model = None + env = make( + "tictactoe_v3", + render_mode=render_model, + env_num=env_num, + asynchronous=True, + opponent_wrappers=[RecordWinner, OpponentPoolWrapper], + env_wrappers=[FlattenObservation], + cfg=cfg, + ) + # Create neural network + + net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu") + # Create agent + agent = Agent(net) + # Begin training + agent.train(total_time_steps=100) + env.close() + agent.save("./selfplay_agent/") + return agent + + +def evaluation(): + from examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender + + print("Evaluation...") + env_num = 1 + env = make( + "tictactoe_v3", + env_num=env_num, + asynchronous=True, + opponent_wrappers=[TictactoeRender, RandomOpponentWrapper], + env_wrappers=[FlattenObservation], + auto_reset=False, + ) + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args([]) + net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu") + + agent = Agent(net) + + agent.load("./selfplay_agent/") + agent.set_env(env) + env.reset(seed=0) + + total_reward = 0.0 + ep_num = 2 + for ep_now in range(ep_num): + obs, info = env.reset() + done = False + step = 0 + + while not np.any(done): + # predict next action based on the observation + action, _ = agent.act(obs, info, deterministic=True) + obs, r, done, info = env.step(action) + step += 1 + + if np.any(done): + total_reward += np.mean(r) > 0 + print(f"{ep_now}/{ep_num}: reward: {np.mean(r)}") + print(f"win rate: {total_reward/ep_num}") + env.close() + print("Evaluation finished.") + + +@pytest.mark.unittest +def test_train_selfplay(config): + train(config) + evaluation() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 7373b044228d9b34b621730076973dde5e55c98d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 15:16:45 +0800 Subject: [PATCH 2/8] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 9e7b501f..c2ae29be 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -59,22 +59,20 @@ def train(cfg): # Create agent agent = Agent(net) # Begin training - agent.train(total_time_steps=100) + agent.train(total_time_steps=20) env.close() agent.save("./selfplay_agent/") return agent def evaluation(): - from examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender - print("Evaluation...") env_num = 1 env = make( "tictactoe_v3", env_num=env_num, asynchronous=True, - opponent_wrappers=[TictactoeRender, RandomOpponentWrapper], + opponent_wrappers=[RandomOpponentWrapper], env_wrappers=[FlattenObservation], auto_reset=False, ) From 7ded5d55c8b2793bbf9624edebbe9d2d64857a4d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:25:25 +0800 Subject: [PATCH 3/8] add selfplay test --- examples/selfplay/selfplay.yaml | 2 +- openrl/selfplay/callbacks/selfplay_api.py | 6 +++++- setup.py | 10 ++++++++-- tests/test_selfplay/test_train_selfplay.py | 14 +++++++++++--- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/examples/selfplay/selfplay.yaml b/examples/selfplay/selfplay.yaml index 7a7c1bbe..8a05611d 100644 --- a/examples/selfplay/selfplay.yaml +++ b/examples/selfplay/selfplay.yaml @@ -1,6 +1,6 @@ globals: selfplay_api_host: 127.0.0.1 - selfplay_api_port: 10086 + selfplay_api_port: 13486 seed: 0 selfplay_api: diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py index 3d148749..cdf9d04d 100644 --- a/openrl/selfplay/callbacks/selfplay_api.py +++ b/openrl/selfplay/callbacks/selfplay_api.py @@ -57,7 +57,10 @@ def _init_callback(self) -> None: success = self.api_client.set_sample_strategy(self.sample_strategy) try_time -= 1 if try_time <= 0: - raise RuntimeError("Failed to set sample strategy.") + raise RuntimeError( + f"Failed to set sample strategy: {self.sample_strategy}. host:" + f" {self.host}, port: {self.port}" + ) def _on_step(self) -> bool: # print("To send request to API server.") @@ -72,5 +75,6 @@ def _on_training_end(self) -> None: print(f"deleting {application_name}") serve.delete(application_name) del self.bind + serve.shutdown() if self.verbose >= 2: print(f"delete {application_name} done!") diff --git a/setup.py b/setup.py index 84da342e..7c8f31a5 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def get_install_requires() -> list: return [ "setuptools>=67.0", - "gymnasium", + "gymnasium>=0.29", "click", "termcolor", "gym", @@ -71,7 +71,13 @@ def get_extra_requires() -> dict: "evaluate", ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], - "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"], + "selfplay_test": [ + "ray[default]", + "ray[serve]", + "fastapi", + "pettingzoo[mpe]", + "pettingzoo[butterfly]", + ], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], "atari": ["gymnasium[atari]", "gymnasium[accept-rom-license]"], diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index c2ae29be..7e440bad 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import ray import torch from openrl.configs.config import create_config_parser @@ -18,22 +19,29 @@ @pytest.fixture( scope="module", params=[ - "RandomOpponent", - "LastOpponent", + {"port": 13486, "strategy": "RandomOpponent"}, + {"port": 13487, "strategy": "LastOpponent"}, ], ) def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) + cfg.selfplay_api.port = request.param["port"] for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ "opponent_template" ] = "./examples/selfplay/opponent_templates/tictactoe_opponent" + port = c["args"]["api_address"].split(":")[-1].split("/")[0] + c["args"]["api_address"] = c["args"]["api_address"].replace( + port, str(request.param["port"]) + ) cfg.callbacks[i] = c elif c["id"] == "SelfplayAPI": - c["args"]["sample_strategy"] = request.param + c["args"]["sample_strategy"] = request.param["strategy"] + c["args"]["port"] = request.param["port"] cfg.callbacks[i] = c + else: pass From 2bd465841e915f1fc0b2a462f93790ed9b411369 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:36:24 +0800 Subject: [PATCH 4/8] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 7e440bad..9af75a1c 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import ray import torch from openrl.configs.config import create_config_parser @@ -20,7 +19,6 @@ scope="module", params=[ {"port": 13486, "strategy": "RandomOpponent"}, - {"port": 13487, "strategy": "LastOpponent"}, ], ) def config(request): @@ -67,7 +65,7 @@ def train(cfg): # Create agent agent = Agent(net) # Begin training - agent.train(total_time_steps=20) + agent.train(total_time_steps=10) env.close() agent.save("./selfplay_agent/") return agent From 17cf742192511b40550e48265e3a2468df586150 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:54:04 +0800 Subject: [PATCH 5/8] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 9af75a1c..bdeb40c1 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -1,4 +1,5 @@ import os +import socket import sys import numpy as np @@ -15,10 +16,17 @@ from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper +def find_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + @pytest.fixture( scope="module", params=[ - {"port": 13486, "strategy": "RandomOpponent"}, + {"port": find_free_port(), "strategy": "RandomOpponent"}, ], ) def config(request): @@ -54,7 +62,7 @@ def train(cfg): "tictactoe_v3", render_mode=render_model, env_num=env_num, - asynchronous=True, + asynchronous=False, opponent_wrappers=[RecordWinner, OpponentPoolWrapper], env_wrappers=[FlattenObservation], cfg=cfg, From 434495499708752f0d32fb519e6fbf9f2ad63110 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 13:02:32 +0800 Subject: [PATCH 6/8] add selfplay test --- .github/workflows/unit_test.yml | 2 +- tests/test_selfplay/test_train_selfplay.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index e327cdf5..9671f935 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -31,7 +31,7 @@ jobs: - name: do_unittest timeout-minutes: 40 run: | - xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes + xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes -s - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v3 with: diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index bdeb40c1..34d28fc3 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -27,12 +27,14 @@ def find_free_port(): scope="module", params=[ {"port": find_free_port(), "strategy": "RandomOpponent"}, + {"port": find_free_port(), "strategy": "LastOpponent"}, ], ) def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) cfg.selfplay_api.port = request.param["port"] + print("port:",request.param["port"]) for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ From a702e8d93f9e9f2a6b064ec32b7e92d7efd850b1 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 13:28:51 +0800 Subject: [PATCH 7/8] add selfplay test --- openrl/selfplay/callbacks/selfplay_api.py | 2 +- openrl/selfplay/selfplay_api/selfplay_api.py | 2 +- setup.py | 9 +++++++-- tests/test_selfplay/test_train_selfplay.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py index cdf9d04d..e2214ecb 100644 --- a/openrl/selfplay/callbacks/selfplay_api.py +++ b/openrl/selfplay/callbacks/selfplay_api.py @@ -50,7 +50,7 @@ def _init_callback(self) -> None: ) self.bind = SelfplayAPIServer.bind() - serve.run(self.bind) + serve.run(self.bind, route_prefix="/selfplay") success = False try_time = 10 while not success: diff --git a/openrl/selfplay/selfplay_api/selfplay_api.py b/openrl/selfplay/selfplay_api/selfplay_api.py index 2c346b46..307c4fcc 100644 --- a/openrl/selfplay/selfplay_api/selfplay_api.py +++ b/openrl/selfplay/selfplay_api/selfplay_api.py @@ -33,7 +33,7 @@ from openrl.selfplay.selfplay_api.opponent_model import BattleResult -@serve.deployment(route_prefix="/selfplay") +@serve.deployment() @serve.ingress(app) class SelfplayAPIServer(BaseSelfplayAPIServer): @app.post("/set_sample_strategy") diff --git a/setup.py b/setup.py index 7c8f31a5..043a7267 100644 --- a/setup.py +++ b/setup.py @@ -70,9 +70,14 @@ def get_extra_requires() -> dict: "datasets==2.13", "evaluate", ], - "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], + "selfplay": [ + "ray[default]>=2.7", + "ray[serve]", + "pettingzoo[classic]", + "trueskill", + ], "selfplay_test": [ - "ray[default]", + "ray[default]>=2.7", "ray[serve]", "fastapi", "pettingzoo[mpe]", diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 34d28fc3..a67ea964 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -34,7 +34,7 @@ def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) cfg.selfplay_api.port = request.param["port"] - print("port:",request.param["port"]) + print("port:", request.param["port"]) for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ From 714a9ec6eae5b988d2a978836a2fea4d92aa37ce Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 14:27:53 +0800 Subject: [PATCH 8/8] add selfplay test --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 043a7267..28cffd3c 100644 --- a/setup.py +++ b/setup.py @@ -73,12 +73,14 @@ def get_extra_requires() -> dict: "selfplay": [ "ray[default]>=2.7", "ray[serve]", + "async_timeout", "pettingzoo[classic]", "trueskill", ], "selfplay_test": [ "ray[default]>=2.7", "ray[serve]", + "async_timeout", "fastapi", "pettingzoo[mpe]", "pettingzoo[butterfly]",