From b08d096bfa69e2a7bb8c903c3a4e8fb466f8f11d Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Thu, 23 Nov 2023 14:51:34 +0800
Subject: [PATCH 1/8] add selfplay training test and remove selfplay strategies module

---
 openrl/selfplay/opponents/utils.py            |   3 +
 openrl/selfplay/strategies/__init__.py        |  41 --
 openrl/selfplay/strategies/base_strategy.py   |  39 --
 openrl/selfplay/strategies/strategies.py      | 413 ------------------
 tests/test_selfplay/test_selfplay_strategy.py |  91 ----
 tests/test_selfplay/test_train_selfplay.py    | 120 +++++
 6 files changed, 123 insertions(+), 584 deletions(-)
 delete mode 100644 openrl/selfplay/strategies/__init__.py
 delete mode 100644 openrl/selfplay/strategies/base_strategy.py
 delete mode 100644 openrl/selfplay/strategies/strategies.py
 delete mode 100644 tests/test_selfplay/test_selfplay_strategy.py
 create mode 100644 tests/test_selfplay/test_train_selfplay.py

diff --git a/openrl/selfplay/opponents/utils.py b/openrl/selfplay/opponents/utils.py
index d1d983d5..42ddbb2b 100644
--- a/openrl/selfplay/opponents/utils.py
+++ b/openrl/selfplay/opponents/utils.py
@@ -28,6 +28,9 @@
 
 
 def check_opponent_template(opponent_template: Union[str, Path]):
+    assert isinstance(opponent_template, Path) or isinstance(
+        opponent_template, str
+    ), f"opponent_template {opponent_template} must be a Path or str"
     if isinstance(opponent_template, str):
         opponent_template = Path(opponent_template)
     assert (
diff --git a/openrl/selfplay/strategies/__init__.py b/openrl/selfplay/strategies/__init__.py
deleted file mode 100644
index 2908f8b4..00000000
--- a/openrl/selfplay/strategies/__init__.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright 2023 The OpenRL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""""""
-from openrl.selfplay.strategies.strategies import (
-    NaiveSelfplayStrategy,
-    OnlyLatestSelfplayStrategy,
-    VarExistEnemySelfplayStrategy,
-    WeightExistEnemySelfplayStrategy,
-    WeightSelfplayStrategy,
-    WinRateSelfplayStrategy,
-)
-
-
-def make_strategy(strategy_name):
-    if strategy_name == "Naive":
-        selfplay_strategy = NaiveSelfplayStrategy
-    elif strategy_name == "OnlyLatest":
-        selfplay_strategy = OnlyLatestSelfplayStrategy
-    elif strategy_name == "Weight":
-        selfplay_strategy = WeightSelfplayStrategy
-    elif strategy_name == "WinRate":
-        selfplay_strategy = WinRateSelfplayStrategy
-    elif strategy_name == "VarExistEnemy":
-        selfplay_strategy = VarExistEnemySelfplayStrategy
-    elif strategy_name == "WeightExistEnemy":
-        selfplay_strategy = WeightExistEnemySelfplayStrategy
-    return selfplay_strategy
diff --git a/openrl/selfplay/strategies/base_strategy.py b/openrl/selfplay/strategies/base_strategy.py
deleted file mode 100644
index 4e280b13..00000000
--- a/openrl/selfplay/strategies/base_strategy.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from abc import abstractmethod
-
-
-class BaseSelfplayStrategy:
-    @abstractmethod
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        raise NotImplementedError
-
-    @abstractmethod
-    def getcnt(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def update_enemy_ids(self, new_enemy_ids):
-        raise NotImplementedError
-
-    @abstractmethod
-    def restore(self, model_dir):
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_qlist(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def update_weight(self, enemy_loses):
-        raise NotImplementedError
-
-    @abstractmethod
-    def update_win_rate(self, dones, enemy_wins):
-        raise NotImplementedError
-
-    @abstractmethod
-    def push_newone(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_plist(self):
-        raise NotImplementedError
diff --git a/openrl/selfplay/strategies/strategies.py b/openrl/selfplay/strategies/strategies.py
deleted file mode 100644
index 28e492ec..00000000
--- a/openrl/selfplay/strategies/strategies.py
+++ /dev/null
@@ -1,413 +0,0 @@
-import json
-
-import numpy as np
-
-from openrl.selfplay.strategies.base_strategy import BaseSelfplayStrategy
-
-
-class SelfplayStrategy(BaseSelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        # qlist和history_cnt的数据结构
-        self.all_args = all_args
-        self.qlist = []
-        self.history_cnt = 0
-        self.enemy_ids = [0] * nenvs
-        self.length = nenvs
-
-    def getcnt(self):
-        return self.history_cnt
-
-    def update_enemy_ids(self, new_enemy_ids):
-        self.enemy_ids = new_enemy_ids
-
-    def restore(self, model_dir):
-        with open(model_dir + "/enemy_history_info.json") as f_obj:
-            enemy_info = json.load(f_obj)
-        self.qlist = enemy_info["qlist"]
-        self.history_cnt = enemy_info["history_cnt"]
-
-    def get_qlist(self):
-        return self.qlist
-
-    def update_weight(self, enemy_loses):
-        pass
-
-    def update_win_rate(self, dones, enemy_wins):
-        pass
-
-    def push_newone(self):
-        pass
-
-
-class RatioSelfplayStrategy(SelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(RatioSelfplayStrategy, self).__init__(all_args, nenvs)
-
-    def push_newone(self):
-        self.history_cnt += 1
-
-    def get_plist(self):
-        if self.history_cnt == 1:
-            return [1]
-        temp_plist = np.logspace(
-            0, self.history_cnt - 1, self.history_cnt, endpoint=True, base=1.5
-        )
-        temp_plist[-1] = sum(temp_plist[:-1]) * 4
-        temp_plist /= sum(temp_plist)
-        return temp_plist
-
-
-class NaiveSelfplayStrategy(SelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(NaiveSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num)
-
-    def push_newone(self):
-        self.history_cnt += 1
-
-    def get_plist(self):
-        return [1] * (self.history_cnt - 1) + [4 * (self.history_cnt - 1)]
-
-    def save_new_one(self):
-        return True
-
-
-class OnlyLatestSelfplayStrategy(SelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(OnlyLatestSelfplayStrategy, self).__init__(
-            all_args, nenvs, exist_enemy_num
-        )
-        self.play_list = []
-        self.max_play_num = all_args.max_play_num
-        self.least_win_rate = all_args.least_win_rate
-
-    def push_newone(self):
-        self.play_list.append([])
-        self.history_cnt += 1
-
-    def get_plist(self):
-        return [0] * (self.history_cnt - 1) + [1]
-
-    def save_new_one(self, least_win_rate):
-        if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * (
-            len(self.play_list[-1]) + 1
-        ) and len(self.play_list[-1]) >= (self.max_play_num - 10):
-            return True
-
-    def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids):
-        for win_enemy_id in win_enemy_ids:
-            self.play_list[win_enemy_id].append(1)
-        for tie_enemy_id in tie_enemy_ids:
-            self.play_list[tie_enemy_id].append(0)
-        for lose_enemy_id in lose_enemy_ids:
-            self.play_list[lose_enemy_id].append(-1)
-        self.cut_overflow()
-
-    def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses):
-        win_enemy_ids = np.array(self.enemy_ids)[enemy_wins]
-        tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties]
-        lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses]
-        self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids)
-
-    def cut_overflow(self):
-        for index in range(len(self.play_list)):
-            if len(self.play_list[index]) > self.max_play_num:
-                self.play_list[index] = self.play_list[index][
-                    (-1) * self.max_play_num :
-                ]
-
-    def get_info_list(self, info_list):
-        return_info = []
-        for info in info_list:
-            if info == "win":
-                equal_num = 1
-            elif info == "tie":
-                equal_num = 0
-            elif info == "lose":
-                equal_num = -1
-            num_list = []
-            for enemy_play_list in self.play_list:
-                if info == "play":
-                    num_list.append(len(enemy_play_list))
-                else:
-                    num_list.append(int(sum(np.array(enemy_play_list) == equal_num)))
-            return_info.append(num_list)
-        return tuple(return_info)
-
-    def get_enemy_play_dict(self):
-        win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list(
-            ["win", "tie", "lose", "play"]
-        )
-        return {
-            "win_num_list": list(win_num_list),
-            "tie_num_list": list(tie_num_list),
-            "lose_num_list": list(lose_num_list),
-            "play_num_list": list(play_num_list),
-        }
-
-
-class WeightSelfplayStrategy(SelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(WeightSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num)
-        self.recent_weight = 0.8
-        self.recent_num = 3
-        self.gama = 1 / (nenvs)
-
-    def push_newone(self):
-        self.history_cnt += 1
-        if self.history_cnt <= self.recent_num:
-            return
-        elif self.history_cnt == self.recent_num + 1:
-            self.qlist = [1]
-        else:
-            self.qlist.append(max(self.qlist))
-
-    def get_plist(self):
-        temp_plist = np.zeros([self.history_cnt])
-        temp_plist[: (-1 * self.recent_num)] = (
-            np.exp(self.qlist) / sum(np.exp(self.qlist)) * (1 - self.recent_weight)
-        )
-        temp_plist[(-1 * self.recent_num) :] = self.recent_weight / self.recent_num
-        return temp_plist
-
-    def update_weight(self, enemy_loses):
-        if self.history_cnt < self.recent_num + 2:
-            return
-        lose_enemy_ids = np.array(self.enemy_ids)[
-            enemy_loses
-        ]  # 输了的enemy_ids,进行更新,其中可能有重复的enemy_id
-        for enemy_id in lose_enemy_ids:
-            if enemy_id <= len(self.qlist) - 1:
-                divide_num = (
-                    len(self.qlist)
-                    * np.exp(self.qlist[enemy_id])
-                    / sum(np.exp(self.qlist))
-                )
-                next_weight = self.qlist[enemy_id] - self.gama / divide_num
-                self.qlist[enemy_id] = next_weight
-
-
-class WinRateSelfplayStrategy(SelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(WinRateSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num)
-        self.max_play_num = all_args.max_play_num
-        self.play_list = (
-            []
-        )  # 在该list中，每个对手维护一个长度不超过max_play_num的列表，1为该对手获胜, 0为平, -1为我方获胜
-        self.recent_list = []
-        self.recent_list_max_len = all_args.recent_list_max_len
-        self.latest_weight = all_args.latest_weight
-        self.least_win_rate = all_args.least_win_rate
-        self.stage2_least_win_rate = all_args.least_win_rate
-        self.stage = 1
-        self.newest_pos = all_args.newest_pos
-        self.newest_weight = all_args.newest_weight
-
-    def push_newone(self):
-        self.play_list.append([])
-        self.history_cnt += 1
-
-    def get_info_list(self, info_list):
-        return_info = []
-        for info in info_list:
-            if info == "win":
-                equal_num = 1
-            elif info == "tie":
-                equal_num = 0
-            elif info == "lose":
-                equal_num = -1
-            num_list = []
-            for enemy_play_list in self.play_list:
-                if info == "play":
-                    num_list.append(len(enemy_play_list))
-                else:
-                    num_list.append(int(sum(np.array(enemy_play_list) == equal_num)))
-            return_info.append(num_list)
-        return tuple(return_info)
-
-    def get_plist(self):
-        def f_hard(win_rate_list):
-            p = 1
-            return win_rate_list**p
-
-        def f_var(win_rate_list):
-            return (1 - win_rate_list) * win_rate_list
-
-        win_num_list, tie_num_list, play_num_list = self.get_info_list(
-            ["win", "tie", "play"]
-        )
-        win_rate_list = (
-            np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5
-        ) / (np.array(play_num_list) + 1)
-        return f_hard(win_rate_list)
-
-    def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids):
-        if self.stage == 2:
-            win_enemy_num = (np.array(win_enemy_ids) != self.newest_pos).sum()
-            tie_enemy_num = (np.array(tie_enemy_ids) != self.newest_pos).sum()
-            lose_enemy_num = (np.array(lose_enemy_ids) != self.newest_pos).sum()
-            self.recent_list += (
-                [1] * win_enemy_num + [0] * tie_enemy_num + [-1] * lose_enemy_num
-            )
-        for win_enemy_id in win_enemy_ids:
-            self.play_list[win_enemy_id].append(1)
-        for tie_enemy_id in tie_enemy_ids:
-            self.play_list[tie_enemy_id].append(0)
-        for lose_enemy_id in lose_enemy_ids:
-            self.play_list[lose_enemy_id].append(-1)
-        self.cut_overflow()
-
-    def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses):
-        win_enemy_ids = np.array(self.enemy_ids)[enemy_wins]
-        tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties]
-        lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses]
-        self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids)
-
-    def restore(self, model_dir):
-        with open(model_dir + "/enemy_history_info.json") as f_obj:
-            enemy_info = json.load(f_obj)
-        self.history_cnt = enemy_info["history_cnt"]
-        self.play_list = enemy_info["play_list"]
-
-    def get_enemy_play_dict(self):
-        win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list(
-            ["win", "tie", "lose", "play"]
-        )
-        return {
-            "win_num_list": list(win_num_list),
-            "tie_num_list": list(tie_num_list),
-            "lose_num_list": list(lose_num_list),
-            "play_num_list": list(play_num_list),
-        }
-
-    def update_win_info(self, data):
-        win_enemy_ids, tie_enemy_ids, lose_enemy_ids = (
-            data["win_enemy_ids"],
-            data["tie_enemy_ids"],
-            data["lose_enemy_ids"],
-        )
-        self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids)
-
-    def cut_overflow(self):
-        for index in range(len(self.play_list)):
-            if len(self.play_list[index]) > self.max_play_num:
-                self.play_list[index] = self.play_list[index][
-                    (-1) * self.max_play_num :
-                ]
-        if len(self.recent_list) > self.recent_list_max_len:
-            self.recent_list = self.recent_list[(-1) * self.recent_list_max_len :]
-
-    def save_new_one(self, least_win_rate):
-        if self.stage == 1:
-            if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * (
-                len(self.play_list[-1]) + 1
-            ) and len(self.play_list[-1]) >= (self.max_play_num - 10):
-                if self.getcnt() - self.all_args.exist_enemy_num == 1:
-                    return True
-                self.stage = 2
-                print("switch to stage 2")
-        if self.stage == 2:
-            if sum(np.array(self.recent_list) == -1) >= self.stage2_least_win_rate * (
-                len(self.recent_list) + 1
-            ) and len(self.recent_list) >= (self.recent_list_max_len - 10):
-                self.stage = 1
-                self.recent_list = []
-                return True
-        return False
-
-
-class ExistEnemySelfplayStrategy(WinRateSelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(ExistEnemySelfplayStrategy, self).__init__(
-            all_args, nenvs, exist_enemy_num
-        )
-        self.all_args = all_args
-        self.enemy_ids = [0] * nenvs  # 第一个step就会更新，所以初始化无所谓
-        # 列表的前exist_enemy_num个为已存在的对手
-        if exist_enemy_num > 0:
-            self.play_list = [[]] * exist_enemy_num
-        self.history_cnt = exist_enemy_num
-        self.exist_enemy_num = exist_enemy_num
-        self.max_enemy_num = all_args.max_enemy_num
-
-    def get_final_plist(self, f_hard, f_var):
-        raise NotImplementedError
-
-    def get_plist(self):
-        def f_hard(win_rate_list):
-            p = 2
-            return win_rate_list**p
-
-        def f_var(win_rate_list):
-            return (1 - win_rate_list) * win_rate_list
-
-        plist = self.get_final_plist(f_hard, f_var)
-        if self.max_enemy_num != -1:
-            if self.history_cnt - self.exist_enemy_num > self.max_enemy_num:
-                mask_index = np.array(
-                    list(
-                        range(
-                            self.exist_enemy_num, self.history_cnt - self.max_enemy_num
-                        )
-                    )
-                )
-                zero_vec = np.zeros(
-                    self.history_cnt - self.exist_enemy_num - self.max_enemy_num
-                )
-                plist[mask_index] = zero_vec
-
-        return plist
-
-
-class VarExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(VarExistEnemySelfplayStrategy, self).__init__(
-            all_args, nenvs, exist_enemy_num
-        )
-
-    def get_final_plist(self, f_hard, f_var):
-        win_num_list, tie_num_list, play_num_list = self.get_info_list(
-            ["win", "tie", "play"]
-        )
-        win_rate_list = (
-            np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5
-        ) / (np.array(play_num_list) + 1)
-        win_rate_list = f_var(win_rate_list)
-
-        return win_rate_list
-
-
-class WeightExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy):
-    def __init__(self, all_args, nenvs, exist_enemy_num):
-        super(WeightExistEnemySelfplayStrategy, self).__init__(
-            all_args, nenvs, exist_enemy_num
-        )
-
-    def get_final_plist(self, f_hard, f_var):
-        win_num_list, tie_num_list, play_num_list = self.get_info_list(
-            ["win", "tie", "play"]
-        )
-        win_rate_list = (
-            np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5
-        ) / (np.array(play_num_list) + 1)
-
-        if self.stage == 1:
-            win_rate_list = f_hard(win_rate_list)[:-1]
-            # if self.newest_pos != -1:
-            #     win_rate_list[self.newest_pos] = 0
-            win_rate_list = (
-                win_rate_list / (sum(win_rate_list) + 1e-8) * (1 - self.latest_weight)
-            )
-            return list(win_rate_list) + [self.latest_weight]
-        elif self.stage == 2:
-            win_rate_list = f_hard(win_rate_list)
-            if self.newest_pos != -1:
-                win_rate_list[self.newest_pos] = self.newest_weight
-                index_without_newest = list(range(self.history_cnt))
-                index_without_newest.remove(self.newest_pos)
-                win_rate_list[index_without_newest] /= sum(
-                    win_rate_list[index_without_newest]
-                )
-                win_rate_list[index_without_newest] *= 1 - self.newest_weight
-            else:
-                win_rate_list /= sum(win_rate_list)
-            return win_rate_list
diff --git a/tests/test_selfplay/test_selfplay_strategy.py b/tests/test_selfplay/test_selfplay_strategy.py
deleted file mode 100644
index 61b04052..00000000
--- a/tests/test_selfplay/test_selfplay_strategy.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright 2023 The OpenRL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""""""
-import os
-import sys
-
-import pytest
-
-from openrl.selfplay.strategies import (
-    NaiveSelfplayStrategy,
-    OnlyLatestSelfplayStrategy,
-    VarExistEnemySelfplayStrategy,
-    WeightExistEnemySelfplayStrategy,
-    WeightSelfplayStrategy,
-    WinRateSelfplayStrategy,
-)
-
-
-@pytest.fixture(scope="module", params=[""])
-def config(request):
-    from openrl.configs.config import create_config_parser
-
-    cfg_parser = create_config_parser()
-    cfg = cfg_parser.parse_args(request.param.split())
-    return cfg
-
-
-@pytest.mark.unittest
-def test_naive_selfplay(config):
-    strategy = NaiveSelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-    strategy.update_win_rate(dones=True, enemy_wins=1)
-    strategy.push_newone()
-
-
-@pytest.mark.unittest
-def test_only_latest_selfplay(config):
-    strategy = OnlyLatestSelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-    strategy.push_newone()
-
-
-@pytest.mark.unittest
-def test_weight_selfplay(config):
-    strategy = WeightSelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-    strategy.push_newone()
-
-
-@pytest.mark.unittest
-def test_win_rate_selfplay(config):
-    strategy = WinRateSelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-
-
-@pytest.mark.unittest
-def test_var_exist_enemy_selfplay(config):
-    strategy = VarExistEnemySelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-    strategy.push_newone()
-
-
-@pytest.mark.unittest
-def test_weight_exist_enemy_selfplay(config):
-    strategy = WeightExistEnemySelfplayStrategy(config, 1, 1)
-    strategy.get_plist()
-    strategy.update_weight(enemy_loses=1)
-    strategy.push_newone()
-
-
-if __name__ == "__main__":
-    sys.exit(pytest.main(["-sv", os.path.basename(__file__)]))
diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
new file mode 100644
index 00000000..9e7b501f
--- /dev/null
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -0,0 +1,120 @@
+import os
+import sys
+
+import numpy as np
+import pytest
+import torch
+
+from openrl.configs.config import create_config_parser
+from openrl.envs.common import make
+from openrl.envs.wrappers import FlattenObservation
+from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner
+from openrl.modules.common import PPONet as Net
+from openrl.runners.common import PPOAgent as Agent
+from openrl.selfplay.wrappers.opponent_pool_wrapper import OpponentPoolWrapper
+from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        "RandomOpponent",
+        "LastOpponent",
+    ],
+)
+def config(request):
+    cfg_parser = create_config_parser()
+    cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"])
+    for i, c in enumerate(cfg.callbacks):
+        if c["id"] == "SelfplayCallback":
+            c["args"][
+                "opponent_template"
+            ] = "./examples/selfplay/opponent_templates/tictactoe_opponent"
+            cfg.callbacks[i] = c
+        elif c["id"] == "SelfplayAPI":
+            c["args"]["sample_strategy"] = request.param
+            cfg.callbacks[i] = c
+        else:
+            pass
+
+    return cfg
+
+
+def train(cfg):
+    # Create environment
+    env_num = 2
+    render_model = None
+    env = make(
+        "tictactoe_v3",
+        render_mode=render_model,
+        env_num=env_num,
+        asynchronous=True,
+        opponent_wrappers=[RecordWinner, OpponentPoolWrapper],
+        env_wrappers=[FlattenObservation],
+        cfg=cfg,
+    )
+    # Create neural network
+
+    net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu")
+    # Create agent
+    agent = Agent(net)
+    # Begin training
+    agent.train(total_time_steps=100)
+    env.close()
+    agent.save("./selfplay_agent/")
+    return agent
+
+
+def evaluation():
+    from examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender
+
+    print("Evaluation...")
+    env_num = 1
+    env = make(
+        "tictactoe_v3",
+        env_num=env_num,
+        asynchronous=True,
+        opponent_wrappers=[TictactoeRender, RandomOpponentWrapper],
+        env_wrappers=[FlattenObservation],
+        auto_reset=False,
+    )
+
+    cfg_parser = create_config_parser()
+    cfg = cfg_parser.parse_args([])
+    net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu")
+
+    agent = Agent(net)
+
+    agent.load("./selfplay_agent/")
+    agent.set_env(env)
+    env.reset(seed=0)
+
+    total_reward = 0.0
+    ep_num = 2
+    for ep_now in range(ep_num):
+        obs, info = env.reset()
+        done = False
+        step = 0
+
+        while not np.any(done):
+            # predict next action based on the observation
+            action, _ = agent.act(obs, info, deterministic=True)
+            obs, r, done, info = env.step(action)
+            step += 1
+
+            if np.any(done):
+                total_reward += np.mean(r) > 0
+                print(f"{ep_now}/{ep_num}: reward: {np.mean(r)}")
+    print(f"win rate: {total_reward/ep_num}")
+    env.close()
+    print("Evaluation finished.")
+
+
+@pytest.mark.unittest
+def test_train_selfplay(config):
+    train(config)
+    evaluation()
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-sv", os.path.basename(__file__)]))

From 7373b044228d9b34b621730076973dde5e55c98d Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Thu, 23 Nov 2023 15:16:45 +0800
Subject: [PATCH 2/8] selfplay test: train for 20 steps and drop TictactoeRender from evaluation

---
 tests/test_selfplay/test_train_selfplay.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index 9e7b501f..c2ae29be 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -59,22 +59,20 @@ def train(cfg):
     # Create agent
     agent = Agent(net)
     # Begin training
-    agent.train(total_time_steps=100)
+    agent.train(total_time_steps=20)
     env.close()
     agent.save("./selfplay_agent/")
     return agent
 
 
 def evaluation():
-    from examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender
-
     print("Evaluation...")
     env_num = 1
     env = make(
         "tictactoe_v3",
         env_num=env_num,
         asynchronous=True,
-        opponent_wrappers=[TictactoeRender, RandomOpponentWrapper],
+        opponent_wrappers=[RandomOpponentWrapper],
         env_wrappers=[FlattenObservation],
         auto_reset=False,
     )

From 7ded5d55c8b2793bbf9624edebbe9d2d64857a4d Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Thu, 23 Nov 2023 16:25:25 +0800
Subject: [PATCH 3/8] selfplay test: set API port per param, shut down serve, add test deps

---
 examples/selfplay/selfplay.yaml            |  2 +-
 openrl/selfplay/callbacks/selfplay_api.py  |  6 +++++-
 setup.py                                   | 10 ++++++++--
 tests/test_selfplay/test_train_selfplay.py | 14 +++++++++++---
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/examples/selfplay/selfplay.yaml b/examples/selfplay/selfplay.yaml
index 7a7c1bbe..8a05611d 100644
--- a/examples/selfplay/selfplay.yaml
+++ b/examples/selfplay/selfplay.yaml
@@ -1,6 +1,6 @@
 globals:
   selfplay_api_host: 127.0.0.1
-  selfplay_api_port: 10086
+  selfplay_api_port: 13486
 
 seed: 0
 selfplay_api:
diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py
index 3d148749..cdf9d04d 100644
--- a/openrl/selfplay/callbacks/selfplay_api.py
+++ b/openrl/selfplay/callbacks/selfplay_api.py
@@ -57,7 +57,10 @@ def _init_callback(self) -> None:
             success = self.api_client.set_sample_strategy(self.sample_strategy)
             try_time -= 1
             if try_time <= 0:
-                raise RuntimeError("Failed to set sample strategy.")
+                raise RuntimeError(
+                    f"Failed to set sample strategy: {self.sample_strategy}. host:"
+                    f" {self.host}, port: {self.port}"
+                )
 
     def _on_step(self) -> bool:
         # print("To send request to API server.")
@@ -72,5 +75,6 @@ def _on_training_end(self) -> None:
             print(f"deleting {application_name}")
         serve.delete(application_name)
         del self.bind
+        serve.shutdown()
         if self.verbose >= 2:
             print(f"delete {application_name} done!")
diff --git a/setup.py b/setup.py
index 84da342e..7c8f31a5 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
 def get_install_requires() -> list:
     return [
         "setuptools>=67.0",
-        "gymnasium",
+        "gymnasium>=0.29",
         "click",
         "termcolor",
         "gym",
@@ -71,7 +71,13 @@ def get_extra_requires() -> dict:
             "evaluate",
         ],
         "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"],
-        "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"],
+        "selfplay_test": [
+            "ray[default]",
+            "ray[serve]",
+            "fastapi",
+            "pettingzoo[mpe]",
+            "pettingzoo[butterfly]",
+        ],
         "retro": ["gym-retro"],
         "super_mario": ["gym-super-mario-bros"],
         "atari": ["gymnasium[atari]", "gymnasium[accept-rom-license]"],
diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index c2ae29be..7e440bad 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pytest
+import ray
 import torch
 
 from openrl.configs.config import create_config_parser
@@ -18,22 +19,29 @@
 @pytest.fixture(
     scope="module",
     params=[
-        "RandomOpponent",
-        "LastOpponent",
+        {"port": 13486, "strategy": "RandomOpponent"},
+        {"port": 13487, "strategy": "LastOpponent"},
     ],
 )
 def config(request):
     cfg_parser = create_config_parser()
     cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"])
+    cfg.selfplay_api.port = request.param["port"]
     for i, c in enumerate(cfg.callbacks):
         if c["id"] == "SelfplayCallback":
             c["args"][
                 "opponent_template"
             ] = "./examples/selfplay/opponent_templates/tictactoe_opponent"
+            port = c["args"]["api_address"].split(":")[-1].split("/")[0]
+            c["args"]["api_address"] = c["args"]["api_address"].replace(
+                port, str(request.param["port"])
+            )
             cfg.callbacks[i] = c
         elif c["id"] == "SelfplayAPI":
-            c["args"]["sample_strategy"] = request.param
+            c["args"]["sample_strategy"] = request.param["strategy"]
+            c["args"]["port"] = request.param["port"]
             cfg.callbacks[i] = c
+
         else:
             pass
 

From 2bd465841e915f1fc0b2a462f93790ed9b411369 Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Thu, 23 Nov 2023 16:36:24 +0800
Subject: [PATCH 4/8] selfplay test: drop LastOpponent param and ray import, train for 10 steps

---
 tests/test_selfplay/test_train_selfplay.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index 7e440bad..9af75a1c 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 import pytest
-import ray
 import torch
 
 from openrl.configs.config import create_config_parser
@@ -20,7 +19,6 @@
     scope="module",
     params=[
         {"port": 13486, "strategy": "RandomOpponent"},
-        {"port": 13487, "strategy": "LastOpponent"},
     ],
 )
 def config(request):
@@ -67,7 +65,7 @@ def train(cfg):
     # Create agent
     agent = Agent(net)
     # Begin training
-    agent.train(total_time_steps=20)
+    agent.train(total_time_steps=10)
     env.close()
     agent.save("./selfplay_agent/")
     return agent

From 17cf742192511b40550e48265e3a2468df586150 Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Thu, 23 Nov 2023 16:54:04 +0800
Subject: [PATCH 5/8] add selfplay test

---
 tests/test_selfplay/test_train_selfplay.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index 9af75a1c..bdeb40c1 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -1,4 +1,5 @@
 import os
+import socket
 import sys
 
 import numpy as np
@@ -15,10 +16,17 @@
 from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper
 
 
+def find_free_port():  # returns a TCP port number the OS reports as free
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # set before bind
+        s.bind(("", 0))  # port 0: let the OS choose an unused port
+        return s.getsockname()[1]  # socket is closed on exit; only the number is kept
+
+
 @pytest.fixture(
     scope="module",
     params=[
-        {"port": 13486, "strategy": "RandomOpponent"},
+        {"port": find_free_port(), "strategy": "RandomOpponent"},
     ],
 )
 def config(request):
@@ -54,7 +62,7 @@ def train(cfg):
         "tictactoe_v3",
         render_mode=render_model,
         env_num=env_num,
-        asynchronous=True,
+        asynchronous=False,
         opponent_wrappers=[RecordWinner, OpponentPoolWrapper],
         env_wrappers=[FlattenObservation],
         cfg=cfg,

From 434495499708752f0d32fb519e6fbf9f2ad63110 Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Fri, 24 Nov 2023 13:02:32 +0800
Subject: [PATCH 6/8] add selfplay test

---
 .github/workflows/unit_test.yml            | 2 +-
 tests/test_selfplay/test_train_selfplay.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
index e327cdf5..9671f935 100644
--- a/.github/workflows/unit_test.yml
+++ b/.github/workflows/unit_test.yml
@@ -31,7 +31,7 @@ jobs:
       - name: do_unittest
         timeout-minutes: 40
         run: |
-          xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes
+          xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes -s
       - name: Upload coverage reports to Codecov with GitHub Action
         uses: codecov/codecov-action@v3
         with:
diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index bdeb40c1..34d28fc3 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -27,12 +27,14 @@ def find_free_port():
     scope="module",
     params=[
         {"port": find_free_port(), "strategy": "RandomOpponent"},
+        {"port": find_free_port(), "strategy": "LastOpponent"},
     ],
 )
 def config(request):
     cfg_parser = create_config_parser()
     cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"])
     cfg.selfplay_api.port = request.param["port"]
+    print("port:",request.param["port"])
     for i, c in enumerate(cfg.callbacks):
         if c["id"] == "SelfplayCallback":
             c["args"][

From a702e8d93f9e9f2a6b064ec32b7e92d7efd850b1 Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Fri, 24 Nov 2023 13:28:51 +0800
Subject: [PATCH 7/8] add selfplay test

---
 openrl/selfplay/callbacks/selfplay_api.py    | 2 +-
 openrl/selfplay/selfplay_api/selfplay_api.py | 2 +-
 setup.py                                     | 9 +++++++--
 tests/test_selfplay/test_train_selfplay.py   | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py
index cdf9d04d..e2214ecb 100644
--- a/openrl/selfplay/callbacks/selfplay_api.py
+++ b/openrl/selfplay/callbacks/selfplay_api.py
@@ -50,7 +50,7 @@ def _init_callback(self) -> None:
         )
 
         self.bind = SelfplayAPIServer.bind()
-        serve.run(self.bind)
+        serve.run(self.bind, route_prefix="/selfplay")
         success = False
         try_time = 10
         while not success:
diff --git a/openrl/selfplay/selfplay_api/selfplay_api.py b/openrl/selfplay/selfplay_api/selfplay_api.py
index 2c346b46..307c4fcc 100644
--- a/openrl/selfplay/selfplay_api/selfplay_api.py
+++ b/openrl/selfplay/selfplay_api/selfplay_api.py
@@ -33,7 +33,7 @@
 from openrl.selfplay.selfplay_api.opponent_model import BattleResult
 
 
-@serve.deployment(route_prefix="/selfplay")
+@serve.deployment()
 @serve.ingress(app)
 class SelfplayAPIServer(BaseSelfplayAPIServer):
     @app.post("/set_sample_strategy")
diff --git a/setup.py b/setup.py
index 7c8f31a5..043a7267 100644
--- a/setup.py
+++ b/setup.py
@@ -70,9 +70,14 @@ def get_extra_requires() -> dict:
             "datasets==2.13",
             "evaluate",
         ],
-        "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"],
+        "selfplay": [
+            "ray[default]>=2.7",
+            "ray[serve]",
+            "pettingzoo[classic]",
+            "trueskill",
+        ],
         "selfplay_test": [
-            "ray[default]",
+            "ray[default]>=2.7",
             "ray[serve]",
             "fastapi",
             "pettingzoo[mpe]",
diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py
index 34d28fc3..a67ea964 100644
--- a/tests/test_selfplay/test_train_selfplay.py
+++ b/tests/test_selfplay/test_train_selfplay.py
@@ -34,7 +34,7 @@ def config(request):
     cfg_parser = create_config_parser()
     cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"])
     cfg.selfplay_api.port = request.param["port"]
-    print("port:",request.param["port"])
+    print("port:", request.param["port"])
     for i, c in enumerate(cfg.callbacks):
         if c["id"] == "SelfplayCallback":
             c["args"][

From 714a9ec6eae5b988d2a978836a2fea4d92aa37ce Mon Sep 17 00:00:00 2001
From: huangshiyu <huangsy1314@163.com>
Date: Fri, 24 Nov 2023 14:27:53 +0800
Subject: [PATCH 8/8] add selfplay test

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index 043a7267..28cffd3c 100644
--- a/setup.py
+++ b/setup.py
@@ -73,12 +73,14 @@ def get_extra_requires() -> dict:
         "selfplay": [
             "ray[default]>=2.7",
             "ray[serve]",
+            "async_timeout",
             "pettingzoo[classic]",
             "trueskill",
         ],
         "selfplay_test": [
             "ray[default]>=2.7",
             "ray[serve]",
+            "async_timeout",
             "fastapi",
             "pettingzoo[mpe]",
             "pettingzoo[butterfly]",