Merge pull request #19 from CarperAI/release
Small fixes
jsuarez5341 committed Sep 15, 2023
2 parents 8d223cc + bb93bf9 commit e3a1aae
Showing 10 changed files with 175 additions and 61 deletions.
54 changes: 54 additions & 0 deletions curriculum_generation/sample_evaluation_task.py
@@ -0,0 +1,54 @@
"""Manual test for creating learning curriculum manually"""
# pylint: disable=invalid-name,redefined-outer-name,bad-builtin
# pylint: disable=wildcard-import,unused-wildcard-import
from typing import List

from nmmo.task import constraint as c
from nmmo.task.base_predicates import AttainSkill, CountEvent, EarnGold, TickGE
from nmmo.task.task_spec import TaskSpec

curriculum: List[TaskSpec] = []

# Stay alive as long as possible
curriculum.append(
TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})
)

# Perform these 10 times
essential_skills = [
"EAT_FOOD",
"DRINK_WATER",
"SCORE_HIT",
"PLAYER_KILL",
"HARVEST_ITEM",
"EQUIP_ITEM",
"CONSUME_ITEM",
"LEVEL_UP",
"EARN_GOLD",
"LIST_ITEM",
"BUY_ITEM",
"GIVE_ITEM",
"DESTROY_ITEM",
"GIVE_GOLD",
]
for event_code in essential_skills:
curriculum.append(
TaskSpec(
eval_fn=CountEvent,
eval_fn_kwargs={"event": event_code, "N": 10},
)
)

# Reach skill level 10
for skill in c.combat_skills + c.harvest_skills:
curriculum.append(
TaskSpec(
eval_fn=AttainSkill,
eval_fn_kwargs={"skill": skill, "level": 10, "num_agent": 1},
)
)

# Earn 50 gold
curriculum.append(
TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50})
)
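
Not part of the commit: a minimal sketch of how a TaskSpec list like the one above might be serialized so it can later be referenced by a tasks_path-style setting. The file path and the use of dill are assumptions for illustration, not taken from this repository.

# Hypothetical helper (not in this commit): serialize the TaskSpec list with dill.
import dill

CURRICULUM_FILE = "curriculum_generation/custom_curriculum.pkl"  # assumed path

def save_curriculum(task_specs, path=CURRICULUM_FILE):
    """Write a list of TaskSpec objects to disk for later use as a task file."""
    with open(path, "wb") as f:
        dill.dump(task_specs, f)

# Example: save_curriculum(curriculum)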
16 changes: 9 additions & 7 deletions environment.py
@@ -33,14 +33,15 @@ def __init__(self, args: Namespace):

class Postprocessor(StatPostprocessor):
def __init__(self, env, is_multiagent, agent_id,
early_stop_agent_num=0,
sqrt_achievement_rewards=False,
heal_bonus_weight=0,
meander_bonus_weight=0,
explore_bonus_weight=0,
clip_unique_event=3,
eval_mode=False,
early_stop_agent_num=0,
sqrt_achievement_rewards=False,
heal_bonus_weight=0,
meander_bonus_weight=0,
explore_bonus_weight=0,
clip_unique_event=3,
):
super().__init__(env, agent_id)
super().__init__(env, agent_id, eval_mode)
self.early_stop_agent_num = early_stop_agent_num
self.sqrt_achievement_rewards = sqrt_achievement_rewards
self.heal_bonus_weight = heal_bonus_weight
@@ -117,6 +118,7 @@ def env_creator():
env = pufferlib.emulation.PettingZooPufferEnv(env,
postprocessor_cls=Postprocessor,
postprocessor_kwargs={
'eval_mode': args.eval_mode,
'early_stop_agent_num': args.early_stop_agent_num,
'sqrt_achievement_rewards': args.sqrt_achievement_rewards,
'heal_bonus_weight': args.heal_bonus_weight,
99 changes: 75 additions & 24 deletions evaluate.py
@@ -5,7 +5,11 @@
import time
from types import SimpleNamespace
from pathlib import Path
from collections import defaultdict
from dataclasses import asdict
from itertools import cycle

import numpy as np
import torch
import pandas as pd

@@ -54,11 +58,11 @@ def save_replays(policy_store_dir, save_dir):
args.early_stop_agent_num = 0 # run the full episode
args.resilient_population = 0 # no resilient agents

# TODO: custom models will require different policy creation functions
# NOTE: This creates a dummy learner agent. Is it necessary?
from reinforcement_learning import policy # import your policy
def make_policy(envs):
learner_policy = policy.Baseline(
envs,
envs._driver_env,
input_size=args.input_size,
hidden_size=args.hidden_size,
task_size=args.task_size
@@ -83,8 +87,10 @@ def make_policy(envs):

# Load the policies into the policy pool
evaluator.policy_pool.update_policies({
p.name: p.policy(make_policy, evaluator.buffers[0], evaluator.device)
for p in policy_store._all_policies().values()
p.name: p.policy(
policy_args=[evaluator.buffers[0]],
device=evaluator.device
) for p in list(policy_store._all_policies().values())
})

# Set up the replay helper
@@ -121,7 +127,7 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"):
file = os.path.join(policy_store_dir, ranker_file)
if os.path.exists(file):
if os.path.exists(file + ".lock"):
raise ValueError("Policy ranker file is locked.")
raise ValueError("Policy ranker file is locked. Delete the lock file.")
logging.info("Using policy ranker from %s", file)
policy_ranker = pufferlib.utils.PersistentObject(
file,
@@ -135,21 +141,34 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"):
)
return policy_ranker

def rank_policies(policy_store_dir, device):
class AllPolicySelector(pufferlib.policy_ranker.PolicySelector):
def select_policies(self, policies):
# Return all policies in alphabetical order of name.
# Loops circularly if more policies are needed than available.
loop = cycle([policies[name] for name in sorted(policies.keys())])
return [next(loop) for _ in range(self._num)]
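
As an aside (not in the diff), the cycling selection above can be illustrated on plain dictionaries, independent of pufferlib:

# Toy reproduction of AllPolicySelector's cycling behaviour on dummy data.
from itertools import cycle

def select_all(policies: dict, num: int):
    loop = cycle([policies[name] for name in sorted(policies)])
    return [next(loop) for _ in range(num)]

print(select_all({"b": "policy_b", "a": "policy_a"}, 5))
# -> ['policy_a', 'policy_b', 'policy_a', 'policy_b', 'policy_a']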

def rank_policies(policy_store_dir, eval_curriculum_file, device):
# CHECK ME: can custom models with different architectures be loaded here?
policy_store = setup_policy_store(policy_store_dir)
num_policies = len(policy_store._all_policies())
policy_ranker = create_policy_ranker(policy_store_dir)
num_policies = len(policy_store._all_policies())
policy_selector = AllPolicySelector(num_policies)

# TODO: task-condition agents when generating replays
args = SimpleNamespace(**config.Config.asdict())
args.data_dir = policy_store_dir
args.eval_mode = True
args.num_envs = 5 # sample a bit longer in each env
args.num_buffers = 1
args.learner_weight = 0 # evaluate mode
args.selfplay_num_policies = num_policies + 1
args.early_stop_agent_num = 0 # run the full episode
args.resilient_population = 0 # no resilient agents
args.tasks_path = eval_curriculum_file # task-conditioning

# TODO: custom models will require different policy creation functions
# NOTE: This creates a dummy learner agent. Is it necessary?
from reinforcement_learning import policy # import your policy
def make_policy(envs):
learner_policy = policy.Baseline(
@@ -174,17 +193,25 @@ def make_policy(envs):
num_buffers=args.num_buffers,
selfplay_learner_weight=args.learner_weight,
selfplay_num_policies=args.selfplay_num_policies,
batch_size=args.rollout_batch_size,
batch_size=args.eval_batch_size,
policy_store=policy_store,
policy_ranker=policy_ranker, # so that a new ranker is created
policy_selector=policy_selector,
)

rank_file = os.path.join(policy_store_dir, "ranking.txt")
with open(rank_file, "w") as f:
pass

while evaluator.global_step < args.train_num_steps:
evaluator.evaluate()
results = defaultdict(list)
while evaluator.global_step < args.eval_num_steps:
_, stats, infos = evaluator.evaluate()

for pol, vals in infos.items():
results[pol].extend([
e[1] for e in infos[pol]['team_results']
])

ratings = evaluator.policy_ranker.ratings()
dataframe = pd.DataFrame(
{
Expand All @@ -202,18 +229,35 @@ def make_policy(envs):
+ "\n\n"
)

# Reset the envs and start the new episodes
# NOTE: The below line will probably end the episode in the middle,
# so we won't be able to sample scores from the successful agents.
# Thus, the scores will be biased towards the agents that die early.
# Still, the numbers we get this way are better than frequently
# updating the scores because the openskill ranking only takes the mean.
#evaluator.buffers[0]._async_reset()

# CHECK ME: delete the policy_ranker lock file
Path(evaluator.policy_ranker.lock.lock_file).unlink(missing_ok=True)

evaluator.close()
for pol, res in results.items():
aggregated = {}
keys = asdict(res[0]).keys()
for k in keys:
if k == 'policy_id':
continue
aggregated[k] = np.mean([asdict(e)[k] for e in res])
results[pol] = aggregated
print('Evaluation complete. Average stats:\n', results)
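
A standalone sketch of the averaging pattern used above, with a reduced toy dataclass standing in for TeamResult:

# Average every numeric dataclass field except policy_id across episode results.
from dataclasses import dataclass, asdict
import numpy as np

@dataclass
class ToyResult:
    policy_id: int = 0
    max_task_progress: float = 0.0
    time_alive: int = 0

res = [ToyResult(1, 0.5, 100), ToyResult(1, 1.0, 300)]
aggregated = {
    k: np.mean([asdict(e)[k] for e in res])
    for k in asdict(res[0]) if k != "policy_id"
}
print(aggregated)  # {'max_task_progress': 0.75, 'time_alive': 200.0}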


if __name__ == "__main__":
"""Usage: python evaluate.py -p <policy_store_dir> -s <replay_save_dir>
-p, --policy-store-dir: Directory to load policy checkpoints from for evaluation/ranking
-s, --replay-save-dir: Directory to save replays (Default: replays/)
-e, --eval-mode: Evaluate mode (Default: False)
-r, --replay-mode: Replay save mode (Default: False)
-d, --device: Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)
To generate replays from your checkpoints, put them together in policy_store_dir and run the following command:
@@ -247,12 +291,11 @@ def make_policy(envs):
help="Directory to save replays (Default: replays/)",
)
parser.add_argument(
"-e",
"--eval-mode",
dest="eval_mode",
type=bool,
default=False,
help="Evaluate mode (Default: False)",
"-r",
"--replay-mode",
dest="replay_mode",
action="store_true",
help="Replay mode (Default: False)",
)
parser.add_argument(
"-d",
@@ -262,15 +305,23 @@ def make_policy(envs):
default="cuda" if torch.cuda.is_available() else "cpu",
help="Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)",
)
parser.add_argument(
"-t",
"--task-file",
dest="task_file",
type=str,
default="reinforcement_learning/eval_task_with_embedding.pkl",
help="Task file to use for evaluation",
)

# Parse and check the arguments
eval_args = parser.parse_args()
assert eval_args.policy_store_dir is not None, "Policy store directory must be specified"

if eval_args.eval_mode:
logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir)
logging.info("Replays will NOT be generated")
rank_policies(eval_args.policy_store_dir, eval_args.device)
else:
if getattr(eval_args, "replay_mode", False):
logging.info("Generating replays from the checkpoints in %s", eval_args.policy_store_dir)
save_replays(eval_args.policy_store_dir, eval_args.replay_save_dir)
else:
logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir)
logging.info("Replays will NOT be generated")
rank_policies(eval_args.policy_store_dir, eval_args.task_file, eval_args.device)
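
Inferred from the argument parser above (the exact commands are not stated in the commit), typical invocations would look like:

python evaluate.py -p <policy_store_dir> -r                      # save replays, no ranking
python evaluate.py -p <policy_store_dir>                         # rank policies with the default task file
python evaluate.py -p <policy_store_dir> -t <task_file> -d cpu   # rank with a custom task file on CPU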
24 changes: 17 additions & 7 deletions leader_board.py
@@ -36,6 +36,7 @@ class TeamResult:
time_alive: int = 0,
earned_gold: int = 0,
completed_task_count: int = 0,
max_task_progress: float = 0,
damage_received: int = 0,
damage_inflicted: int = 0,
ration_consumed: int = 0,
@@ -72,6 +73,7 @@ def names(cls) -> List[str]:
"time_alive",
"earned_gold",
"completed_task_count",
"max_task_progress",
"damage_received",
"damage_inflicted",
"ration_consumed",
@@ -110,8 +112,9 @@ class StatPostprocessor(pufferlib.emulation.Postprocessor):
"""Postprocessing actions and metrics of Neural MMO.
Process wandb/leader board stats, and save replays.
"""
def __init__(self, env, agent_id):
def __init__(self, env, agent_id, eval_mode=False):
super().__init__(env, is_multiagent=True, agent_id=agent_id)
self.eval_mode = eval_mode
self._reset_episode_stats()

def reset(self, observation):
@@ -125,6 +128,7 @@ def _reset_episode_stats(self):
self._cod_starved = 0
self._cod_dehydrated = 0
self._task_completed = 0
self._max_task_progress = 0
self._task_with_2_reward_signal = 0
self._task_with_0p2_max_progress = 0
self._curriculum = defaultdict(list)
@@ -156,19 +160,20 @@ def _update_stats(self, agent):
task = self.env.agent_task_map[agent.ent_id][0]
# For each task spec, record its max progress and reward signal count
self._curriculum[task.spec_name].append((task._max_progress, task.reward_signal_count))
self._max_task_progress = task._max_progress
if task.reward_signal_count >= 2:
self._task_with_2_reward_signal += 1.0
self._task_with_2_reward_signal = 1.0
if task._max_progress >= 0.2:
self._task_with_0p2_max_progress += 1.0
self._task_with_0p2_max_progress = 1.0
if task.completed:
self._task_completed += 1.0
self._task_completed = 1.0

if agent.damage.val > 0:
self._cod_attacked += 1.0
self._cod_attacked = 1.0
elif agent.food.val == 0:
self._cod_starved += 1.0
self._cod_starved = 1.0
elif agent.water.val == 0:
self._cod_dehydrated += 1.0
self._cod_dehydrated = 1.0

self._combat_level.append(agent.attack_level)
self._harvest_level.append(max(
@@ -249,6 +254,7 @@ def reward_done_info(self, reward, done, info):
info["stats"][key] = float(val)

# Fill in the "TeamResult"
result.max_task_progress = self._max_task_progress
result.total_score = self._curr_unique_count
result.time_alive = self._time_alive
result.earned_gold = achieved["achieved/earned_gold"]
@@ -268,6 +274,10 @@ def reward_done_info(self, reward, done, info):

info["team_results"] = (self.agent_id, result)

if self.eval_mode:
# "return" is used for ranking in the eval mode, so put the task progress here
info["return"] = self._max_task_progress # this is 1 if done

return reward, done, info

# Event processing utilities for Neural MMO.
5 changes: 4 additions & 1 deletion reinforcement_learning/config.py
@@ -16,8 +16,10 @@ class Config:
num_cores = None # Number of cores to use for training
num_envs = 6 # Number of environments to use for training
num_buffers = 2 # Number of buffers to use for training
rollout_batch_size = 32768 # Number of steps to rollout
rollout_batch_size = 2**15 # Number of steps to rollout
eval_batch_size = 2**15 # Number of steps to rollout for eval
train_num_steps = 10_000_000 # Number of steps to train
eval_num_steps = 1_000_000 # Number of steps to evaluate
checkpoint_interval = 30 # Interval to save models
run_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" # Run name
runs_dir = "/tmp/runs" # Directory for runs
@@ -47,6 +49,7 @@ class Config:
map_size = 128 # Size of maps to use for training
resilient_population = 0.2 # Percentage of agents to be resilient to starvation/dehydration
tasks_path = None # Path to tasks to use for training
eval_mode = False # Run the postprocessor in the eval mode
early_stop_agent_num = 8 # Stop the episode when the number of agents reaches this number
sqrt_achievement_rewards=False # Use the square root of achievement rewards
heal_bonus_weight = 0.03
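
For context (not part of the diff), evaluate.py turns this Config class into mutable args via SimpleNamespace and then overrides a few fields for evaluation. Below is a self-contained sketch of that pattern; ToyConfig and to_dict are stand-ins, not the repo's Config.asdict().

from types import SimpleNamespace

class ToyConfig:
    rollout_batch_size = 2**15
    eval_batch_size = 2**15
    tasks_path = None
    eval_mode = False

def to_dict(cls):
    # Stand-in for Config.asdict(): class attributes, minus dunders.
    return {k: v for k, v in vars(cls).items() if not k.startswith("_")}

args = SimpleNamespace(**to_dict(ToyConfig))
args.eval_mode = True          # as rank_policies() does
args.tasks_path = "reinforcement_learning/eval_task_with_embedding.pkl"
print(args.eval_batch_size, args.eval_mode)  # 32768 True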