Merge pull request #19 from CarperAI/release
Small fixes
jsuarez5341 committed Sep 15, 2023
2 parents 8d223cc + bb93bf9 commit e3a1aae
Showing 10 changed files with 175 additions and 61 deletions.
54 changes: 54 additions & 0 deletions curriculum_generation/sample_evaluation_task.py
@@ -0,0 +1,54 @@
"""Manual test for creating learning curriculum manually"""
# pylint: disable=invalid-name,redefined-outer-name,bad-builtin
# pylint: disable=wildcard-import,unused-wildcard-import
from typing import List

from nmmo.task import constraint as c
from nmmo.task.base_predicates import AttainSkill, CountEvent, EarnGold, TickGE
from nmmo.task.task_spec import TaskSpec

curriculum: List[TaskSpec] = []

# Stay alive as long as possible
curriculum.append(
TaskSpec(eval_fn=TickGE, eval_fn_kwargs={"num_tick": 1024})
)

# Perform these 10 times
essential_skills = [
"EAT_FOOD",
"DRINK_WATER",
"SCORE_HIT",
"PLAYER_KILL",
"HARVEST_ITEM",
"EQUIP_ITEM",
"CONSUME_ITEM",
"LEVEL_UP",
"EARN_GOLD",
"LIST_ITEM",
"BUY_ITEM",
"GIVE_ITEM",
"DESTROY_ITEM",
"GIVE_GOLD",
]
for event_code in essential_skills:
curriculum.append(
TaskSpec(
eval_fn=CountEvent,
eval_fn_kwargs={"event": event_code, "N": 10},
)
)

# Reach skill level 10
for skill in c.combat_skills + c.harvest_skills:
curriculum.append(
TaskSpec(
eval_fn=AttainSkill,
eval_fn_kwargs={"skill": skill, "level": 10, "num_agent": 1},
)
)

# Earn 50 gold
curriculum.append(
TaskSpec(eval_fn=EarnGold, eval_fn_kwargs={"amount": 50})
)
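
Not part of the commit: a minimal sketch of how a TaskSpec list like the one above might be serialized so it can later be referenced by a tasks_path-style setting. The file path and the use of dill are assumptions for illustration, not taken from this repository.

# Hypothetical helper (not in this commit): serialize the TaskSpec list with dill.
import dill

CURRICULUM_FILE = "curriculum_generation/custom_curriculum.pkl"  # assumed path

def save_curriculum(task_specs, path=CURRICULUM_FILE):
    """Write a list of TaskSpec objects to disk for later use as a task file."""
    with open(path, "wb") as f:
        dill.dump(task_specs, f)

# Example: save_curriculum(curriculum)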
16 changes: 9 additions & 7 deletions environment.py
@@ -33,14 +33,15 @@ def __init__(self, args: Namespace):

class Postprocessor(StatPostprocessor):
def __init__(self, env, is_multiagent, agent_id,
early_stop_agent_num=0,
sqrt_achievement_rewards=False,
heal_bonus_weight=0,
meander_bonus_weight=0,
explore_bonus_weight=0,
clip_unique_event=3,
eval_mode=False,
early_stop_agent_num=0,
sqrt_achievement_rewards=False,
heal_bonus_weight=0,
meander_bonus_weight=0,
explore_bonus_weight=0,
clip_unique_event=3,
):
super().__init__(env, agent_id)
super().__init__(env, agent_id, eval_mode)
self.early_stop_agent_num = early_stop_agent_num
self.sqrt_achievement_rewards = sqrt_achievement_rewards
self.heal_bonus_weight = heal_bonus_weight
@@ -117,6 +118,7 @@ def env_creator():
env = pufferlib.emulation.PettingZooPufferEnv(env,
postprocessor_cls=Postprocessor,
postprocessor_kwargs={
'eval_mode': args.eval_mode,
'early_stop_agent_num': args.early_stop_agent_num,
'sqrt_achievement_rewards': args.sqrt_achievement_rewards,
'heal_bonus_weight': args.heal_bonus_weight,
99 changes: 75 additions & 24 deletions evaluate.py
@@ -5,7 +5,11 @@
import time
from types import SimpleNamespace
from pathlib import Path
from collections import defaultdict
from dataclasses import asdict
from itertools import cycle

import numpy as np
import torch
import pandas as pd

@@ -54,11 +58,11 @@ def save_replays(policy_store_dir, save_dir):
args.early_stop_agent_num = 0 # run the full episode
args.resilient_population = 0 # no resilient agents

# TODO: custom models will require different policy creation functions
# NOTE: This creates a dummy learner agent. Is it necessary?
from reinforcement_learning import policy # import your policy
def make_policy(envs):
learner_policy = policy.Baseline(
envs,
envs._driver_env,
input_size=args.input_size,
hidden_size=args.hidden_size,
task_size=args.task_size
@@ -83,8 +87,10 @@ def make_policy(envs):

# Load the policies into the policy pool
evaluator.policy_pool.update_policies({
p.name: p.policy(make_policy, evaluator.buffers[0], evaluator.device)
for p in policy_store._all_policies().values()
p.name: p.policy(
policy_args=[evaluator.buffers[0]],
device=evaluator.device
) for p in list(policy_store._all_policies().values())
})

# Set up the replay helper
@@ -121,7 +127,7 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"):
file = os.path.join(policy_store_dir, ranker_file)
if os.path.exists(file):
if os.path.exists(file + ".lock"):
raise ValueError("Policy ranker file is locked.")
raise ValueError("Policy ranker file is locked. Delete the lock file.")
logging.info("Using policy ranker from %s", file)
policy_ranker = pufferlib.utils.PersistentObject(
file,
@@ -135,21 +141,34 @@ def create_policy_ranker(policy_store_dir, ranker_file="openskill.pickle"):
)
return policy_ranker

def rank_policies(policy_store_dir, device):
class AllPolicySelector(pufferlib.policy_ranker.PolicySelector):
def select_policies(self, policies):
# Return all policies in alphabetical order of name.
# Loops circularly if more policies are needed than available.
loop = cycle([policies[name] for name in sorted(policies.keys())])
return [next(loop) for _ in range(self._num)]
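
As an aside (not in the diff), the cycling selection above can be illustrated on plain dictionaries, independent of pufferlib:

# Toy reproduction of AllPolicySelector's cycling behaviour on dummy data.
from itertools import cycle

def select_all(policies: dict, num: int):
    loop = cycle([policies[name] for name in sorted(policies)])
    return [next(loop) for _ in range(num)]

print(select_all({"b": "policy_b", "a": "policy_a"}, 5))
# -> ['policy_a', 'policy_b', 'policy_a', 'policy_b', 'policy_a']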

def rank_policies(policy_store_dir, eval_curriculum_file, device):
# CHECK ME: can custom models with different architectures be loaded here?
policy_store = setup_policy_store(policy_store_dir)
num_policies = len(policy_store._all_policies())
policy_ranker = create_policy_ranker(policy_store_dir)
num_policies = len(policy_store._all_policies())
policy_selector = AllPolicySelector(num_policies)

# TODO: task-condition agents when generating replays
args = SimpleNamespace(**config.Config.asdict())
args.data_dir = policy_store_dir
args.eval_mode = True
args.num_envs = 5 # sample a bit longer in each env
args.num_buffers = 1
args.learner_weight = 0 # evaluate mode
args.selfplay_num_policies = num_policies + 1
args.early_stop_agent_num = 0 # run the full episode
args.resilient_population = 0 # no resilient agents
args.tasks_path = eval_curriculum_file # task-conditioning

# TODO: custom models will require different policy creation functions
# NOTE: This creates a dummy learner agent. Is it necessary?
from reinforcement_learning import policy # import your policy
def make_policy(envs):
learner_policy = policy.Baseline(
@@ -174,17 +193,25 @@ def make_policy(envs):
num_buffers=args.num_buffers,
selfplay_learner_weight=args.learner_weight,
selfplay_num_policies=args.selfplay_num_policies,
batch_size=args.rollout_batch_size,
batch_size=args.eval_batch_size,
policy_store=policy_store,
policy_ranker=policy_ranker, # so that a new ranker is created
policy_selector=policy_selector,
)

rank_file = os.path.join(policy_store_dir, "ranking.txt")
with open(rank_file, "w") as f:
pass

while evaluator.global_step < args.train_num_steps:
evaluator.evaluate()
results = defaultdict(list)
while evaluator.global_step < args.eval_num_steps:
_, stats, infos = evaluator.evaluate()

for pol, vals in infos.items():
results[pol].extend([
e[1] for e in infos[pol]['team_results']
])

ratings = evaluator.policy_ranker.ratings()
dataframe = pd.DataFrame(
{
Expand All @@ -202,18 +229,35 @@ def make_policy(envs):
+ "\n\n"
)

# Reset the envs and start the new episodes
# NOTE: The below line will probably end the episode in the middle,
# so we won't be able to sample scores from the successful agents.
# Thus, the scores will be biased towards the agents that die early.
# Still, the numbers we get this way are better than frequently
# updating the scores because the openskill ranking only takes the mean.
#evaluator.buffers[0]._async_reset()

# CHECK ME: delete the policy_ranker lock file
Path(evaluator.policy_ranker.lock.lock_file).unlink(missing_ok=True)

evaluator.close()
for pol, res in results.items():
aggregated = {}
keys = asdict(res[0]).keys()
for k in keys:
if k == 'policy_id':
continue
aggregated[k] = np.mean([asdict(e)[k] for e in res])
results[pol] = aggregated
print('Evaluation complete. Average stats:\n', results)
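
A standalone sketch of the averaging pattern used above, with a reduced toy dataclass standing in for TeamResult:

# Average every numeric dataclass field except policy_id across episode results.
from dataclasses import dataclass, asdict
import numpy as np

@dataclass
class ToyResult:
    policy_id: int = 0
    max_task_progress: float = 0.0
    time_alive: int = 0

res = [ToyResult(1, 0.5, 100), ToyResult(1, 1.0, 300)]
aggregated = {
    k: np.mean([asdict(e)[k] for e in res])
    for k in asdict(res[0]) if k != "policy_id"
}
print(aggregated)  # {'max_task_progress': 0.75, 'time_alive': 200.0}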


if __name__ == "__main__":
"""Usage: python evaluate.py -p <policy_store_dir> -s <replay_save_dir>
-p, --policy-store-dir: Directory to load policy checkpoints from for evaluation/ranking
-s, --replay-save-dir: Directory to save replays (Default: replays/)
-e, --eval-mode: Evaluate mode (Default: False)
-r, --replay-mode: Replay save mode (Default: False)
-d, --device: Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)
To generate replays from your checkpoints, put them together in policy_store_dir and run the following command:
@@ -247,12 +291,11 @@ def make_policy(envs):
help="Directory to save replays (Default: replays/)",
)
parser.add_argument(
"-e",
"--eval-mode",
dest="eval_mode",
type=bool,
default=False,
help="Evaluate mode (Default: False)",
"-r",
"--replay-mode",
dest="replay_mode",
action="store_true",
help="Replay mode (Default: False)",
)
parser.add_argument(
"-d",
@@ -262,15 +305,23 @@ def make_policy(envs):
default="cuda" if torch.cuda.is_available() else "cpu",
help="Device to use for evaluation/ranking (Default: cuda if available, otherwise cpu)",
)
parser.add_argument(
"-t",
"--task-file",
dest="task_file",
type=str,
default="reinforcement_learning/eval_task_with_embedding.pkl",
help="Task file to use for evaluation",
)

# Parse and check the arguments
eval_args = parser.parse_args()
assert eval_args.policy_store_dir is not None, "Policy store directory must be specified"

if eval_args.eval_mode:
logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir)
logging.info("Replays will NOT be generated")
rank_policies(eval_args.policy_store_dir, eval_args.device)
else:
if getattr(eval_args, "replay_mode", False):
logging.info("Generating replays from the checkpoints in %s", eval_args.policy_store_dir)
save_replays(eval_args.policy_store_dir, eval_args.replay_save_dir)
else:
logging.info("Ranking checkpoints from %s", eval_args.policy_store_dir)
logging.info("Replays will NOT be generated")
rank_policies(eval_args.policy_store_dir, eval_args.task_file, eval_args.device)
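
Inferred from the argument parser above (the exact commands are not stated in the commit), typical invocations would look like:

python evaluate.py -p <policy_store_dir> -r                      # save replays, no ranking
python evaluate.py -p <policy_store_dir>                         # rank policies with the default task file
python evaluate.py -p <policy_store_dir> -t <task_file> -d cpu   # rank with a custom task file on CPU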
24 changes: 17 additions & 7 deletions leader_board.py
@@ -36,6 +36,7 @@ class TeamResult:
time_alive: int = 0,
earned_gold: int = 0,
completed_task_count: int = 0,
max_task_progress: float = 0,
damage_received: int = 0,
damage_inflicted: int = 0,
ration_consumed: int = 0,
@@ -72,6 +73,7 @@ def names(cls) -> List[str]:
"time_alive",
"earned_gold",
"completed_task_count",
"max_task_progress",
"damage_received",
"damage_inflicted",
"ration_consumed",
@@ -110,8 +112,9 @@ class StatPostprocessor(pufferlib.emulation.Postprocessor):
"""Postprocessing actions and metrics of Neural MMO.
Process wandb/leader board stats, and save replays.
"""
def __init__(self, env, agent_id):
def __init__(self, env, agent_id, eval_mode=False):
super().__init__(env, is_multiagent=True, agent_id=agent_id)
self.eval_mode = eval_mode
self._reset_episode_stats()

def reset(self, observation):
@@ -125,6 +128,7 @@ def _reset_episode_stats(self):
self._cod_starved = 0
self._cod_dehydrated = 0
self._task_completed = 0
self._max_task_progress = 0
self._task_with_2_reward_signal = 0
self._task_with_0p2_max_progress = 0
self._curriculum = defaultdict(list)
@@ -156,19 +160,20 @@ def _update_stats(self, agent):
task = self.env.agent_task_map[agent.ent_id][0]
# For each task spec, record its max progress and reward signal count
self._curriculum[task.spec_name].append((task._max_progress, task.reward_signal_count))
self._max_task_progress = task._max_progress
if task.reward_signal_count >= 2:
self._task_with_2_reward_signal += 1.0
self._task_with_2_reward_signal = 1.0
if task._max_progress >= 0.2:
self._task_with_0p2_max_progress += 1.0
self._task_with_0p2_max_progress = 1.0
if task.completed:
self._task_completed += 1.0
self._task_completed = 1.0

if agent.damage.val > 0:
self._cod_attacked += 1.0
self._cod_attacked = 1.0
elif agent.food.val == 0:
self._cod_starved += 1.0
self._cod_starved = 1.0
elif agent.water.val == 0:
self._cod_dehydrated += 1.0
self._cod_dehydrated = 1.0

self._combat_level.append(agent.attack_level)
self._harvest_level.append(max(
@@ -249,6 +254,7 @@ def reward_done_info(self, reward, done, info):
info["stats"][key] = float(val)

# Fill in the "TeamResult"
result.max_task_progress = self._max_task_progress
result.total_score = self._curr_unique_count
result.time_alive = self._time_alive
result.earned_gold = achieved["achieved/earned_gold"]
@@ -268,6 +274,10 @@ def reward_done_info(self, reward, done, info):

info["team_results"] = (self.agent_id, result)

if self.eval_mode:
# "return" is used for ranking in the eval mode, so put the task progress here
info["return"] = self._max_task_progress # this is 1 if done

return reward, done, info

# Event processing utilities for Neural MMO.
5 changes: 4 additions & 1 deletion reinforcement_learning/config.py
@@ -16,8 +16,10 @@ class Config:
num_cores = None # Number of cores to use for training
num_envs = 6 # Number of environments to use for training
num_buffers = 2 # Number of buffers to use for training
rollout_batch_size = 32768 # Number of steps to rollout
rollout_batch_size = 2**15 # Number of steps to rollout
eval_batch_size = 2**15 # Number of steps to rollout for eval
train_num_steps = 10_000_000 # Number of steps to train
eval_num_steps = 1_000_000 # Number of steps to evaluate
checkpoint_interval = 30 # Interval to save models
run_name = f"nmmo_{time.strftime('%Y%m%d_%H%M%S')}" # Run name
runs_dir = "/tmp/runs" # Directory for runs
@@ -47,6 +49,7 @@ class Config:
map_size = 128 # Size of maps to use for training
resilient_population = 0.2 # Percentage of agents to be resilient to starvation/dehydration
tasks_path = None # Path to tasks to use for training
eval_mode = False # Run the postprocessor in the eval mode
early_stop_agent_num = 8 # Stop the episode when the number of agents reaches this number
sqrt_achievement_rewards=False # Use the square root of achievement rewards
heal_bonus_weight = 0.03
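
For context (not part of the diff), evaluate.py turns this Config class into mutable args via SimpleNamespace and then overrides a few fields for evaluation. Below is a self-contained sketch of that pattern; ToyConfig and to_dict are stand-ins, not the repo's Config.asdict().

from types import SimpleNamespace

class ToyConfig:
    rollout_batch_size = 2**15
    eval_batch_size = 2**15
    tasks_path = None
    eval_mode = False

def to_dict(cls):
    # Stand-in for Config.asdict(): class attributes, minus dunders.
    return {k: v for k, v in vars(cls).items() if not k.startswith("_")}

args = SimpleNamespace(**to_dict(ToyConfig))
args.eval_mode = True          # as rank_policies() does
args.tasks_path = "reinforcement_learning/eval_task_with_embedding.pkl"
print(args.eval_batch_size, args.eval_mode)  # 32768 True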