Add amp
sbsekiguchi committed May 7, 2024
1 parent b468b79 commit d970061
Showing 82 changed files with 10,577 additions and 39 deletions.
12 changes: 11 additions & 1 deletion bin/test_reproductions
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2021,2022,2023 Sony Group Corporation.
# Copyright 2021,2022,2023,2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -82,6 +82,16 @@ do
echo "Test run of showcase for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --snapshot-dir ${SNAPSHOT_DIR} --showcase \
--showcase_runs ${SHOWCASE_RUNS}
elif [ ${BASE_ENV} = "pybullet" ] && [ ${ALGORITHM} = "amp" ]; then
TMP_ENV="FakeAMPNNablaRL-v1"
echo "Test run of training for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --save-dir "${RESULT_BASE_DIR}/${ALGORITHM}" --seed ${SEED} \
--total_iterations ${TOTAL_ITERATIONS} --save_timing ${TOTAL_ITERATIONS} --actor_num 1 \
--args_file_path ${TMP_ENV}
SNAPSHOT_DIR="${RESULT_BASE_DIR}/${ALGORITHM}/${TMP_ENV}_results/seed-${SEED}/iteration-${TOTAL_ITERATIONS}"
echo "Test run of showcase for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --snapshot-dir ${SNAPSHOT_DIR} --showcase \
--showcase_runs ${SHOWCASE_RUNS} --args_file_path ${TMP_ENV}
elif [ ${ALGORITHM} = "decision_transformer" ]; then
echo "Test run of training for ${ALGORITHM}"
TOTAL_EPOCHS=1
10 changes: 10 additions & 0 deletions docs/source/nnablarl_api/algorithms.rst
@@ -29,6 +29,16 @@ A2C
:members:
:show-inheritance:

AMP
====
.. autoclass:: nnabla_rl.algorithms.amp.AMPConfig
:members:
:show-inheritance:

.. autoclass:: nnabla_rl.algorithms.amp.AMP
:members:
:show-inheritance:

ATRPO
======
.. autoclass:: nnabla_rl.algorithms.atrpo.ATRPOConfig
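The new reference entries expose AMPConfig and AMP under nnabla_rl.algorithms.amp. Below is a minimal construction sketch; it assumes the common nnabla-rl Algorithm interface (build from an env and a config, then call train()) and leaves AMPConfig at its defaults, since the 1,349-line amp.py itself is not rendered in this diff.

import gym

import nnabla_rl.algorithms as A
import nnabla_rl.environments  # noqa: F401  (registers FakeAMPNNablaRL-v1, see environments/__init__.py below)

# Build the algorithm from an environment and its config (default config values assumed).
env = gym.make("FakeAMPNNablaRL-v1")
amp = A.AMP(env, config=A.AMPConfig())

# Assumption: nnabla-rl algorithms are trained by passing an env and an iteration budget.
amp.train(env, total_iterations=10)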
1 change: 1 addition & 0 deletions nnabla_rl/algorithms/README.md
@@ -13,6 +13,7 @@ nnabla-rl offers various (deep) reinforcement learning and optimal control algorithms
|Algorithm|Online training|Offline(Batch) training|Continuous action|Discrete action|Hybrid action|RNN layer support|
|:---|:---:|:---:|:---:|:---:|:---:|:---:|
|[A2C](https://arxiv.org/abs/1602.01783)|:heavy_check_mark:|:x:|(We will support continuous action in the future)|:heavy_check_mark:|:x:|:x:|
|[AMP](https://arxiv.org/abs/2104.02180)|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|:x:|:x:|
|[ATRPO](https://arxiv.org/pdf/2106.07329)|:heavy_check_mark:|:x:|:heavy_check_mark:|(We will support discrete action in the future)|:x:|:x:|
|[BCQ](https://arxiv.org/abs/1812.02900)|:x:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:x:|
|[BEAR](https://arxiv.org/abs/1906.00949)|:x:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:x:|
2 changes: 2 additions & 0 deletions nnabla_rl/algorithms/__init__.py
@@ -15,6 +15,7 @@

from nnabla_rl.algorithm import Algorithm, AlgorithmConfig
from nnabla_rl.algorithms.a2c import A2C, A2CConfig
from nnabla_rl.algorithms.amp import AMP, AMPConfig
from nnabla_rl.algorithms.atrpo import ATRPO, ATRPOConfig
from nnabla_rl.algorithms.bcq import BCQ, BCQConfig
from nnabla_rl.algorithms.bear import BEAR, BEARConfig
@@ -83,6 +84,7 @@ def get_class_of(name):

register_algorithm(A2C, A2CConfig)
register_algorithm(ATRPO, ATRPOConfig)
register_algorithm(AMP, AMPConfig)
register_algorithm(BCQ, BCQConfig)
register_algorithm(BEAR, BEARConfig)
register_algorithm(CategoricalDDQN, CategoricalDDQNConfig)
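register_algorithm also makes AMP discoverable by name via get_class_of. The lookup below is only a sketch: the registry key and the return shape of get_class_of are assumptions, as neither is spelled out in this hunk.

import nnabla_rl.algorithms as A

# Assumption: the registry is keyed by the algorithm class name and get_class_of
# returns the (algorithm class, config class) pair passed to register_algorithm.
algorithm_class, config_class = A.get_class_of("AMP")
amp_config = config_class()
print(algorithm_class.__name__, amp_config)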
1,349 changes: 1,349 additions & 0 deletions nnabla_rl/algorithms/amp.py

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions nnabla_rl/environment_explorers/epsilon_greedy_explorer.py
@@ -1,5 +1,5 @@
# Copyright 2020,2021 Sony Corporation.
# Copyright 2021,2022,2023 Sony Group Corporation.
# Copyright 2021,2022,2023,2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -80,11 +80,14 @@ class LinearDecayEpsilonGreedyExplorerConfig(EnvironmentExplorerConfig):
This value must be smaller than initial_epsilon. Defaults to 0.05.
max_explore_steps (int): Number of steps to decay epsilon from initial_epsilon to final_epsilon.
Defaults to 1000000.
append_explorer_info (bool): Flag for appending explorer info to the action info. \
The explorer info includes whether the action was chosen greedily and the current explore rate. Defaults to False.
"""

initial_epsilon: float = 1.0
final_epsilon: float = 0.05
max_explore_steps: float = 1000000
append_explorer_info: bool = False

def __post_init__(self):
self._assert_between(self.initial_epsilon, 0.0, 1.0, 'initial_epsilon')
@@ -125,11 +128,13 @@ def __init__(self,

def action(self, step: int, state: np.ndarray, *, begin_of_episode: bool = False) -> Tuple[np.ndarray, Dict]:
epsilon = self._compute_epsilon(step)
(action, info), _ = epsilon_greedy_action_selection(state,
self._greedy_action_selector,
self._random_action_selector,
epsilon,
begin_of_episode=begin_of_episode)
(action, info), is_greedy_action = epsilon_greedy_action_selection(state,
self._greedy_action_selector,
self._random_action_selector,
epsilon,
begin_of_episode=begin_of_episode)
if self._config.append_explorer_info:
info.update({"greedy_action": is_greedy_action, "explore_rate": epsilon})
return action, info

def _compute_epsilon(self, step):
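The new flag only changes what the explorer puts into the returned info dictionary. A small configuration sketch, using the module path shown in this diff; the two extra keys match the info.update() call above.

from nnabla_rl.environment_explorers.epsilon_greedy_explorer import (
    LinearDecayEpsilonGreedyExplorerConfig,
)

config = LinearDecayEpsilonGreedyExplorerConfig(
    initial_epsilon=1.0,
    final_epsilon=0.05,
    max_explore_steps=1000000,
    append_explorer_info=True,
)

# With the flag enabled, explorer.action(step, state) returns an info dict that
# additionally contains, for example:
#   {"greedy_action": True, "explore_rate": 0.525, ...}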
13 changes: 12 additions & 1 deletion nnabla_rl/environments/__init__.py
@@ -22,7 +22,7 @@
DummyTupleContinuous, DummyTupleDiscrete, DummyTupleMixed,
DummyTupleStateContinuous, DummyTupleStateDiscrete,
DummyTupleActionContinuous, DummyTupleActionDiscrete,
DummyHybridEnv,
DummyHybridEnv, DummyAMPEnv, DummyAMPGoalEnv,
DummyGymnasiumAtariEnv, DummyGymnasiumMujocoEnv)

register(
@@ -96,6 +96,17 @@
max_episode_steps=10
)

register(
id='FakeAMPNNablaRL-v1',
entry_point='nnabla_rl.environments.dummy:DummyAMPEnv',
max_episode_steps=10
)

register(
id='FakeAMPGoalConditionedNNablaRL-v1',
entry_point='nnabla_rl.environments.dummy:DummyAMPGoalEnv',
max_episode_steps=10
)

gymnasium_register(
id='FakeGymnasiumMujocoNNablaRL-v1',
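The two fake environments make it possible to exercise the AMP-specific info fields without a physics backend. A short smoke-test sketch, assuming the classic 4-tuple gym step API used throughout this diff:

import gym

import nnabla_rl.environments  # noqa: F401  (runs the register() calls above)

env = gym.make("FakeAMPNNablaRL-v1")
env.reset()
_, _, _, info = env.step(env.action_space.sample())

# AMPEnv.step() (see amp_env.py below) attaches these keys to every transition.
assert "task_result" in info
assert "valid_episode" in info
assert "expert_experience" in info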
84 changes: 84 additions & 0 deletions nnabla_rl/environments/amp_env.py
@@ -0,0 +1,84 @@
# Copyright 2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from enum import Enum
from typing import Tuple

import gym

from nnabla_rl.external.goal_env import GoalEnv
from nnabla_rl.typing import Experience, Info, NextState, NonTerminal, Reward


class TaskResult(Enum):
UNKNOWN = 0
SUCCESS = 1
FAIL = 2


class AMPEnv(gym.Env):
def step(self, action):
next_state, reward, done, info = self._step(action)
info["task_result"] = self.task_result(next_state, reward, done, info)
info["valid_episode"] = self.is_valid_episode(next_state, reward, done, info)
info["expert_experience"] = self.expert_experience(next_state, reward, done, info)
return next_state, reward, done, info

@abstractmethod
def task_result(self, state, reward, done, info) -> TaskResult:
raise NotImplementedError

@abstractmethod
def is_valid_episode(self, state, reward, done, info) -> bool:
raise NotImplementedError

@abstractmethod
def expert_experience(self, state, reward, done, info) -> Experience:
raise NotImplementedError

def update_sample_counts(self):
pass

@abstractmethod
def _step(self, action) -> Tuple[NextState, Reward, NonTerminal, Info]:
raise NotImplementedError("Implement this function for stepping the env and do not override step()")


class AMPGoalEnv(GoalEnv):
def step(self, action):
next_state, reward, done, info = self._step(action)
info["task_result"] = self.task_result(next_state, reward, done, info)
info["valid_episode"] = self.is_valid_episode(next_state, reward, done, info)
info["expert_experience"] = self.expert_experience(next_state, reward, done, info)
return next_state, reward, done, info

@abstractmethod
def task_result(self, state, reward, done, info) -> TaskResult:
raise NotImplementedError

@abstractmethod
def is_valid_episode(self, state, reward, done, info) -> bool:
raise NotImplementedError

@abstractmethod
def expert_experience(self, state, reward, done, info) -> Experience:
raise NotImplementedError

def update_sample_counts(self):
pass

@abstractmethod
def _step(self, action) -> Tuple[NextState, Reward, NonTerminal, Info]:
raise NotImplementedError("Implement this function for stepping the env and do not override step()")
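step() here is a template method: a concrete environment implements _step() plus the three hooks, and every transition's info dict then carries "task_result", "valid_episode" and "expert_experience" (DummyAMPEnv below is a complete example). The loop below sketches how a caller could harvest those expert transitions; it is illustrative only and is not the AMP training code from amp.py, which this diff does not render.

def collect_expert_experiences(env, num_steps):
    """Roll out an AMPEnv with random actions and gather the expert transitions
    that step() attaches to each info dict (illustrative only)."""
    experiences = []
    env.reset()
    for _ in range(num_steps):
        _, _, done, info = env.step(env.action_space.sample())
        experiences.append(info["expert_experience"])
        if done:
            env.reset()
    return experiences

With the fake env registered earlier, collect_expert_experiences(gym.make("FakeAMPNNablaRL-v1"), 20) returns twenty of the zero-reward dummy expert tuples produced by DummyAMPEnv.expert_experience() below.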
117 changes: 116 additions & 1 deletion nnabla_rl/environments/dummy.py
@@ -13,9 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, List, cast

import gym
import gym.spaces
import gymnasium
import numpy as np
from gym.envs.registration import EnvSpec
@@ -25,6 +26,7 @@
from gym.utils.seeding import RandomNumberGenerator

import nnabla_rl
from nnabla_rl.environments.amp_env import AMPEnv, AMPGoalEnv, TaskResult
from nnabla_rl.external.goal_env import GoalEnv


@@ -317,6 +319,119 @@ def __init__(self, max_episode_steps=None):
self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(5, ))


class DummyAMPEnv(AMPEnv):
def __init__(self, max_episode_steps=10):
self.spec = EnvSpec('dummy-amp-v0', max_episode_steps=max_episode_steps)
self.action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(4, ))
self.observation_space = gym.spaces.Tuple(
[gym.spaces.Box(low=0.0, high=1.0, shape=(2, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(5, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(1, ))])
self.reward_range = (0.0, 1.0)
self.observation_mean = tuple([np.zeros(2, dtype=np.float32), np.zeros(
5, dtype=np.float32), np.zeros(1, dtype=np.float32)])
self.observation_var = tuple([np.ones(2, dtype=np.float32), np.ones(
5, dtype=np.float32), np.ones(1, dtype=np.float32)])
self.action_mean = np.zeros((4,), dtype=np.float32)
self.action_var = np.ones((4,), dtype=np.float32)
self.reward_at_task_fail = 0.0
self.reward_at_task_success = 10.0
self._episode_steps = 0

def reset(self):
self._episode_steps = 0
state = list(self.observation_space.sample())
return tuple(state)

def task_result(self, state, reward, done, info) -> TaskResult:
return TaskResult(TaskResult.UNKNOWN.value)

def is_valid_episode(self, state, reward, done, info) -> bool:
return True

def expert_experience(self, state, reward, done, info):
state = list(self.observation_space.sample())
action = self.action_space.sample()
next_state = list(self.observation_space.sample())
return tuple(state), action, 0.0, False, tuple(next_state), {}

def _step(self, a):
self._episode_steps += 1
next_state = list(self.observation_space.sample())
reward = np.random.randn()
done = self._episode_steps >= self.spec.max_episode_steps
info = {'rnn_states': {'dummy_scope': {'dummy_state1': 1, 'dummy_state2': 2}}}
return tuple(next_state), reward, done, info


class DummyAMPGoalEnv(AMPGoalEnv):
def __init__(self, max_episode_steps=10):
self.spec = EnvSpec('dummy-amp-goal-v0', max_episode_steps=max_episode_steps)
self.action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(4, ))
observation_space = gym.spaces.Tuple(
[gym.spaces.Box(low=0.0, high=1.0, shape=(2, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(5, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(1, ))])
goal_state_space = gym.spaces.Tuple([gym.spaces.Box(low=-np.inf,
high=np.inf,
shape=(3,),
dtype=np.float32),
gym.spaces.Box(low=0.0,
high=1.0,
shape=(1,),
dtype=np.float32)])
self.observation_space = gym.spaces.Dict({"observation": observation_space,
"desired_goal": goal_state_space,
"achieved_goal": goal_state_space})

self.reward_range = (0.0, 1.0)
self.observation_mean = tuple([np.zeros(2, dtype=np.float32), np.zeros(
5, dtype=np.float32), np.zeros(1, dtype=np.float32)])
self.observation_var = tuple([np.ones(2, dtype=np.float32), np.ones(
5, dtype=np.float32), np.ones(1, dtype=np.float32)])
self.action_mean = np.zeros((4,), dtype=np.float32)
self.action_var = np.ones((4,), dtype=np.float32)
self.reward_at_task_fail = 0.0
self.reward_at_task_success = 10.0
self._episode_steps = 0

def reset(self):
super().reset()
self._episode_steps = 0
return self.observation_space.sample()

def task_result(self, state, reward, done, info) -> TaskResult:
return TaskResult(TaskResult.UNKNOWN.value)

def is_valid_episode(self, state, reward, done, info) -> bool:
return True

def expert_experience(self, state, reward, done, info):
action = self.action_space.sample()
return (self._generate_dummy_goal_env_flatten_state(), action, 0.0,
False, self._generate_dummy_goal_env_flatten_state(), {})

def _generate_dummy_goal_env_flatten_state(self):
state: List[np.ndarray] = []
sample = self.observation_space.sample()
for key in ["observation", "desired_goal", "achieved_goal"]:
s = sample[key]
if isinstance(s, tuple):
state.extend(s)
else:
state.append(s)
state = list(map(lambda v: v * 0.0, state))
return tuple(state)

def _step(self, a):
self._episode_steps += 1
next_state = self.observation_space.sample()
reward = np.random.randn()
done = self._episode_steps >= self.spec.max_episode_steps
info = {'rnn_states': {'dummy_scope': {'dummy_state1': 1, 'dummy_state2': 2}}}
return next_state, reward, done, info


# =========== gymnasium ==========
class AbstractDummyGymnasiumEnv(gymnasium.Env):
def __init__(self, max_episode_steps):
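DummyAMPGoalEnv mirrors DummyAMPEnv but exposes the goal-conditioned dict observation expected by AMPGoalEnv/GoalEnv. A quick inspection sketch, under the same classic gym API assumption as above:

import gym

import nnabla_rl.environments  # noqa: F401

env = gym.make("FakeAMPGoalConditionedNNablaRL-v1")
obs = env.reset()

# The observation is a dict of (nested) tuple spaces, as declared in __init__ above.
print(sorted(obs.keys()))        # ['achieved_goal', 'desired_goal', 'observation']
print(len(obs["observation"]))   # 3 boxes of shapes (2,), (5,) and (1,)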
2 changes: 1 addition & 1 deletion nnabla_rl/environments/wrappers/__init__.py
@@ -14,7 +14,7 @@
# limitations under the License.

from nnabla_rl.environments.wrappers.common import (Float32RewardEnv, HWCToCHWEnv, NumpyFloat32Env, # noqa
ScreenRenderEnv, TimestepAsStateEnv)
ScreenRenderEnv, TimestepAsStateEnv, FlattenNestedTupleStateWrapper)

from nnabla_rl.environments.wrappers.mujoco import EndlessEnv # noqa
from nnabla_rl.environments.wrappers.atari import make_atari, wrap_deepmind # noqa
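FlattenNestedTupleStateWrapper is only added to the package exports here; its implementation is not part of this diff. The import below is confirmed by this hunk, while the comments spell out what is assumed about the wrapper's behavior.

from nnabla_rl.environments.wrappers import FlattenNestedTupleStateWrapper

# Assumptions (not shown in this diff): the wrapper follows the usual gym.Wrapper
# convention of taking the env to wrap in its constructor, i.e.
#     env = FlattenNestedTupleStateWrapper(env)
# and, as its name suggests, it flattens a nested tuple observation (such as the
# AMP goal env's dict of tuples) into a flat tuple of arrays.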