Add amp
sbsekiguchi committed May 7, 2024
1 parent b468b79 commit d970061
Showing 82 changed files with 10,577 additions and 39 deletions.
12 changes: 11 additions & 1 deletion bin/test_reproductions
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2021,2022,2023 Sony Group Corporation.
# Copyright 2021,2022,2023,2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -82,6 +82,16 @@ do
echo "Test run of showcase for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --snapshot-dir ${SNAPSHOT_DIR} --showcase \
--showcase_runs ${SHOWCASE_RUNS}
elif [ ${BASE_ENV} = "pybullet" ] && [ ${ALGORITHM} = "amp" ]; then
TMP_ENV="FakeAMPNNablaRL-v1"
echo "Test run of training for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --save-dir "${RESULT_BASE_DIR}/${ALGORITHM}" --seed ${SEED} \
--total_iterations ${TOTAL_ITERATIONS} --save_timing ${TOTAL_ITERATIONS} --actor_num 1 \
--args_file_path ${TMP_ENV}
SNAPSHOT_DIR="${RESULT_BASE_DIR}/${ALGORITHM}/${TMP_ENV}_results/seed-${SEED}/iteration-${TOTAL_ITERATIONS}"
echo "Test run of showcase for ${ALGORITHM}"
python ${SCRIPT} --gpu ${GPU_ID} --snapshot-dir ${SNAPSHOT_DIR} --showcase \
--showcase_runs ${SHOWCASE_RUNS} --args_file_path ${TMP_ENV}
elif [ ${ALGORITHM} = "decision_transformer" ]; then
echo "Test run of training for ${ALGORITHM}"
TOTAL_EPOCHS=1
10 changes: 10 additions & 0 deletions docs/source/nnablarl_api/algorithms.rst
@@ -29,6 +29,16 @@ A2C
:members:
:show-inheritance:

AMP
====
.. autoclass:: nnabla_rl.algorithms.amp.AMPConfig
:members:
:show-inheritance:

.. autoclass:: nnabla_rl.algorithms.amp.AMP
:members:
:show-inheritance:

ATRPO
======
.. autoclass:: nnabla_rl.algorithms.atrpo.ATRPOConfig
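The new reference entries expose AMPConfig and AMP under nnabla_rl.algorithms.amp. Below is a minimal construction sketch; it assumes the common nnabla-rl Algorithm interface (build from an env and a config, then call train()) and leaves AMPConfig at its defaults, since the 1,349-line amp.py itself is not rendered in this diff.

import gym

import nnabla_rl.algorithms as A
import nnabla_rl.environments  # noqa: F401  (registers FakeAMPNNablaRL-v1, see environments/__init__.py below)

# Build the algorithm from an environment and its config (default config values assumed).
env = gym.make("FakeAMPNNablaRL-v1")
amp = A.AMP(env, config=A.AMPConfig())

# Assumption: nnabla-rl algorithms are trained by passing an env and an iteration budget.
amp.train(env, total_iterations=10)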
1 change: 1 addition & 0 deletions nnabla_rl/algorithms/README.md
@@ -13,6 +13,7 @@ nnabla-rl offers various (deep) reinforcement learning and optimal control algorithms
|Algorithm|Online training|Offline(Batch) training|Continuous action|Discrete action|Hybrid action|RNN layer support|
|:---|:---:|:---:|:---:|:---:|:---:|:---:|
|[A2C](https://arxiv.org/abs/1602.01783)|:heavy_check_mark:|:x:|(We will support continuous action in the future)|:heavy_check_mark:|:x:|:x:|
|[AMP](https://arxiv.org/abs/2104.02180)|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|:x:|:x:|
|[ATRPO](https://arxiv.org/pdf/2106.07329)|:heavy_check_mark:|:x:|:heavy_check_mark:|(We will support discrete action in the future)|:x:|:x:|
|[BCQ](https://arxiv.org/abs/1812.02900)|:x:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:x:|
|[BEAR](https://arxiv.org/abs/1906.00949)|:x:|:heavy_check_mark:|:heavy_check_mark:|:x:|:x:|:x:|
2 changes: 2 additions & 0 deletions nnabla_rl/algorithms/__init__.py
@@ -15,6 +15,7 @@

from nnabla_rl.algorithm import Algorithm, AlgorithmConfig
from nnabla_rl.algorithms.a2c import A2C, A2CConfig
from nnabla_rl.algorithms.amp import AMP, AMPConfig
from nnabla_rl.algorithms.atrpo import ATRPO, ATRPOConfig
from nnabla_rl.algorithms.bcq import BCQ, BCQConfig
from nnabla_rl.algorithms.bear import BEAR, BEARConfig
@@ -83,6 +84,7 @@ def get_class_of(name):

register_algorithm(A2C, A2CConfig)
register_algorithm(ATRPO, ATRPOConfig)
register_algorithm(AMP, AMPConfig)
register_algorithm(BCQ, BCQConfig)
register_algorithm(BEAR, BEARConfig)
register_algorithm(CategoricalDDQN, CategoricalDDQNConfig)
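register_algorithm also makes AMP discoverable by name via get_class_of. The lookup below is only a sketch: the registry key and the return shape of get_class_of are assumptions, as neither is spelled out in this hunk.

import nnabla_rl.algorithms as A

# Assumption: the registry is keyed by the algorithm class name and get_class_of
# returns the (algorithm class, config class) pair passed to register_algorithm.
algorithm_class, config_class = A.get_class_of("AMP")
amp_config = config_class()
print(algorithm_class.__name__, amp_config)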
1,349 changes: 1,349 additions & 0 deletions nnabla_rl/algorithms/amp.py

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions nnabla_rl/environment_explorers/epsilon_greedy_explorer.py
@@ -1,5 +1,5 @@
# Copyright 2020,2021 Sony Corporation.
# Copyright 2021,2022,2023 Sony Group Corporation.
# Copyright 2021,2022,2023,2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -80,11 +80,14 @@ class LinearDecayEpsilonGreedyExplorerConfig(EnvironmentExplorerConfig):
This value must be smaller than initial_epsilon. Defaults to 0.05.
max_explore_steps (int): Number of steps to decay epsilon from initial_epsilon to final_epsilon.
Defaults to 1000000.
append_explorer_info (bool): Flag for appending explorer info to the action info. \
The explorer info includes whether the action was chosen greedily and the current explore rate. Defaults to False.
"""

initial_epsilon: float = 1.0
final_epsilon: float = 0.05
max_explore_steps: float = 1000000
append_explorer_info: bool = False

def __post_init__(self):
self._assert_between(self.initial_epsilon, 0.0, 1.0, 'initial_epsilon')
@@ -125,11 +128,13 @@ def __init__(self,

def action(self, step: int, state: np.ndarray, *, begin_of_episode: bool = False) -> Tuple[np.ndarray, Dict]:
epsilon = self._compute_epsilon(step)
(action, info), _ = epsilon_greedy_action_selection(state,
self._greedy_action_selector,
self._random_action_selector,
epsilon,
begin_of_episode=begin_of_episode)
(action, info), is_greedy_action = epsilon_greedy_action_selection(state,
self._greedy_action_selector,
self._random_action_selector,
epsilon,
begin_of_episode=begin_of_episode)
if self._config.append_explorer_info:
info.update({"greedy_action": is_greedy_action, "explore_rate": epsilon})
return action, info

def _compute_epsilon(self, step):
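The new flag only changes what the explorer puts into the returned info dictionary. A small configuration sketch, using the module path shown in this diff; the two extra keys match the info.update() call above.

from nnabla_rl.environment_explorers.epsilon_greedy_explorer import (
    LinearDecayEpsilonGreedyExplorerConfig,
)

config = LinearDecayEpsilonGreedyExplorerConfig(
    initial_epsilon=1.0,
    final_epsilon=0.05,
    max_explore_steps=1000000,
    append_explorer_info=True,
)

# With the flag enabled, explorer.action(step, state) returns an info dict that
# additionally contains, for example:
#   {"greedy_action": True, "explore_rate": 0.525, ...}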
13 changes: 12 additions & 1 deletion nnabla_rl/environments/__init__.py
@@ -22,7 +22,7 @@
DummyTupleContinuous, DummyTupleDiscrete, DummyTupleMixed,
DummyTupleStateContinuous, DummyTupleStateDiscrete,
DummyTupleActionContinuous, DummyTupleActionDiscrete,
DummyHybridEnv,
DummyHybridEnv, DummyAMPEnv, DummyAMPGoalEnv,
DummyGymnasiumAtariEnv, DummyGymnasiumMujocoEnv)

register(
@@ -96,6 +96,17 @@
max_episode_steps=10
)

register(
id='FakeAMPNNablaRL-v1',
entry_point='nnabla_rl.environments.dummy:DummyAMPEnv',
max_episode_steps=10
)

register(
id='FakeAMPGoalConditionedNNablaRL-v1',
entry_point='nnabla_rl.environments.dummy:DummyAMPGoalEnv',
max_episode_steps=10
)

gymnasium_register(
id='FakeGymnasiumMujocoNNablaRL-v1',
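The two fake environments make it possible to exercise the AMP-specific info fields without a physics backend. A short smoke-test sketch, assuming the classic 4-tuple gym step API used throughout this diff:

import gym

import nnabla_rl.environments  # noqa: F401  (runs the register() calls above)

env = gym.make("FakeAMPNNablaRL-v1")
env.reset()
_, _, _, info = env.step(env.action_space.sample())

# AMPEnv.step() (see amp_env.py below) attaches these keys to every transition.
assert "task_result" in info
assert "valid_episode" in info
assert "expert_experience" in info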
84 changes: 84 additions & 0 deletions nnabla_rl/environments/amp_env.py
@@ -0,0 +1,84 @@
# Copyright 2024 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from enum import Enum
from typing import Tuple

import gym

from nnabla_rl.external.goal_env import GoalEnv
from nnabla_rl.typing import Experience, Info, NextState, NonTerminal, Reward


class TaskResult(Enum):
UNKNOWN = 0
SUCCESS = 1
FAIL = 2


class AMPEnv(gym.Env):
def step(self, action):
next_state, reward, done, info = self._step(action)
info["task_result"] = self.task_result(next_state, reward, done, info)
info["valid_episode"] = self.is_valid_episode(next_state, reward, done, info)
info["expert_experience"] = self.expert_experience(next_state, reward, done, info)
return next_state, reward, done, info

@abstractmethod
def task_result(self, state, reward, done, info) -> TaskResult:
raise NotImplementedError

@abstractmethod
def is_valid_episode(self, state, reward, done, info) -> bool:
raise NotImplementedError

@abstractmethod
def expert_experience(self, state, reward, done, info) -> Experience:
raise NotImplementedError

def update_sample_counts(self):
pass

@abstractmethod
def _step(self, action) -> Tuple[NextState, Reward, NonTerminal, Info]:
raise NotImplementedError("Implement this function for stepping the env and do not override step()")


class AMPGoalEnv(GoalEnv):
def step(self, action):
next_state, reward, done, info = self._step(action)
info["task_result"] = self.task_result(next_state, reward, done, info)
info["valid_episode"] = self.is_valid_episode(next_state, reward, done, info)
info["expert_experience"] = self.expert_experience(next_state, reward, done, info)
return next_state, reward, done, info

@abstractmethod
def task_result(self, state, reward, done, info) -> TaskResult:
raise NotImplementedError

@abstractmethod
def is_valid_episode(self, state, reward, done, info) -> bool:
raise NotImplementedError

@abstractmethod
def expert_experience(self, state, reward, done, info) -> Experience:
raise NotImplementedError

def update_sample_counts(self):
pass

@abstractmethod
def _step(self, action) -> Tuple[NextState, Reward, NonTerminal, Info]:
raise NotImplementedError("Implement this function for stepping the env and do not override step()")
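step() here is a template method: a concrete environment implements _step() plus the three hooks, and every transition's info dict then carries "task_result", "valid_episode" and "expert_experience" (DummyAMPEnv below is a complete example). The loop below sketches how a caller could harvest those expert transitions; it is illustrative only and is not the AMP training code from amp.py, which this diff does not render.

def collect_expert_experiences(env, num_steps):
    """Roll out an AMPEnv with random actions and gather the expert transitions
    that step() attaches to each info dict (illustrative only)."""
    experiences = []
    env.reset()
    for _ in range(num_steps):
        _, _, done, info = env.step(env.action_space.sample())
        experiences.append(info["expert_experience"])
        if done:
            env.reset()
    return experiences

With the fake env registered earlier, collect_expert_experiences(gym.make("FakeAMPNNablaRL-v1"), 20) returns twenty of the zero-reward dummy expert tuples produced by DummyAMPEnv.expert_experience() below.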
117 changes: 116 additions & 1 deletion nnabla_rl/environments/dummy.py
@@ -13,9 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, List, cast

import gym
import gym.spaces
import gymnasium
import numpy as np
from gym.envs.registration import EnvSpec
@@ -25,6 +26,7 @@
from gym.utils.seeding import RandomNumberGenerator

import nnabla_rl
from nnabla_rl.environments.amp_env import AMPEnv, AMPGoalEnv, TaskResult
from nnabla_rl.external.goal_env import GoalEnv


@@ -317,6 +319,119 @@ def __init__(self, max_episode_steps=None):
self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(5, ))


class DummyAMPEnv(AMPEnv):
def __init__(self, max_episode_steps=10):
self.spec = EnvSpec('dummy-amp-v0', max_episode_steps=max_episode_steps)
self.action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(4, ))
self.observation_space = gym.spaces.Tuple(
[gym.spaces.Box(low=0.0, high=1.0, shape=(2, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(5, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(1, ))])
self.reward_range = (0.0, 1.0)
self.observation_mean = tuple([np.zeros(2, dtype=np.float32), np.zeros(
5, dtype=np.float32), np.zeros(1, dtype=np.float32)])
self.observation_var = tuple([np.ones(2, dtype=np.float32), np.ones(
5, dtype=np.float32), np.ones(1, dtype=np.float32)])
self.action_mean = np.zeros((4,), dtype=np.float32)
self.action_var = np.ones((4,), dtype=np.float32)
self.reward_at_task_fail = 0.0
self.reward_at_task_success = 10.0
self._episode_steps = 0

def reset(self):
self._episode_steps = 0
state = list(self.observation_space.sample())
return tuple(state)

def task_result(self, state, reward, done, info) -> TaskResult:
return TaskResult(TaskResult.UNKNOWN.value)

def is_valid_episode(self, state, reward, done, info) -> bool:
return True

def expert_experience(self, state, reward, done, info):
state = list(self.observation_space.sample())
action = self.action_space.sample()
next_state = list(self.observation_space.sample())
return tuple(state), action, 0.0, False, tuple(next_state), {}

def _step(self, a):
self._episode_steps += 1
next_state = list(self.observation_space.sample())
reward = np.random.randn()
done = self._episode_steps >= self.spec.max_episode_steps
info = {'rnn_states': {'dummy_scope': {'dummy_state1': 1, 'dummy_state2': 2}}}
return tuple(next_state), reward, done, info


class DummyAMPGoalEnv(AMPGoalEnv):
def __init__(self, max_episode_steps=10):
self.spec = EnvSpec('dummy-amp-goal-v0', max_episode_steps=max_episode_steps)
self.action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(4, ))
observation_space = gym.spaces.Tuple(
[gym.spaces.Box(low=0.0, high=1.0, shape=(2, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(5, )),
gym.spaces.Box(low=0.0, high=1.0, shape=(1, ))])
goal_state_space = gym.spaces.Tuple([gym.spaces.Box(low=-np.inf,
high=np.inf,
shape=(3,),
dtype=np.float32),
gym.spaces.Box(low=0.0,
high=1.0,
shape=(1,),
dtype=np.float32)])
self.observation_space = gym.spaces.Dict({"observation": observation_space,
"desired_goal": goal_state_space,
"achieved_goal": goal_state_space})

self.reward_range = (0.0, 1.0)
self.observation_mean = tuple([np.zeros(2, dtype=np.float32), np.zeros(
5, dtype=np.float32), np.zeros(1, dtype=np.float32)])
self.observation_var = tuple([np.ones(2, dtype=np.float32), np.ones(
5, dtype=np.float32), np.ones(1, dtype=np.float32)])
self.action_mean = np.zeros((4,), dtype=np.float32)
self.action_var = np.ones((4,), dtype=np.float32)
self.reward_at_task_fail = 0.0
self.reward_at_task_success = 10.0
self._episode_steps = 0

def reset(self):
super().reset()
self._episode_steps = 0
return self.observation_space.sample()

def task_result(self, state, reward, done, info) -> TaskResult:
return TaskResult(TaskResult.UNKNOWN.value)

def is_valid_episode(self, state, reward, done, info) -> bool:
return True

def expert_experience(self, state, reward, done, info):
action = self.action_space.sample()
return (self._generate_dummy_goal_env_flatten_state(), action, 0.0,
False, self._generate_dummy_goal_env_flatten_state(), {})

def _generate_dummy_goal_env_flatten_state(self):
state: List[np.ndarray] = []
sample = self.observation_space.sample()
for key in ["observation", "desired_goal", "achieved_goal"]:
s = sample[key]
if isinstance(s, tuple):
state.extend(s)
else:
state.append(s)
state = list(map(lambda v: v * 0.0, state))
return tuple(state)

def _step(self, a):
self._episode_steps += 1
next_state = self.observation_space.sample()
reward = np.random.randn()
done = self._episode_steps >= self.spec.max_episode_steps
info = {'rnn_states': {'dummy_scope': {'dummy_state1': 1, 'dummy_state2': 2}}}
return next_state, reward, done, info


# =========== gymnasium ==========
class AbstractDummyGymnasiumEnv(gymnasium.Env):
def __init__(self, max_episode_steps):
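DummyAMPGoalEnv mirrors DummyAMPEnv but exposes the goal-conditioned dict observation expected by AMPGoalEnv/GoalEnv. A quick inspection sketch, under the same classic gym API assumption as above:

import gym

import nnabla_rl.environments  # noqa: F401

env = gym.make("FakeAMPGoalConditionedNNablaRL-v1")
obs = env.reset()

# The observation is a dict of (nested) tuple spaces, as declared in __init__ above.
print(sorted(obs.keys()))        # ['achieved_goal', 'desired_goal', 'observation']
print(len(obs["observation"]))   # 3 boxes of shapes (2,), (5,) and (1,)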
2 changes: 1 addition & 1 deletion nnabla_rl/environments/wrappers/__init__.py
@@ -14,7 +14,7 @@
# limitations under the License.

from nnabla_rl.environments.wrappers.common import (Float32RewardEnv, HWCToCHWEnv, NumpyFloat32Env, # noqa
ScreenRenderEnv, TimestepAsStateEnv)
ScreenRenderEnv, TimestepAsStateEnv, FlattenNestedTupleStateWrapper)

from nnabla_rl.environments.wrappers.mujoco import EndlessEnv # noqa
from nnabla_rl.environments.wrappers.atari import make_atari, wrap_deepmind # noqa
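FlattenNestedTupleStateWrapper is only added to the package exports here; its implementation is not part of this diff. The import below is confirmed by this hunk, while the comments spell out what is assumed about the wrapper's behavior.

from nnabla_rl.environments.wrappers import FlattenNestedTupleStateWrapper

# Assumptions (not shown in this diff): the wrapper follows the usual gym.Wrapper
# convention of taking the env to wrap in its constructor, i.e.
#     env = FlattenNestedTupleStateWrapper(env)
# and, as its name suggests, it flattens a nested tuple observation (such as the
# AMP goal env's dict of tuples) into a flat tuple of arrays.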