HumanCompatibleAI · ZiyueWang25 · Sep 13, 2023 · Sep 13, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/examples/train_dagger_atari_interactive_policy.py b/examples/train_dagger_atari_interactive_policy.py
@@ -7,25 +7,42 @@
 
 import gymnasium as gym
 import numpy as np
-from stable_baselines3.common import vec_env
+import torch as th
+from stable_baselines3.common import torch_layers, vec_env
 
 from imitation.algorithms import bc, dagger
-from imitation.data import wrappers
-from imitation.policies import interactive
+from imitation.data import wrappers as data_wrappers
+from imitation.policies import base as policy_base
+from imitation.policies import interactive, obs_update_wrapper
+
+
+def lr_schedule(_: float):
+    # Return the maximum float32 value to force an error if policy.optimizer
+    # is used by mistake (the BC trainer's own optimizer should be used instead).
+    return th.finfo(th.float32).max
+
 
 if __name__ == "__main__":
     rng = np.random.default_rng(0)
 
     env = gym.make("PongNoFrameskip-v4", render_mode="rgb_array")
-    env = wrappers.HumanReadableWrapper(env)
-    venv = vec_env.DummyVecEnv([lambda: env])
+    hr_env = data_wrappers.HumanReadableWrapper(env)
+    venv = vec_env.DummyVecEnv([lambda: hr_env])
     venv.seed(0)
 
     expert = interactive.AtariInteractivePolicy(venv)
+    policy = policy_base.FeedForward32Policy(
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        lr_schedule=lr_schedule,
+        features_extractor_class=torch_layers.FlattenExtractor,
+    )
+    wrapped_policy = obs_update_wrapper.RemoveHR(policy, lr_schedule=lr_schedule)
 
     bc_trainer = bc.BC(
-        observation_space=venv.observation_space,
-        action_space=venv.action_space,
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        policy=wrapped_policy,
         rng=rng,
     )
 

diff --git a/src/imitation/algorithms/bc.py b/src/imitation/algorithms/bc.py
@@ -26,7 +26,7 @@
 from stable_baselines3.common import policies, torch_layers, utils, vec_env
 
 from imitation.algorithms import base as algo_base
-from imitation.data import rollout, types, wrappers
+from imitation.data import rollout, types
 from imitation.policies import base as policy_base
 from imitation.util import logger as imit_logger
 from imitation.util import util
@@ -294,7 +294,7 @@ def __init__(
             observation_space: the observation space of the environment.
             action_space: the action space of the environment.
             rng: the random state to use for the random number generator.
-            policy: a Stable Baselines3 policy for learning; if unspecified,
+            policy: a Stable Baselines3 policy; if unspecified,
                 defaults to `FeedForward32Policy`.
             demonstrations: Demonstrations from an expert (optional). Transitions
                 expressed directly as a `types.TransitionsMinimal` object, a sequence
@@ -334,19 +334,18 @@ def __init__(
         self._bc_logger = BCLogger(self.logger)
 
         self.action_space = action_space
-        obs_space_without_rgb = wrappers.remove_rgb_obs_space(observation_space)
-        self.observation_space = obs_space_without_rgb
+        self.observation_space = observation_space
 
         self.rng = rng
 
         if policy is None:
             extractor = (
                 torch_layers.CombinedExtractor
-                if isinstance(obs_space_without_rgb, gym.spaces.Dict)
+                if isinstance(observation_space, gym.spaces.Dict)
                 else torch_layers.FlattenExtractor
             )
             policy = policy_base.FeedForward32Policy(
-                observation_space=obs_space_without_rgb,
+                observation_space=observation_space,
                 action_space=action_space,
                 # Set lr_schedule to max value to force error if policy.optimizer
                 # is used by mistake (should use self.optimizer instead).
@@ -355,7 +354,7 @@ def __init__(
             )
         self._policy = policy.to(utils.get_device(device))
         # TODO(adam): make policy mandatory and delete observation/action space params?
-        assert self.policy.observation_space == obs_space_without_rgb
+        assert self.policy.observation_space == self.observation_space
         assert self.policy.action_space == self.action_space
 
         if optimizer_kwargs:
@@ -492,13 +491,8 @@ def process_batch():
                 lambda x: util.safe_to_tensor(x, device=self.policy.device),
                 types.maybe_unwrap_dictobs(batch["obs"]),
             )
-            obs_tensor_without_rgb = wrappers.remove_rgb_obs(obs_tensor)
             acts = util.safe_to_tensor(batch["acts"], device=self.policy.device)
-            training_metrics = self.loss_calculator(
-                self.policy,
-                obs_tensor_without_rgb,
-                acts,
-            )
+            training_metrics = self.loss_calculator(self.policy, obs_tensor, acts)
 
             # Renormalise the loss to be averaged over the whole
             # batch size instead of the minibatch size.

diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py
@@ -15,14 +15,12 @@
 
 import numpy as np
 import torch as th
-from gymnasium import spaces
 from stable_baselines3.common import policies, utils, vec_env
-from stable_baselines3.common.type_aliases import GymEnv
 from stable_baselines3.common.vec_env.base_vec_env import VecEnvStepReturn
 from torch.utils import data as th_data
 
 from imitation.algorithms import base, bc
-from imitation.data import rollout, serialize, types, wrappers
+from imitation.data import rollout, serialize, types
 from imitation.util import logger as imit_logger
 from imitation.util import util
 
@@ -306,26 +304,6 @@ class NeedsDemosException(Exception):
     """Signals demos need to be collected for current round before continuing."""
 
 
-def _check_for_correct_spaces_with_rgb_env(
-    env_might_with_rgb: GymEnv,
-    obs_space: spaces.Space,
-    action_space: spaces.Space,
-) -> None:
-    """Checks that whether an environment has the same spaces as provided ones."""
-    if isinstance(obs_space, spaces.Dict):
-        assert wrappers.HR_OBS_KEY not in obs_space.spaces
-    env_obs_space = wrappers.remove_rgb_obs_space(env_might_with_rgb.observation_space)
-    if obs_space != env_obs_space:
-        raise ValueError(
-            f"Observation spaces do not match: obs {obs_space} != env {env_obs_space}",
-        )
-    env_action_space = env_might_with_rgb.action_space
-    if action_space != env_action_space:
-        raise ValueError(
-            f"Action spaces do not match: obs {action_space} != env {env_action_space}",
-        )
-
-
 class DAggerTrainer(base.BaseImitationAlgorithm):
     """DAgger training class with low-level API suitable for interactive human feedback.
 
@@ -396,7 +374,7 @@ def __init__(
         self._all_demos = []
         self.rng = rng
 
-        _check_for_correct_spaces_with_rgb_env(
+        utils.check_for_correct_spaces(
             self.venv,
             bc_trainer.observation_space,
             bc_trainer.action_space,
@@ -531,30 +509,6 @@ def extend_and_update(
         logging.info(f"New round number is {self.round_num}")
         return self.round_num
 
-    def _get_trainable_predict_fn(
-        self,
-    ) -> Callable[[Union[Dict[str, np.ndarray], np.ndarray]], np.ndarray]:
-        """Returns a function that uses `bc_trainer.policy` to predict observations.
-
-        Since bc_trainer.policy doesn't accept RGB observations, this function removes
-        The RGB observation part, if any, before passing the observation to prediction.
-
-        Returns:
-            A function that accepts a dictionary observation and returns a numpy array
-            of actions.
-        """
-
-        def remove_rgb_and_predict(
-            obs: Union[Dict[str, np.ndarray], np.ndarray],
-        ) -> np.ndarray:
-            obs_without_rgb = wrappers.remove_rgb_obs(obs)
-            assert isinstance(obs_without_rgb, (np.ndarray, dict))
-            fn = self.bc_trainer.policy.predict
-            # the Dict[str, Tensor] type seems hard to exclude from type annotation.
-            return fn(obs_without_rgb)[0]  # type: ignore[arg-type]
-
-        return remove_rgb_and_predict
-
     def create_trajectory_collector(self) -> InteractiveTrajectoryCollector:
         """Create trajectory collector to extend current round's demonstration set.
 
@@ -567,7 +521,7 @@ def create_trajectory_collector(self) -> InteractiveTrajectoryCollector:
         beta = self.beta_schedule(self.round_num)
         collector = InteractiveTrajectoryCollector(
             venv=self.venv,
-            get_robot_acts=self._get_trainable_predict_fn(),
+            get_robot_acts=lambda obs: self.bc_trainer.policy.predict(obs)[0],
             beta=beta,
             save_dir=save_dir,
             rng=self.rng,

diff --git a/src/imitation/data/wrappers.py b/src/imitation/data/wrappers.py
@@ -5,7 +5,6 @@
 import gymnasium as gym
 import numpy as np
 import numpy.typing as npt
-import torch as th
 from gymnasium.core import Env
 from stable_baselines3.common.vec_env import VecEnv, VecEnvWrapper
 
@@ -213,7 +212,7 @@ def step(self, action):
         return obs, rew, terminated, truncated, info
 
 
-class HumanReadableWrapper(gym.Wrapper):
+class HumanReadableWrapper(gym.ObservationWrapper):
     """Adds human-readable observation to `obs` at every step."""
 
     def __init__(self, env: Env, original_obs_key: str = "ORI_OBS"):
@@ -235,30 +234,8 @@ def __init__(self, env: Env, original_obs_key: str = "ORI_OBS"):
             )
         self._original_obs_key = original_obs_key
         super().__init__(env)
-        self._update_obs_space()
-
-    def _update_obs_space(self):
-        # need to reset before render.
-        self.env.reset()
-        example_rgb_obs = self.env.render()
-        new_rgb_space = gym.spaces.Box(
-            low=0,
-            high=255,
-            shape=example_rgb_obs.shape,
-            dtype=np.uint8,
-        )
-        curr_sapce = self.observation_space
-        if isinstance(curr_sapce, gym.spaces.Dict):
-            curr_sapce.spaces[HR_OBS_KEY] = new_rgb_space
-        else:
-            self.observation_space = gym.spaces.Dict(
-                {
-                    HR_OBS_KEY: new_rgb_space,
-                    self._original_obs_key: curr_sapce,
-                },
-            )
 
-    def _add_hr_obs(
+    def observation(
         self,
         obs: Union[np.ndarray, Dict[str, np.ndarray]],
     ) -> Dict[str, np.ndarray]:
@@ -284,51 +261,3 @@ def _add_hr_obs(
             raise KeyError(f"{HR_OBS_KEY!r} already exists in observation dict")
         obs[HR_OBS_KEY] = self.env.render()  # type: ignore[assignment]
         return obs
-
-    def reset(self, **kwargs):
-        obs, info = super().reset(**kwargs)
-        return self._add_hr_obs(obs), info
-
-    def step(self, action):
-        obs, rew, terminated, truncated, info = self.env.step(action)
-        return self._add_hr_obs(obs), rew, terminated, truncated, info
-
-
-def remove_rgb_obs_space(obs_space: gym.Space) -> gym.Space:
-    """Removes rgb observation space from the observation space."""
-    if not isinstance(obs_space, gym.spaces.Dict):
-        return obs_space
-    if HR_OBS_KEY not in obs_space.spaces:
-        return obs_space
-    if len(obs_space.keys()) == 1:
-        raise ValueError(
-            "Only human readable observation space exists, can't remove it",
-        )
-    # keeps the original obs_space unchanged in case it is used elsewhere.
-    new_obs_space = gym.spaces.Dict(obs_space.spaces.copy())
-    del new_obs_space.spaces[HR_OBS_KEY]
-    if len(new_obs_space.spaces) == 1:
-        # unwrap dictionary structure
-        return next(iter(new_obs_space.values()))
-    return new_obs_space
-
-
-def remove_rgb_obs(
-    obs: Union[Dict[str, np.ndarray], Dict[str, th.Tensor], np.ndarray, th.Tensor],
-) -> Union[Dict[str, np.ndarray], Dict[str, th.Tensor], np.ndarray, th.Tensor]:
-    """Removes rgb observation from the observation."""
-    if not isinstance(obs, dict):
-        return obs
-    if HR_OBS_KEY not in obs:
-        return obs
-    if len(obs) == 1:
-        raise ValueError(
-            "Only human readable observation exists, can't remove it",
-        )
-    # keeps the original observation unchanged in case it is used elsewhere.
-    new_obs = obs.copy()
-    del new_obs[HR_OBS_KEY]
-    if len(new_obs) == 1:
-        # unwrap dictionary structure
-        return next(iter(new_obs.values()))  # type: ignore[return-value]
-    return new_obs
diff --git a/src/imitation/policies/interactive.py b/src/imitation/policies/interactive.py
@@ -65,9 +65,6 @@ def _choose_action(
         if self.clear_screen_on_query:
             util.clear_screen()
 
-        if isinstance(obs, dict):
-            raise ValueError("Dictionary observations are not supported here")
-
         context = self._render(obs)
         key = self._get_input_key()
         self._clean_up(context)