diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 08a8e33e72..7d1fffd938 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,9 +56,13 @@ repos: (?x)^( .*cs.meta| .*.css| - .*.meta + .*.meta| + .*.asset| + .*.prefab| + .*.unity| + .*.json )$ - args: [--fix=lf] + args: [--fix=crlf] - id: trailing-whitespace name: trailing-whitespace-markdown diff --git a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb index e5d3d45c8b..83aad09aba 100644 --- a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb +++ b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb @@ -161,8 +161,8 @@ "from pathlib import Path\n", "from typing import Callable, Any\n", "\n", - "import gym\n", - "from gym import Env\n", + "import gymnasium as gym\n", + "from gymnasium import Env\n", "\n", "from stable_baselines3 import PPO\n", "from stable_baselines3.common.vec_env import VecMonitor, VecEnv, SubprocVecEnv\n", diff --git a/docs/Python-Gym-API-Documentation.md b/docs/Python-Gym-API-Documentation.md index b35771fc46..e92edce5e0 100644 --- a/docs/Python-Gym-API-Documentation.md +++ b/docs/Python-Gym-API-Documentation.md @@ -59,18 +59,22 @@ Environment initialization #### reset ```python - | reset() -> Union[List[np.ndarray], np.ndarray] + | reset(*, seed: int | None = None, options: dict[str, Any] | None = None) -> Tuple[np.ndarray, Dict] ``` -Resets the state of the environment and returns an initial observation. -Returns: observation (object/list): the initial observation of the -space. +Resets the state of the environment and returns an initial observation and info. + +**Returns**: + +- `observation` _object/list_ - the initial observation of the + space. +- `info` _dict_ - contains auxiliary diagnostic information. #### step ```python - | step(action: List[Any]) -> GymStepResult + | step(action: Any) -> GymStepResult ``` Run one timestep of the environment's dynamics. When end of @@ -86,14 +90,15 @@ Accepts an action and returns a tuple (observation, reward, done, info). - `observation` _object/list_ - agent's observation of the current environment reward (float/list) : amount of reward returned after previous action -- `done` _boolean/list_ - whether the episode has ended. +- `terminated` _boolean/list_ - whether the episode has ended by termination. +- `truncated` _boolean/list_ - whether the episode has ended by truncation. - `info` _dict_ - contains auxiliary diagnostic information. #### render ```python - | render(mode="rgb_array") + | render() ``` Return the latest visual observations. diff --git a/docs/Python-Gym-API.md b/docs/Python-Gym-API.md index 97869899ce..59ce44eeb6 100644 --- a/docs/Python-Gym-API.md +++ b/docs/Python-Gym-API.md @@ -93,7 +93,7 @@ observation, a single discrete action and a single Agent in the scene. 
Add the following code to the `train_unity.py` file: ```python -import gym +import gymnasium as gym from baselines import deepq from baselines import logger diff --git a/docs/Python-PettingZoo-API-Documentation.md b/docs/Python-PettingZoo-API-Documentation.md index 233e45e805..423a09d64e 100644 --- a/docs/Python-PettingZoo-API-Documentation.md +++ b/docs/Python-PettingZoo-API-Documentation.md @@ -21,7 +21,6 @@ * [action\_space](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.action_space) * [side\_channel](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.side_channel) * [reset](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.reset) - * [seed](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.seed) * [render](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.render) * [close](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.close) @@ -137,7 +136,7 @@ Initializes a Unity Parallel environment wrapper. #### reset ```python - | reset() -> Dict[str, Any] + | reset(seed: int | None = None, options: dict | None = None) -> Tuple[Dict[str, Any], Dict[str, Dict]] ``` Resets the environment. @@ -207,32 +206,24 @@ of an environment with `env.side_channel[]`. #### reset ```python - | reset() + | reset(seed: int | None = None, options: dict | None = None) -> Any ``` Resets the environment. - -#### seed - -```python - | seed(seed=None) -``` - -Reseeds the environment (making the resulting environment deterministic). -`reset()` must be called after `seed()`, and before `step()`. - #### render ```python - | render(mode="human") + | render() ``` NOT SUPPORTED. -Displays a rendered frame from the environment, if supported. -Alternate render modes in the default environments are `'rgb_array'` +Renders the environment as specified by self.render_mode, if supported. + +Render mode can be `human` to display a window. +Other render modes in the default environments are `'rgb_array'` which returns a numpy array and is supported by all environments outside of classic, and `'ansi'` which returns the strings printed (specific to classic environments). diff --git a/docs/Python-PettingZoo-API.md b/docs/Python-PettingZoo-API.md index 2c62ed8415..651932a9e4 100644 --- a/docs/Python-PettingZoo-API.md +++ b/docs/Python-PettingZoo-API.md @@ -25,13 +25,13 @@ Here's an example of interacting with wrapped environment: ```python from mlagents_envs.environment import UnityEnvironment -from mlagents_envs.envs import UnityToPettingZooWrapper +from mlagents_envs.envs.unity_aec_env import UnityAECEnv unity_env = UnityEnvironment("StrikersVsGoalie") -env = UnityToPettingZooWrapper(unity_env) +env = UnityAECEnv(unity_env) env.reset() for agent in env.agent_iter(): - observation, reward, done, info = env.last() + observation, reward, terminated, truncated, info = env.last() action = policy(observation, agent) env.step(action) ``` diff --git a/ml-agents-envs/README.md b/ml-agents-envs/README.md index 4db68723d2..b6b94f7a18 100644 --- a/ml-agents-envs/README.md +++ b/ml-agents-envs/README.md @@ -12,14 +12,6 @@ The LLAPI is used by the trainer implementation in `mlagents`. `mlagents_envs` can be used independently of `mlagents` for Python communication. 
-## Installation
-
-Install the `mlagents_envs` package with:
-
-```sh
-python -m pip install mlagents_envs==1.1.0
-```
-
 ## Usage & More Information

 See
@@ -42,3 +34,32 @@ scene with the ML-Agents SDK, check out the main
 - Communication between Unity and the Python `UnityEnvironment` is not
   secure.
 - On Linux, ports are not released immediately after the communication closes.
   As such, you cannot reuse ports right after closing a `UnityEnvironment`.
+
+## Development and publishing (Wargaming artifactory)
+
+Since this package no longer appears to be maintained by the official developers, we have forked it to the Wargaming GitLab and maintain it there.
+Publishing is done via the [Wargaming artifactory](https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple).
+
+To contribute to the `mlagents_envs` package, please work on a branch and create a merge request to `master` once ready.
+Once the merge request is approved and merged into the `master` branch, a GitLab pipeline will automatically create a new git tag and publish the new version to the Wargaming artifactory.
+
+## Installation (Wargaming artifactory)
+
+Since publishing is done via the Wargaming artifactory, you can use this package as a dependency by adding the following to your `pyproject.toml`:
+
+```toml
+[tool.poetry.dependencies]
+mlagents-envs = { version = "^0.1", source = "artifactory" }
+
+[[tool.poetry.source]]
+name = "artifactory"
+url = "https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple"
+priority = "explicit"
+```
+
+
+Alternatively, you can install the `mlagents_envs` package from the Wargaming artifactory using pip:
+
+```bash
+pip install mlagents-envs --extra-index-url https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple
+```
diff --git a/ml-agents-envs/mlagents_envs/base_env.py b/ml-agents-envs/mlagents_envs/base_env.py
index 67deb26e85..eb3acdfff2 100644
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
@@ -138,6 +138,30 @@ def __getitem__(self, agent_id: AgentId) -> DecisionStep:
     def __iter__(self) -> Iterator[Any]:
         yield from self.agent_id

+    def __add__(self, other: "DecisionSteps") -> "DecisionSteps":
+        assert isinstance(other, DecisionSteps)
+
+        combined_terminal_steps = DecisionSteps(
+            list(np.hstack([self.obs, other.obs])),
+            np.hstack([self.reward, other.reward]),
+            np.hstack([self.agent_id, other.agent_id]),
+            list(np.hstack([self.action_mask, other.action_mask]))
+            if self.action_mask or other.action_mask
+            else None,
+            np.hstack([self.group_id, other.group_id]),
+            np.hstack([self.group_reward, other.group_reward]),
+        )
+        combined_terminal_steps._agent_id_to_index = {
+            **self.agent_id_to_index,
+            # shift index of added termination steps because of appending
+            **{
+                agent_id: index + len(self)
+                for agent_id, index in other.agent_id_to_index.items()
+            },
+        }
+
+        return combined_terminal_steps
+
     @staticmethod
     def empty(spec: "BehaviorSpec") -> "DecisionSteps":
         """
@@ -245,6 +269,28 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
     def __iter__(self) -> Iterator[Any]:
         yield from self.agent_id

+    def __add__(self, other: "TerminalSteps") -> "TerminalSteps":
+        assert isinstance(other, TerminalSteps)
+
+        combined_terminal_steps = TerminalSteps(
+            list(np.hstack([self.obs, other.obs])),
+            np.hstack([self.reward, other.reward]),
+            np.hstack([self.interrupted, other.interrupted]),
+            np.hstack([self.agent_id, other.agent_id]),
+            np.hstack([self.group_id, other.group_id]),
np.hstack([self.group_reward, other.group_reward]), + ) + combined_terminal_steps._agent_id_to_index = { + **self.agent_id_to_index, + # shift index of added termination steps because of appending + **{ + agent_id: index + len(self) + for agent_id, index in other.agent_id_to_index.items() + }, + } + + return combined_terminal_steps + @staticmethod def empty(spec: "BehaviorSpec") -> "TerminalSteps": """ diff --git a/ml-agents-envs/mlagents_envs/envs/env_helpers.py b/ml-agents-envs/mlagents_envs/envs/env_helpers.py index 768e670603..7d8ef10687 100644 --- a/ml-agents-envs/mlagents_envs/envs/env_helpers.py +++ b/ml-agents-envs/mlagents_envs/envs/env_helpers.py @@ -17,7 +17,11 @@ def _unwrap_batch_steps(batch_steps, behavior_name): termination_id = [ _behavior_to_agent_id(behavior_name, i) for i in termination_batch.agent_id ] - agents = decision_id + termination_id + agents = decision_id + for id in termination_id: + if id not in agents: + agents.append(id) + obs = { agent_id: [batch_obs[i] for batch_obs in termination_batch.obs] for i, agent_id in enumerate(termination_id) @@ -40,30 +44,46 @@ def _unwrap_batch_steps(batch_steps, behavior_name): } ) obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()} - dones = {agent_id: True for agent_id in termination_id} - dones.update({agent_id: False for agent_id in decision_id}) rewards = { - agent_id: termination_batch.reward[i] - for i, agent_id in enumerate(termination_id) + agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id) } rewards.update( - {agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id)} + { + agent_id: termination_batch.reward[i] + for i, agent_id in enumerate(termination_id) + } ) cumulative_rewards = {k: v for k, v in rewards.items()} infos = {} + terminations = {} + truncations = {} for i, agent_id in enumerate(decision_id): infos[agent_id] = {} infos[agent_id]["behavior_name"] = behavior_name infos[agent_id]["group_id"] = decision_batch.group_id[i] infos[agent_id]["group_reward"] = decision_batch.group_reward[i] + truncations[agent_id] = False + terminations[agent_id] = False for i, agent_id in enumerate(termination_id): infos[agent_id] = {} infos[agent_id]["behavior_name"] = behavior_name infos[agent_id]["group_id"] = termination_batch.group_id[i] infos[agent_id]["group_reward"] = termination_batch.group_reward[i] - infos[agent_id]["interrupted"] = termination_batch.interrupted[i] + truncated = bool(termination_batch.interrupted[i]) + infos[agent_id]["interrupted"] = truncated + truncations[agent_id] = truncated + terminations[agent_id] = not truncated id_map = {agent_id: i for i, agent_id in enumerate(decision_id)} - return agents, obs, dones, rewards, cumulative_rewards, infos, id_map + return ( + agents, + obs, + terminations, + truncations, + rewards, + cumulative_rewards, + infos, + id_map, + ) def _parse_behavior(full_behavior): diff --git a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py index 4bb6fdf390..d7dea3fc10 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py @@ -1,5 +1,5 @@ from typing import Any, Optional -from gym import error +from gymnasium import error from mlagents_envs.base_env import BaseEnv from pettingzoo import AECEnv @@ -53,7 +53,8 @@ def observe(self, agent_id): return ( self._observations[agent_id], self._cumm_rewards[agent_id], - self._dones[agent_id], + self._terminations[agent_id], + self._truncations[agent_id], 
self._infos[agent_id], ) @@ -61,8 +62,10 @@ def last(self, observe=True): """ returns observation, cumulative reward, done, info for the current agent (specified by self.agent_selection) """ - obs, reward, done, info = self.observe(self._agents[self._agent_index]) - return obs if observe else None, reward, done, info + obs, cumm_rewards, terminated, truncated, info = self.observe( + self._agents[self._agent_index] + ) + return obs if observe else None, cumm_rewards, terminated, truncated, info @property def agent_selection(self): diff --git a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py index df29a95c9a..f4209ba837 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py @@ -3,8 +3,8 @@ import numpy as np from typing import Any, Dict, List, Optional, Tuple, Union -import gym -from gym import error, spaces +import gymnasium as gym +from gymnasium import error, spaces from mlagents_envs.base_env import ActionTuple, BaseEnv from mlagents_envs.base_env import DecisionSteps, TerminalSteps @@ -20,7 +20,7 @@ class UnityGymException(error.Error): logger = logging_util.get_logger(__name__) -GymStepResult = Tuple[np.ndarray, float, bool, Dict] +GymStepResult = Tuple[np.ndarray, float, bool, bool, Dict] class UnityToGymWrapper(gym.Env): @@ -151,11 +151,16 @@ def __init__( else: self._observation_space = list_spaces[0] # only return the first one - def reset(self) -> Union[List[np.ndarray], np.ndarray]: - """Resets the state of the environment and returns an initial observation. - Returns: observation (object/list): the initial observation of the + def reset( + self, *, seed: int | None = None, options: dict[str, Any] | None = None + ) -> Tuple[np.ndarray, Dict]: + """Resets the state of the environment and returns an initial observation and info. + Returns: + observation (object/list): the initial observation of the space. + info (dict): contains auxiliary diagnostic information. """ + super().reset(seed=seed, options=options) self._env.reset() decision_step, _ = self._env.get_steps(self.name) n_agents = len(decision_step) @@ -163,9 +168,9 @@ def reset(self) -> Union[List[np.ndarray], np.ndarray]: self.game_over = False res: GymStepResult = self._single_step(decision_step) - return res[0] + return res[0], res[4] - def step(self, action: List[Any]) -> GymStepResult: + def step(self, action: Any) -> GymStepResult: """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. @@ -175,14 +180,15 @@ def step(self, action: List[Any]) -> GymStepResult: Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action - done (boolean/list): whether the episode has ended. + terminated (boolean/list): whether the episode has ended by termination. + truncated (boolean/list): whether the episode has ended by truncation. info (dict): contains auxiliary diagnostic information. """ if self.game_over: raise UnityGymException( "You are calling 'step()' even though this environment has already " - "returned done = True. You must always call 'reset()' once you " - "receive 'done = True'." + "returned `terminated` or `truncated` as True. You must always call 'reset()' once you " + "receive `terminated` or `truncated` as True." 
) if self._flattener is not None: # Translate action into list @@ -227,9 +233,19 @@ def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResu visual_obs = self._get_vis_obs_list(info) self.visual_obs = self._preprocess_single(visual_obs[0][0]) - done = isinstance(info, TerminalSteps) + if isinstance(info, TerminalSteps): + interrupted = info.interrupted[0] + terminated, truncated = not interrupted, interrupted + else: + terminated, truncated = False, False - return (default_observation, info.reward[0], done, {"step": info}) + return ( + default_observation, + info.reward[0], + terminated, + truncated, + {"step": info}, + ) def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray: if self.uint8_visual: @@ -276,7 +292,7 @@ def _get_vec_obs_size(self) -> int: result += obs_spec.shape[0] return result - def render(self, mode="rgb_array"): + def render(self): """ Return the latest visual observations. Note that it will not render a new frame of the environment. diff --git a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py index 09398d27fa..9121199e4a 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py @@ -1,5 +1,5 @@ from typing import Optional, Dict, Any, Tuple -from gym import error +from gymnasium import error from mlagents_envs.base_env import BaseEnv from pettingzoo import ParallelEnv @@ -20,13 +20,17 @@ def __init__(self, env: BaseEnv, seed: Optional[int] = None): """ super().__init__(env, seed) - def reset(self) -> Dict[str, Any]: + def reset( + self, + seed: int | None = None, + options: dict | None = None, + ) -> Tuple[Dict[str, Any], Dict[str, Dict]]: """ Resets the environment. 
""" - super().reset() + super().reset(seed=seed, options=options) - return self._observations + return self._observations, self._infos def step(self, actions: Dict[str, Any]) -> Tuple: self._assert_loaded() @@ -46,8 +50,13 @@ def step(self, actions: Dict[str, Any]) -> Tuple: # Step environment self._step() - # Agent cleanup and sorting - self._cleanup_agents() + # Agent sorting self._live_agents.sort() # unnecessary, only for passing API test - return self._observations, self._rewards, self._dones, self._infos + return ( + self._observations, + self._rewards, + self._terminations, + self._truncations, + self._infos, + ) diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py index 3457f18c88..3c62885f12 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -1,7 +1,7 @@ import atexit from typing import Optional, List, Set, Dict, Any, Tuple import numpy as np -from gym import error, spaces +from gymnasium import error, spaces from mlagents_envs.base_env import BaseEnv, ActionTuple from mlagents_envs.envs.env_helpers import _agent_id_to_behavior, _unwrap_batch_steps @@ -32,7 +32,8 @@ def __init__( self._possible_agents: Set[str] = set() # all agents that have ever appear self._agent_id_to_index: Dict[str, int] = {} # agent_id: index in decision step self._observations: Dict[str, np.ndarray] = {} # agent_id: obs - self._dones: Dict[str, bool] = {} # agent_id: done + self._terminations: Dict[str, bool] = {} # agent_id: terminated + self._truncations: Dict[str, bool] = {} # agent_id: truncated self._rewards: Dict[str, float] = {} # agent_id: reward self._cumm_rewards: Dict[str, float] = {} # agent_id: reward self._infos: Dict[str, Dict] = {} # agent_id: info @@ -45,7 +46,7 @@ def __init__( if not self._env.behavior_specs: self._env.step() for behavior_name in self._env.behavior_specs.keys(): - _, _, _ = self._batch_update(behavior_name) + _, _, _, _ = self._batch_update(behavior_name) self._update_observation_spaces() self._update_action_spaces() @@ -132,7 +133,7 @@ def _update_action_spaces(self) -> None: continue if act_spec.continuous_size > 0: c_space = spaces.Box( - -1, 1, (act_spec.continuous_size,), dtype=np.int32 + -1, 1, (act_spec.continuous_size,), dtype=np.float64 ) if self._seed is not None: c_space.seed(self._seed) @@ -162,13 +163,13 @@ def _process_action(self, current_agent, action): else: action = ActionTuple(action, None) - if not self._dones[current_agent]: + if not self._terminations[current_agent] or self._truncations[current_agent]: current_behavior = _agent_id_to_behavior(current_agent) current_index = self._agent_id_to_index[current_agent] if action.continuous is not None: self._current_action[current_behavior].continuous[ current_index - ] = action.continuous[0] + ] = action.continuous if action.discrete is not None: self._current_action[current_behavior].discrete[ current_index @@ -176,7 +177,8 @@ def _process_action(self, current_agent, action): else: self._live_agents.remove(current_agent) del self._observations[current_agent] - del self._dones[current_agent] + del self._terminations[current_agent] + del self._truncations[current_agent] del self._rewards[current_agent] del self._cumm_rewards[current_agent] del self._infos[current_agent] @@ -184,18 +186,51 @@ def _process_action(self, current_agent, action): def _step(self): for behavior_name, actions in self._current_action.items(): 
            self._env.set_actions(behavior_name, actions)
-        self._env.step()
+
+        def step_and_return_steps(behavior_name):
+            self._env.step()
+            decision_steps, termination_steps = self._env.get_steps(behavior_name)
+            return decision_steps, termination_steps
+
+        # DecisionSteps are assumed to come in synchronously every `DecisionPeriod` frames,
+        # but TerminationSteps can be sent in between. Therefore, to collect step information about all agents,
+        # we need to continue stepping the environment.
+        # NOTE: This can lead to returning TerminationSteps and subsequent DecisionSteps at the same time for an agent
+        # (but this was also possible before).
+        for behavior_name in self._env.behavior_specs.keys():
+            decision_steps, termination_steps = step_and_return_steps(behavior_name)
+            collected_decision_steps = decision_steps
+            collected_termination_steps = termination_steps
+            while not len(set(collected_decision_steps.agent_id)) >= len(self._agents):
+                decision_steps, termination_steps = step_and_return_steps(behavior_name)
+                if len(decision_steps) > 0:
+                    collected_decision_steps += decision_steps
+                if len(termination_steps) > 0:
+                    collected_termination_steps += termination_steps
+
+            self._env._env_state[behavior_name] = (
+                collected_decision_steps,
+                collected_termination_steps,
+            )
+
         self._reset_states()
         for behavior_name in self._env.behavior_specs.keys():
-            dones, rewards, cumulative_rewards = self._batch_update(behavior_name)
-            self._dones.update(dones)
+            terminations, truncations, rewards, cumulative_rewards = self._batch_update(
+                behavior_name
+            )
+            self._terminations.update(terminations)
+            self._truncations.update(truncations)
             self._rewards.update(rewards)
             self._cumm_rewards.update(cumulative_rewards)
         self._agent_index = 0

     def _cleanup_agents(self):
-        for current_agent, done in self.dones.items():
-            if done:
+        for current_agent, terminated in self.terminations.items():
+            if terminated:
+                self._live_agents.remove(current_agent)
+
+        for current_agent, truncated in self.truncations.items():
+            if truncated:
                 self._live_agents.remove(current_agent)

     @property
@@ -226,25 +261,33 @@ def _reset_states(self):
         self._live_agents = []
         self._agents = []
         self._observations = {}
-        self._dones = {}
+        self._terminations = {}
+        self._truncations = {}
         self._rewards = {}
         self._cumm_rewards = {}
         self._infos = {}
         self._agent_id_to_index = {}

-    def reset(self):
+    def reset(
+        self,
+        seed: int | None = None,
+        options: dict | None = None,
+    ) -> Any:
         """
         Resets the environment.
""" + self._seed = seed + self._assert_loaded() self._agent_index = 0 self._reset_states() self._possible_agents = set() self._env.reset() for behavior_name in self._env.behavior_specs.keys(): - _, _, _ = self._batch_update(behavior_name) + _, _, _, _ = self._batch_update(behavior_name) self._live_agents.sort() # unnecessary, only for passing API test - self._dones = {agent: False for agent in self._agents} + self._terminations = {agent: False for agent in self._agents} + self._truncations = {agent: False for agent in self._agents} self._rewards = {agent: 0 for agent in self._agents} self._cumm_rewards = {agent: 0 for agent in self._agents} @@ -256,7 +299,8 @@ def _batch_update(self, behavior_name): ( agents, obs, - dones, + terminations, + truncations, rewards, cumulative_rewards, infos, @@ -268,33 +312,32 @@ def _batch_update(self, behavior_name): self._infos.update(infos) self._agent_id_to_index.update(id_map) self._possible_agents.update(agents) - return dones, rewards, cumulative_rewards - - def seed(self, seed=None): - """ - Reseeds the environment (making the resulting environment deterministic). - `reset()` must be called after `seed()`, and before `step()`. - """ - self._seed = seed + return terminations, truncations, rewards, cumulative_rewards - def render(self, mode="human"): + def render(self): """ NOT SUPPORTED. - Displays a rendered frame from the environment, if supported. - Alternate render modes in the default environments are `'rgb_array'` + Renders the environment as specified by self.render_mode, if supported. + + Render mode can be `human` to display a window. + Other render modes in the default environments are `'rgb_array'` which returns a numpy array and is supported by all environments outside of classic, and `'ansi'` which returns the strings printed (specific to classic environments). """ pass @property - def dones(self): - return dict(self._dones) + def terminations(self): + return dict(self._terminations) + + @property + def truncations(self): + return dict(self._truncations) @property def agents(self): - return sorted(self._live_agents) + return sorted(self._agents) @property def rewards(self): diff --git a/ml-agents-envs/pyproject.toml b/ml-agents-envs/pyproject.toml new file mode 100644 index 0000000000..104caf7a78 --- /dev/null +++ b/ml-agents-envs/pyproject.toml @@ -0,0 +1,38 @@ +[tool.poetry] +name = "mlagents_envs" +version = "None" +description = "Unity Machine Learning Agents Interface" +homepage = "https://github.com/Unity-Technologies/ml-agents" +authors = ["Unity Technologies "] +classifiers=[ + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +readme = "README.md" + +packages = [ + { include = "mlagents_envs", from = "." 
}, +] +include = ["mlagents_envs/*"] +exclude = ["*.tests", "*.tests.*", "tests.*", "tests", "colabs", "*.ipynb"] + +[tool.poetry.dependencies] +python = "^3.9" +grpcio = ">=1.11.0,<=1.48.2" +Pillow = ">=4.2.1" +protobuf = ">=3.6,<3.21" +pyyaml = ">=3.1.0" +gymnasium = ">=0.25.0" +pettingzoo = ">=1.22.0" +numpy = ">=1.23.5,<2.0" +filelock = ">=3.4.0" +cloudpickle = "*" + +[build-system] +requires = ["poetry-core>=1.9.0"] +build-backend = "poetry.core.masonry.api" diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py index fcbee96151..2eb0a0401d 100644 --- a/ml-agents-envs/setup.py +++ b/ml-agents-envs/setup.py @@ -4,7 +4,7 @@ from setuptools.command.install import install import mlagents_envs -VERSION = mlagents_envs.__version__ +VERSION = (None,) EXPECTED_TAG = mlagents_envs.__release_tag__ here = os.path.abspath(os.path.dirname(__file__)) @@ -35,7 +35,7 @@ def run(self): setup( name="mlagents_envs", - version=VERSION, + version=None, description="Unity Machine Learning Agents Interface", long_description=long_description, long_description_content_type="text/markdown", @@ -58,12 +58,12 @@ def run(self): "Pillow>=4.2.1", "protobuf>=3.6,<3.21", "pyyaml>=3.1.0", - "gym>=0.21.0", - "pettingzoo==1.15.0", - "numpy>=1.23.5,<1.24.0", + "gymnasium>=0.25.0", + "pettingzoo>=1.22.0", + "numpy>=1.23.5,<2.0", "filelock>=3.4.0", ], - python_requires=">=3.10.1,<=3.10.12", + python_requires=">=3.9,<4", # TODO: Remove this once mypy stops having spurious setuptools issues. cmdclass={"verify": VerifyVersionCommand}, # type: ignore ) diff --git a/ml-agents-envs/tests/test_gym.py b/ml-agents-envs/tests/test_gym.py index 4fc2bf548c..21afdc0c9f 100644 --- a/ml-agents-envs/tests/test_gym.py +++ b/ml-agents-envs/tests/test_gym.py @@ -2,7 +2,7 @@ import pytest import numpy as np -from gym import spaces +from gymnasium import spaces from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper from mlagents_envs.base_env import ( diff --git a/ml-agents/mlagents/trainers/subprocess_env_manager.py b/ml-agents/mlagents/trainers/subprocess_env_manager.py index 43d468f2bc..8f767e23d0 100644 --- a/ml-agents/mlagents/trainers/subprocess_env_manager.py +++ b/ml-agents/mlagents/trainers/subprocess_env_manager.py @@ -12,7 +12,7 @@ UnityCommunicatorStoppedException, ) from multiprocessing import Process, Pipe, Queue -from multiprocessing.connection import Connection +from multiprocessing.connection import Connection, PipeConnection from queue import Empty as EmptyQueueException from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec from mlagents_envs import logging_util @@ -77,7 +77,7 @@ class StepResponse(NamedTuple): class UnityEnvWorker: - def __init__(self, process: Process, worker_id: int, conn: Connection): + def __init__(self, process: Process, worker_id: int, conn: PipeConnection): self.process = process self.worker_id = worker_id self.conn = conn diff --git a/utils/generate_markdown_docs.py b/utils/generate_markdown_docs.py index 7566b1bdc7..5ce432b3a2 100755 --- a/utils/generate_markdown_docs.py +++ b/utils/generate_markdown_docs.py @@ -6,7 +6,6 @@ import argparse import hashlib - # pydoc-markdown -I . 
-m module_name --render_toc > doc.md @@ -52,8 +51,8 @@ def remove_trailing_whitespace(filename): # compare source and destination and write only if changed if source_file != destination_file: num_changed += 1 - with open(filename, "wb") as f: - f.write(destination_file.encode()) + with open(filename, "w", newline="\r\n") as f: + f.write(destination_file) if __name__ == "__main__": @@ -84,7 +83,7 @@ def remove_trailing_whitespace(filename): for submodule in submodules: module_args.append("-m") module_args.append(f"{module_name}.{submodule}") - with open(output_file_name, "w") as output_file: + with open(output_file_name, "wb") as output_file: subprocess_args = [ "pydoc-markdown", "-I",
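
For readers reviewing the migration above, here is a minimal usage sketch (not part of the patch) of the gymnasium-style API that the updated `UnityToGymWrapper` exposes: `reset()` returning `(observation, info)` and `step()` returning a 5-tuple. The environment binary path `"3DBall"` is a placeholder.

```python
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

# "3DBall" is a placeholder path to a built Unity environment binary.
unity_env = UnityEnvironment("3DBall")
env = UnityToGymWrapper(unity_env)

# reset() now returns (observation, info) instead of a bare observation.
observation, info = env.reset(seed=42)

terminated = truncated = False
while not (terminated or truncated):
    action = env.action_space.sample()
    # step() now returns (obs, reward, terminated, truncated, info) instead of
    # the old 4-tuple with a single `done` flag.
    observation, reward, terminated, truncated, info = env.step(action)

env.close()
```

Because this follows the Gymnasium contract, the wrapped environment should also be usable with libraries that expect it, such as Stable-Baselines3 2.x.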