From 6bd65f14739ceb0ba95cf7c64ff8f211f6c51419 Mon Sep 17 00:00:00 2001
From: a_zap
Date: Wed, 28 May 2025 17:29:08 +0200
Subject: [PATCH 1/9] Adjusted .pre-commit-config.yaml for Windows usage

---
 .pre-commit-config.yaml | 8 ++++++--
 utils/generate_markdown_docs.py | 7 +++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 08a8e33e72..7d1fffd938 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,9 +56,13 @@ repos:
             (?x)^(
                 .*cs.meta|
                 .*.css|
-                .*.meta
+                .*.meta|
+                .*.asset|
+                .*.prefab|
+                .*.unity|
+                .*.json
             )$
-      args: [--fix=lf]
+      args: [--fix=crlf]

   - id: trailing-whitespace
     name: trailing-whitespace-markdown
diff --git a/utils/generate_markdown_docs.py b/utils/generate_markdown_docs.py
index 7566b1bdc7..5ce432b3a2 100755
--- a/utils/generate_markdown_docs.py
+++ b/utils/generate_markdown_docs.py
@@ -6,7 +6,6 @@
 import argparse
 import hashlib

-
 # pydoc-markdown -I . -m module_name --render_toc > doc.md


@@ -52,8 +51,8 @@ def remove_trailing_whitespace(filename):
     # compare source and destination and write only if changed
     if source_file != destination_file:
         num_changed += 1
-        with open(filename, "wb") as f:
-            f.write(destination_file.encode())
+        with open(filename, "w", newline="\r\n") as f:
+            f.write(destination_file)


 if __name__ == "__main__":
@@ -84,7 +83,7 @@ def remove_trailing_whitespace(filename):
         for submodule in submodules:
             module_args.append("-m")
             module_args.append(f"{module_name}.{submodule}")
-    with open(output_file_name, "w") as output_file:
+    with open(output_file_name, "wb") as output_file:
         subprocess_args = [
             "pydoc-markdown",
             "-I",

From cd2b649017dcebd1722b01fd70c68f90c1ff612c Mon Sep 17 00:00:00 2001
From: a_zap
Date: Wed, 28 May 2025 17:31:55 +0200
Subject: [PATCH 2/9] Fixed mypy issue

---
 ml-agents/mlagents/trainers/subprocess_env_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ml-agents/mlagents/trainers/subprocess_env_manager.py b/ml-agents/mlagents/trainers/subprocess_env_manager.py
index 43d468f2bc..8f767e23d0 100644
--- a/ml-agents/mlagents/trainers/subprocess_env_manager.py
+++ b/ml-agents/mlagents/trainers/subprocess_env_manager.py
@@ -12,7 +12,7 @@
     UnityCommunicatorStoppedException,
 )
 from multiprocessing import Process, Pipe, Queue
-from multiprocessing.connection import Connection
+from multiprocessing.connection import Connection, PipeConnection
 from queue import Empty as EmptyQueueException
 from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
 from mlagents_envs import logging_util
@@ -77,7 +77,7 @@ class StepResponse(NamedTuple):


 class UnityEnvWorker:
-    def __init__(self, process: Process, worker_id: int, conn: Connection):
+    def __init__(self, process: Process, worker_id: int, conn: PipeConnection):
         self.process = process
         self.worker_id = worker_id
         self.conn = conn

From 48a29df751f9c357826c00807c73abfed2caf64d Mon Sep 17 00:00:00 2001
From: a_zap
Date: Wed, 28 May 2025 17:32:27 +0200
Subject: [PATCH 3/9] Switched to gymnasium interface

- added dependency to gymnasium instead of gym
- adjusted Unity gym interface to return `truncated` on step
- adjusted Unity gym interface to accept `seed` and `options` on reset (not used)
- adjusted Unity gym interface to not accept `mode` on render (not used anyway)
- relaxed dependencies to numpy, pettingzoo and Python version
---
 docs/Python-Gym-API-Documentation.md | 19 +++++---
 .../mlagents_envs/envs/unity_gym_env.py | 44 +++++++++++++------
 ml-agents-envs/setup.py | 8 ++--
 3
files changed, 46 insertions(+), 25 deletions(-) diff --git a/docs/Python-Gym-API-Documentation.md b/docs/Python-Gym-API-Documentation.md index b35771fc46..e92edce5e0 100644 --- a/docs/Python-Gym-API-Documentation.md +++ b/docs/Python-Gym-API-Documentation.md @@ -59,18 +59,22 @@ Environment initialization #### reset ```python - | reset() -> Union[List[np.ndarray], np.ndarray] + | reset(*, seed: int | None = None, options: dict[str, Any] | None = None) -> Tuple[np.ndarray, Dict] ``` -Resets the state of the environment and returns an initial observation. -Returns: observation (object/list): the initial observation of the -space. +Resets the state of the environment and returns an initial observation and info. + +**Returns**: + +- `observation` _object/list_ - the initial observation of the + space. +- `info` _dict_ - contains auxiliary diagnostic information. #### step ```python - | step(action: List[Any]) -> GymStepResult + | step(action: Any) -> GymStepResult ``` Run one timestep of the environment's dynamics. When end of @@ -86,14 +90,15 @@ Accepts an action and returns a tuple (observation, reward, done, info). - `observation` _object/list_ - agent's observation of the current environment reward (float/list) : amount of reward returned after previous action -- `done` _boolean/list_ - whether the episode has ended. +- `terminated` _boolean/list_ - whether the episode has ended by termination. +- `truncated` _boolean/list_ - whether the episode has ended by truncation. - `info` _dict_ - contains auxiliary diagnostic information. #### render ```python - | render(mode="rgb_array") + | render() ``` Return the latest visual observations. diff --git a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py index df29a95c9a..3f0513ffb0 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py @@ -3,8 +3,8 @@ import numpy as np from typing import Any, Dict, List, Optional, Tuple, Union -import gym -from gym import error, spaces +import gymnasium as gym +from gymnasium import error, spaces from mlagents_envs.base_env import ActionTuple, BaseEnv from mlagents_envs.base_env import DecisionSteps, TerminalSteps @@ -20,7 +20,7 @@ class UnityGymException(error.Error): logger = logging_util.get_logger(__name__) -GymStepResult = Tuple[np.ndarray, float, bool, Dict] +GymStepResult = Tuple[np.ndarray, float, bool, bool, Dict] class UnityToGymWrapper(gym.Env): @@ -151,11 +151,16 @@ def __init__( else: self._observation_space = list_spaces[0] # only return the first one - def reset(self) -> Union[List[np.ndarray], np.ndarray]: - """Resets the state of the environment and returns an initial observation. - Returns: observation (object/list): the initial observation of the + def reset( + self, *, seed: int | None = None, options: dict[str, Any] | None = None + ) -> Tuple[np.ndarray, Dict]: + """Resets the state of the environment and returns an initial observation and info. + Returns: + observation (object/list): the initial observation of the space. + info (dict): contains auxiliary diagnostic information. 
""" + super().reset(seed=seed, options=options) self._env.reset() decision_step, _ = self._env.get_steps(self.name) n_agents = len(decision_step) @@ -163,9 +168,9 @@ def reset(self) -> Union[List[np.ndarray], np.ndarray]: self.game_over = False res: GymStepResult = self._single_step(decision_step) - return res[0] + return res[0], res[4] - def step(self, action: List[Any]) -> GymStepResult: + def step(self, action: Any) -> GymStepResult: """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. @@ -175,14 +180,15 @@ def step(self, action: List[Any]) -> GymStepResult: Returns: observation (object/list): agent's observation of the current environment reward (float/list) : amount of reward returned after previous action - done (boolean/list): whether the episode has ended. + terminated (boolean/list): whether the episode has ended by termination. + truncated (boolean/list): whether the episode has ended by truncation. info (dict): contains auxiliary diagnostic information. """ if self.game_over: raise UnityGymException( "You are calling 'step()' even though this environment has already " - "returned done = True. You must always call 'reset()' once you " - "receive 'done = True'." + "returned `terminated` or `truncated` as True. You must always call 'reset()' once you " + "receive `terminated` or `truncated` as True." ) if self._flattener is not None: # Translate action into list @@ -227,9 +233,19 @@ def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResu visual_obs = self._get_vis_obs_list(info) self.visual_obs = self._preprocess_single(visual_obs[0][0]) - done = isinstance(info, TerminalSteps) + if isinstance(info, TerminalSteps): + interrupted = info.interrupted + terminated, truncated = not interrupted, interrupted + else: + terminated, truncated = False, False - return (default_observation, info.reward[0], done, {"step": info}) + return ( + default_observation, + info.reward[0], + terminated, + truncated, + {"step": info}, + ) def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray: if self.uint8_visual: @@ -276,7 +292,7 @@ def _get_vec_obs_size(self) -> int: result += obs_spec.shape[0] return result - def render(self, mode="rgb_array"): + def render(self): """ Return the latest visual observations. Note that it will not render a new frame of the environment. diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py index fcbee96151..bd40cb4c01 100644 --- a/ml-agents-envs/setup.py +++ b/ml-agents-envs/setup.py @@ -58,12 +58,12 @@ def run(self): "Pillow>=4.2.1", "protobuf>=3.6,<3.21", "pyyaml>=3.1.0", - "gym>=0.21.0", - "pettingzoo==1.15.0", - "numpy>=1.23.5,<1.24.0", + "gymnasium>=0.25.0", + "pettingzoo>=1.15.0", + "numpy>=1.23.5,<2.0", "filelock>=3.4.0", ], - python_requires=">=3.10.1,<=3.10.12", + python_requires=">=3.9,<4", # TODO: Remove this once mypy stops having spurious setuptools issues. 
cmdclass={"verify": VerifyVersionCommand}, # type: ignore ) From 98b22d7557b74b80bf859ab32449a3aabf88d6a7 Mon Sep 17 00:00:00 2001 From: a_zap Date: Wed, 28 May 2025 17:34:29 +0200 Subject: [PATCH 4/9] Created pyproject.toml to support installation of package via poetry (without publishing) --- ml-agents-envs/pyproject.toml | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 ml-agents-envs/pyproject.toml diff --git a/ml-agents-envs/pyproject.toml b/ml-agents-envs/pyproject.toml new file mode 100644 index 0000000000..51752d556d --- /dev/null +++ b/ml-agents-envs/pyproject.toml @@ -0,0 +1,38 @@ +[tool.poetry] +name = "mlagents_envs" +version = "1.2.0.dev0" +description = "Unity Machine Learning Agents Interface" +homepage = "https://github.com/Unity-Technologies/ml-agents" +authors = ["Unity Technologies "] +classifiers=[ + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +readme = "README.md" + +packages = [ + { include = "mlagents_envs", from = "." }, +] +include = ["mlagents_envs/*"] +exclude = ["*.tests", "*.tests.*", "tests.*", "tests", "colabs", "*.ipynb"] + +[tool.poetry.dependencies] +python = "^3.9" +grpcio = ">=1.11.0,<=1.48.2" +Pillow = ">=4.2.1" +protobuf = ">=3.6,<3.21" +pyyaml = ">=3.1.0" +gymnasium = ">=0.25.0" +pettingzoo = ">=1.15.0" +numpy = ">=1.23.5,<2.0" +filelock = ">=3.4.0" +cloudpickle = "*" + +[build-system] +requires = ["poetry-core>=1.9.0"] +build-backend = "poetry.core.masonry.api" From 4ca9d8be7ef2a65d6e43a09654281005320bc091 Mon Sep 17 00:00:00 2001 From: alexander-zap Date: Wed, 28 May 2025 18:13:39 +0200 Subject: [PATCH 5/9] Updated imports to use gymnasium instead of gym --- colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb | 4 ++-- docs/Python-Gym-API.md | 2 +- ml-agents-envs/mlagents_envs/envs/unity_aec_env.py | 2 +- ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py | 2 +- .../mlagents_envs/envs/unity_pettingzoo_base_env.py | 2 +- ml-agents-envs/tests/test_gym.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb index e5d3d45c8b..83aad09aba 100644 --- a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb +++ b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb @@ -161,8 +161,8 @@ "from pathlib import Path\n", "from typing import Callable, Any\n", "\n", - "import gym\n", - "from gym import Env\n", + "import gymnasium as gym\n", + "from gymnasium import Env\n", "\n", "from stable_baselines3 import PPO\n", "from stable_baselines3.common.vec_env import VecMonitor, VecEnv, SubprocVecEnv\n", diff --git a/docs/Python-Gym-API.md b/docs/Python-Gym-API.md index 97869899ce..59ce44eeb6 100644 --- a/docs/Python-Gym-API.md +++ b/docs/Python-Gym-API.md @@ -93,7 +93,7 @@ observation, a single discrete action and a single Agent in the scene. 
Add the following code to the `train_unity.py` file: ```python -import gym +import gymnasium as gym from baselines import deepq from baselines import logger diff --git a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py index 4bb6fdf390..bccae65c0f 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py @@ -1,5 +1,5 @@ from typing import Any, Optional -from gym import error +from gymnasium import error from mlagents_envs.base_env import BaseEnv from pettingzoo import AECEnv diff --git a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py index 09398d27fa..906905e83b 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py @@ -1,5 +1,5 @@ from typing import Optional, Dict, Any, Tuple -from gym import error +from gymnasium import error from mlagents_envs.base_env import BaseEnv from pettingzoo import ParallelEnv diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py index 3457f18c88..c040050a2b 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -1,7 +1,7 @@ import atexit from typing import Optional, List, Set, Dict, Any, Tuple import numpy as np -from gym import error, spaces +from gymnasium import error, spaces from mlagents_envs.base_env import BaseEnv, ActionTuple from mlagents_envs.envs.env_helpers import _agent_id_to_behavior, _unwrap_batch_steps diff --git a/ml-agents-envs/tests/test_gym.py b/ml-agents-envs/tests/test_gym.py index 4fc2bf548c..21afdc0c9f 100644 --- a/ml-agents-envs/tests/test_gym.py +++ b/ml-agents-envs/tests/test_gym.py @@ -2,7 +2,7 @@ import pytest import numpy as np -from gym import spaces +from gymnasium import spaces from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper from mlagents_envs.base_env import ( From be47ce09d38bd0207379fda425dba72c284b4c30 Mon Sep 17 00:00:00 2001 From: alexander-zap Date: Fri, 13 Jun 2025 18:53:04 +0200 Subject: [PATCH 6/9] Adjusted pettingzoo environments to support gymnasium interface (reset returning obs and info; step returning terminated and truncated instead of done) --- docs/Python-PettingZoo-API.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/Python-PettingZoo-API.md b/docs/Python-PettingZoo-API.md index 2c62ed8415..651932a9e4 100644 --- a/docs/Python-PettingZoo-API.md +++ b/docs/Python-PettingZoo-API.md @@ -25,13 +25,13 @@ Here's an example of interacting with wrapped environment: ```python from mlagents_envs.environment import UnityEnvironment -from mlagents_envs.envs import UnityToPettingZooWrapper +from mlagents_envs.envs.unity_aec_env import UnityAECEnv unity_env = UnityEnvironment("StrikersVsGoalie") -env = UnityToPettingZooWrapper(unity_env) +env = UnityAECEnv(unity_env) env.reset() for agent in env.agent_iter(): - observation, reward, done, info = env.last() + observation, reward, terminated, truncated, info = env.last() action = policy(observation, agent) env.step(action) ``` From f3011030c1ad79151cb4a9f49b2f30ec93f91fc9 Mon Sep 17 00:00:00 2001 From: alexander-zap Date: Fri, 13 Jun 2025 21:20:18 +0200 Subject: [PATCH 7/9] Adjusted pettingzoo environments to support gymnasium interface (reset returning obs and info; step returning terminated and 
truncated instead of done)
---
 docs/Python-PettingZoo-API-Documentation.md | 23 ++-----
 ml-agents-envs/README.md | 37 +++++++---
 .../mlagents_envs/envs/env_helpers.py | 20 +++++-
 .../mlagents_envs/envs/unity_aec_env.py | 9 ++-
 .../mlagents_envs/envs/unity_gym_env.py | 2 +-
 .../mlagents_envs/envs/unity_parallel_env.py | 18 +++--
 .../envs/unity_pettingzoo_base_env.py | 69 ++++++++++++-------
 ml-agents-envs/pyproject.toml | 4 +-
 ml-agents-envs/setup.py | 6 +-
 9 files changed, 122 insertions(+), 66 deletions(-)

diff --git a/docs/Python-PettingZoo-API-Documentation.md b/docs/Python-PettingZoo-API-Documentation.md
index 233e45e805..423a09d64e 100644
--- a/docs/Python-PettingZoo-API-Documentation.md
+++ b/docs/Python-PettingZoo-API-Documentation.md
@@ -21,7 +21,6 @@
   * [action\_space](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.action_space)
   * [side\_channel](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.side_channel)
   * [reset](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.reset)
-  * [seed](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.seed)
   * [render](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.render)
   * [close](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.close)

@@ -137,7 +136,7 @@ Initializes a Unity Parallel environment wrapper.
 #### reset

 ```python
- | reset() -> Dict[str, Any]
+ | reset(seed: int | None = None, options: dict | None = None) -> Tuple[Dict[str, Any], Dict[str, Dict]]
 ```

 Resets the environment.

@@ -207,32 +206,24 @@ of an environment with `env.side_channel[]`.
 #### reset

 ```python
- | reset()
+ | reset(seed: int | None = None, options: dict | None = None) -> Any
 ```

 Resets the environment.

-
-#### seed
-
-```python
- | seed(seed=None)
-```
-
-Reseeds the environment (making the resulting environment deterministic).
-`reset()` must be called after `seed()`, and before `step()`.
-
 #### render

 ```python
- | render(mode="human")
+ | render()
 ```

 NOT SUPPORTED.

-Displays a rendered frame from the environment, if supported.
-Alternate render modes in the default environments are `'rgb_array'`
+Renders the environment as specified by self.render_mode, if supported.
+
+Render mode can be `human` to display a window.
+Other render modes in the default environments are `'rgb_array'`
 which returns a numpy array and is supported by all environments outside
 of classic, and `'ansi'` which returns the strings printed
 (specific to classic environments).
diff --git a/ml-agents-envs/README.md b/ml-agents-envs/README.md
index 4db68723d2..b6b94f7a18 100644
--- a/ml-agents-envs/README.md
+++ b/ml-agents-envs/README.md
@@ -12,14 +12,6 @@ The LLAPI is used by the trainer implementation in `mlagents`.
 `mlagents_envs` can be used independently of `mlagents` for Python
 communication.

-## Installation
-
-Install the `mlagents_envs` package with:
-
-```sh
-python -m pip install mlagents_envs==1.1.0
-```
-
 ## Usage & More Information

 See
@@ -42,3 +34,32 @@ scene with the ML-Agents SDK, check out the main
 - Communication between Unity and the Python `UnityEnvironment` is not
   secure.
 - On Linux, ports are not released immediately after the communication closes.
   As such, you cannot reuse ports right after closing a `UnityEnvironment`.
+
+## Development and publishing (Wargaming artifactory)
+
+Since this package no longer seems to be maintained by the official developers, we have forked it to the Wargaming GitLab and are maintaining it there.
+Publishing is done via the [Wargaming artifactory](https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple). + +To contribute to the `mlagents_envs` package, please work on a branch and create a merge request to `master` once ready. +Once the merge request is approved and merged to `master` branch, a gitlab pipeline will automatically create a new git tag and publish the new version to the Wargaming artifactory. + +## Installation (Wargaming artifactory) + +Since publishing is done via the Wargaming artifactory, you can use this package as dependency by adding the following to your `pyproject.toml`: + +```toml +[tool.poetry.dependencies] +mlagents-envs = { version = "^0.1", source = "artifactory" } + +[[tool.poetry.source]] +name = "artifactory" +url = "https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple" +priority = "explicit" +``` + + +Or you can install the `mlagents_envs` package from the Wargaming artifactory using pip: + +```bash +pip install mlagents-envs --extra-index-url https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple +``` diff --git a/ml-agents-envs/mlagents_envs/envs/env_helpers.py b/ml-agents-envs/mlagents_envs/envs/env_helpers.py index 768e670603..0c17c2b20d 100644 --- a/ml-agents-envs/mlagents_envs/envs/env_helpers.py +++ b/ml-agents-envs/mlagents_envs/envs/env_helpers.py @@ -40,8 +40,6 @@ def _unwrap_batch_steps(batch_steps, behavior_name): } ) obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()} - dones = {agent_id: True for agent_id in termination_id} - dones.update({agent_id: False for agent_id in decision_id}) rewards = { agent_id: termination_batch.reward[i] for i, agent_id in enumerate(termination_id) @@ -51,19 +49,35 @@ def _unwrap_batch_steps(batch_steps, behavior_name): ) cumulative_rewards = {k: v for k, v in rewards.items()} infos = {} + terminations = {} + truncations = {} for i, agent_id in enumerate(decision_id): infos[agent_id] = {} infos[agent_id]["behavior_name"] = behavior_name infos[agent_id]["group_id"] = decision_batch.group_id[i] infos[agent_id]["group_reward"] = decision_batch.group_reward[i] + truncations[agent_id] = False + terminations[agent_id] = False for i, agent_id in enumerate(termination_id): infos[agent_id] = {} infos[agent_id]["behavior_name"] = behavior_name infos[agent_id]["group_id"] = termination_batch.group_id[i] infos[agent_id]["group_reward"] = termination_batch.group_reward[i] infos[agent_id]["interrupted"] = termination_batch.interrupted[i] + truncated = termination_batch.interrupted[i] + truncations[agent_id] = truncated + terminations[agent_id] = not truncated id_map = {agent_id: i for i, agent_id in enumerate(decision_id)} - return agents, obs, dones, rewards, cumulative_rewards, infos, id_map + return ( + agents, + obs, + terminations, + truncations, + rewards, + cumulative_rewards, + infos, + id_map, + ) def _parse_behavior(full_behavior): diff --git a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py index bccae65c0f..d7dea3fc10 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py @@ -53,7 +53,8 @@ def observe(self, agent_id): return ( self._observations[agent_id], self._cumm_rewards[agent_id], - self._dones[agent_id], + self._terminations[agent_id], + self._truncations[agent_id], self._infos[agent_id], ) @@ -61,8 +62,10 @@ def last(self, observe=True): """ returns observation, cumulative reward, done, info for the current 
agent (specified by self.agent_selection) """ - obs, reward, done, info = self.observe(self._agents[self._agent_index]) - return obs if observe else None, reward, done, info + obs, cumm_rewards, terminated, truncated, info = self.observe( + self._agents[self._agent_index] + ) + return obs if observe else None, cumm_rewards, terminated, truncated, info @property def agent_selection(self): diff --git a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py index 3f0513ffb0..f4209ba837 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py @@ -234,7 +234,7 @@ def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResu self.visual_obs = self._preprocess_single(visual_obs[0][0]) if isinstance(info, TerminalSteps): - interrupted = info.interrupted + interrupted = info.interrupted[0] terminated, truncated = not interrupted, interrupted else: terminated, truncated = False, False diff --git a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py index 906905e83b..85ce904f24 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py @@ -20,13 +20,17 @@ def __init__(self, env: BaseEnv, seed: Optional[int] = None): """ super().__init__(env, seed) - def reset(self) -> Dict[str, Any]: + def reset( + self, + seed: int | None = None, + options: dict | None = None, + ) -> Tuple[Dict[str, Any], Dict[str, Dict]]: """ Resets the environment. """ - super().reset() + super().reset(seed=seed, options=options) - return self._observations + return self._observations, self._infos def step(self, actions: Dict[str, Any]) -> Tuple: self._assert_loaded() @@ -50,4 +54,10 @@ def step(self, actions: Dict[str, Any]) -> Tuple: self._cleanup_agents() self._live_agents.sort() # unnecessary, only for passing API test - return self._observations, self._rewards, self._dones, self._infos + return ( + self._observations, + self._rewards, + self._terminations, + self._truncations, + self._infos, + ) diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py index c040050a2b..41c1cff8c1 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -32,7 +32,8 @@ def __init__( self._possible_agents: Set[str] = set() # all agents that have ever appear self._agent_id_to_index: Dict[str, int] = {} # agent_id: index in decision step self._observations: Dict[str, np.ndarray] = {} # agent_id: obs - self._dones: Dict[str, bool] = {} # agent_id: done + self._terminations: Dict[str, bool] = {} # agent_id: terminated + self._truncations: Dict[str, bool] = {} # agent_id: truncated self._rewards: Dict[str, float] = {} # agent_id: reward self._cumm_rewards: Dict[str, float] = {} # agent_id: reward self._infos: Dict[str, Dict] = {} # agent_id: info @@ -45,7 +46,7 @@ def __init__( if not self._env.behavior_specs: self._env.step() for behavior_name in self._env.behavior_specs.keys(): - _, _, _ = self._batch_update(behavior_name) + _, _, _, _ = self._batch_update(behavior_name) self._update_observation_spaces() self._update_action_spaces() @@ -162,7 +163,7 @@ def _process_action(self, current_agent, action): else: action = ActionTuple(action, None) - if not self._dones[current_agent]: + if not 
(self._terminations[current_agent] or self._truncations[current_agent]):
             current_behavior = _agent_id_to_behavior(current_agent)
             current_index = self._agent_id_to_index[current_agent]
             if action.continuous is not None:
@@ -176,7 +177,8 @@
         else:
             self._live_agents.remove(current_agent)
             del self._observations[current_agent]
-            del self._dones[current_agent]
+            del self._terminations[current_agent]
+            del self._truncations[current_agent]
             del self._rewards[current_agent]
             del self._cumm_rewards[current_agent]
             del self._infos[current_agent]
@@ -187,15 +189,22 @@ def _step(self):
         self._env.step()
         self._reset_states()
         for behavior_name in self._env.behavior_specs.keys():
-            dones, rewards, cumulative_rewards = self._batch_update(behavior_name)
-            self._dones.update(dones)
+            terminations, truncations, rewards, cumulative_rewards = self._batch_update(
+                behavior_name
+            )
+            self._terminations.update(terminations)
+            self._truncations.update(truncations)
             self._rewards.update(rewards)
             self._cumm_rewards.update(cumulative_rewards)
         self._agent_index = 0

     def _cleanup_agents(self):
-        for current_agent, done in self.dones.items():
-            if done:
+        for current_agent, terminated in self.terminations.items():
+            if terminated:
+                self._live_agents.remove(current_agent)
+
+        for current_agent, truncated in self.truncations.items():
+            if truncated:
                 self._live_agents.remove(current_agent)

     @property
@@ -226,25 +235,33 @@ def _reset_states(self):
         self._live_agents = []
         self._agents = []
         self._observations = {}
-        self._dones = {}
+        self._terminations = {}
+        self._truncations = {}
         self._rewards = {}
         self._cumm_rewards = {}
         self._infos = {}
         self._agent_id_to_index = {}

-    def reset(self):
+    def reset(
+        self,
+        seed: int | None = None,
+        options: dict | None = None,
+    ) -> Any:
         """
         Resets the environment.
         """
+        self._seed = seed
+
         self._assert_loaded()
         self._agent_index = 0
         self._reset_states()
         self._possible_agents = set()
         self._env.reset()
         for behavior_name in self._env.behavior_specs.keys():
-            _, _, _ = self._batch_update(behavior_name)
+            _, _, _, _ = self._batch_update(behavior_name)
         self._live_agents.sort()  # unnecessary, only for passing API test
-        self._dones = {agent: False for agent in self._agents}
+        self._terminations = {agent: False for agent in self._agents}
+        self._truncations = {agent: False for agent in self._agents}
         self._rewards = {agent: 0 for agent in self._agents}
         self._cumm_rewards = {agent: 0 for agent in self._agents}
@@ -256,7 +273,8 @@ def _batch_update(self, behavior_name):
         (
             agents,
             obs,
-            dones,
+            terminations,
+            truncations,
             rewards,
             cumulative_rewards,
             infos,
@@ -268,29 +286,28 @@ def _batch_update(self, behavior_name):
         self._infos.update(infos)
         self._agent_id_to_index.update(id_map)
         self._possible_agents.update(agents)
-        return dones, rewards, cumulative_rewards
+        return terminations, truncations, rewards, cumulative_rewards

-    def seed(self, seed=None):
-        """
-        Reseeds the environment (making the resulting environment deterministic).
-        `reset()` must be called after `seed()`, and before `step()`.
-        """
-        self._seed = seed
-
-    def render(self, mode="human"):
+    def render(self):
         """
         NOT SUPPORTED.

-        Displays a rendered frame from the environment, if supported.
-        Alternate render modes in the default environments are `'rgb_array'`
+        Renders the environment as specified by self.render_mode, if supported.
+
+        Render mode can be `human` to display a window.
+        Other render modes in the default environments are `'rgb_array'`
         which returns a numpy array and is supported by all environments outside
         of classic, and `'ansi'` which returns the strings printed
         (specific to classic environments).
         """
         pass

     @property
-    def dones(self):
-        return dict(self._dones)
+    def terminations(self):
+        return dict(self._terminations)
+
+    @property
+    def truncations(self):
+        return dict(self._truncations)

     @property
     def agents(self):
diff --git a/ml-agents-envs/pyproject.toml b/ml-agents-envs/pyproject.toml
index 51752d556d..104caf7a78 100644
--- a/ml-agents-envs/pyproject.toml
+++ b/ml-agents-envs/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "mlagents_envs"
-version = "1.2.0.dev0"
+version = "None"
 description = "Unity Machine Learning Agents Interface"
 homepage = "https://github.com/Unity-Technologies/ml-agents"
 authors = ["Unity Technologies "]
@@ -28,7 +28,7 @@ Pillow = ">=4.2.1"
 protobuf = ">=3.6,<3.21"
 pyyaml = ">=3.1.0"
 gymnasium = ">=0.25.0"
-pettingzoo = ">=1.15.0"
+pettingzoo = ">=1.22.0"
 numpy = ">=1.23.5,<2.0"
 filelock = ">=3.4.0"
 cloudpickle = "*"
diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py
index bd40cb4c01..2eb0a0401d 100644
--- a/ml-agents-envs/setup.py
+++ b/ml-agents-envs/setup.py
@@ -4,7 +4,7 @@
 from setuptools.command.install import install
 import mlagents_envs

-VERSION = mlagents_envs.__version__
+VERSION = (None,)
 EXPECTED_TAG = mlagents_envs.__release_tag__

 here = os.path.abspath(os.path.dirname(__file__))
@@ -35,7 +35,7 @@ def run(self):

 setup(
     name="mlagents_envs",
-    version=VERSION,
+    version=None,
     description="Unity Machine Learning Agents Interface",
     long_description=long_description,
     long_description_content_type="text/markdown",
@@ -59,7 +59,7 @@ def run(self):
         "protobuf>=3.6,<3.21",
         "pyyaml>=3.1.0",
         "gymnasium>=0.25.0",
-        "pettingzoo>=1.15.0",
+        "pettingzoo>=1.22.0",
         "numpy>=1.23.5,<2.0",
         "filelock>=3.4.0",
     ],

From 440c5cf86a064551f7adc1171f8820e4e72469dc Mon Sep 17 00:00:00 2001
From: alexander-zap
Date: Tue, 17 Jun 2025 18:23:25 +0200
Subject: [PATCH 8/9] Set continuous action space as dtype float64 instead of int32

---
 ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
index 41c1cff8c1..34fb60ca68 100644
--- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
+++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
@@ -133,7 +133,7 @@ def _update_action_spaces(self) -> None:
                 continue
             if act_spec.continuous_size > 0:
                 c_space = spaces.Box(
-                    -1, 1, (act_spec.continuous_size,), dtype=np.int32
+                    -1, 1, (act_spec.continuous_size,), dtype=np.float64
                 )
                 if self._seed is not None:
                     c_space.seed(self._seed)

From f0d25ea1c22b676177b2e1d78a09074f2733ba70 Mon Sep 17 00:00:00 2001
From: alexander-zap
Date: Mon, 30 Jun 2025 15:27:59 +0200
Subject: [PATCH 9/9] Fixed interface problems with pettingzoo implementation

- all agents return observations synchronously (instead of having steps where only one agent returns an observation)
- in case the agent sends a TerminationStep *and* a DecisionStep after, the reward from the termination step should be taken (but for observations it's the other way around because of SB3 convention)
- `agents` attribute should store agents even if they have *just* finished (they should be removed the step after)
- fixed a bug where a continuous action vector was assigned
len(action) * action[0] - fixed double removal of an agent from _live_agents for ParallelEnv --- ml-agents-envs/mlagents_envs/base_env.py | 46 +++++++++++++++++++ .../mlagents_envs/envs/env_helpers.py | 18 +++++--- .../mlagents_envs/envs/unity_parallel_env.py | 3 +- .../envs/unity_pettingzoo_base_env.py | 32 +++++++++++-- 4 files changed, 88 insertions(+), 11 deletions(-) diff --git a/ml-agents-envs/mlagents_envs/base_env.py b/ml-agents-envs/mlagents_envs/base_env.py index 67deb26e85..eb3acdfff2 100644 --- a/ml-agents-envs/mlagents_envs/base_env.py +++ b/ml-agents-envs/mlagents_envs/base_env.py @@ -138,6 +138,30 @@ def __getitem__(self, agent_id: AgentId) -> DecisionStep: def __iter__(self) -> Iterator[Any]: yield from self.agent_id + def __add__(self, other: "DecisionSteps") -> "DecisionSteps": + assert isinstance(other, DecisionSteps) + + combined_terminal_steps = DecisionSteps( + list(np.hstack([self.obs, other.obs])), + np.hstack([self.reward, other.reward]), + np.hstack([self.agent_id, other.agent_id]), + list(np.hstack([self.action_mask, other.action_mask])) + if self.action_mask or other.action_mask + else None, + np.hstack([self.group_id, other.group_id]), + np.hstack([self.group_reward, other.group_reward]), + ) + combined_terminal_steps._agent_id_to_index = { + **self.agent_id_to_index, + # shift index of added termination steps because of appending + **{ + agent_id: index + len(self) + for agent_id, index in other.agent_id_to_index.items() + }, + } + + return combined_terminal_steps + @staticmethod def empty(spec: "BehaviorSpec") -> "DecisionSteps": """ @@ -245,6 +269,28 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep: def __iter__(self) -> Iterator[Any]: yield from self.agent_id + def __add__(self, other: "TerminalSteps") -> "TerminalSteps": + assert isinstance(other, TerminalSteps) + + combined_terminal_steps = TerminalSteps( + list(np.hstack([self.obs, other.obs])), + np.hstack([self.reward, other.reward]), + np.hstack([self.interrupted, other.interrupted]), + np.hstack([self.agent_id, other.agent_id]), + np.hstack([self.group_id, other.group_id]), + np.hstack([self.group_reward, other.group_reward]), + ) + combined_terminal_steps._agent_id_to_index = { + **self.agent_id_to_index, + # shift index of added termination steps because of appending + **{ + agent_id: index + len(self) + for agent_id, index in other.agent_id_to_index.items() + }, + } + + return combined_terminal_steps + @staticmethod def empty(spec: "BehaviorSpec") -> "TerminalSteps": """ diff --git a/ml-agents-envs/mlagents_envs/envs/env_helpers.py b/ml-agents-envs/mlagents_envs/envs/env_helpers.py index 0c17c2b20d..7d8ef10687 100644 --- a/ml-agents-envs/mlagents_envs/envs/env_helpers.py +++ b/ml-agents-envs/mlagents_envs/envs/env_helpers.py @@ -17,7 +17,11 @@ def _unwrap_batch_steps(batch_steps, behavior_name): termination_id = [ _behavior_to_agent_id(behavior_name, i) for i in termination_batch.agent_id ] - agents = decision_id + termination_id + agents = decision_id + for id in termination_id: + if id not in agents: + agents.append(id) + obs = { agent_id: [batch_obs[i] for batch_obs in termination_batch.obs] for i, agent_id in enumerate(termination_id) @@ -41,11 +45,13 @@ def _unwrap_batch_steps(batch_steps, behavior_name): ) obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()} rewards = { - agent_id: termination_batch.reward[i] - for i, agent_id in enumerate(termination_id) + agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id) } rewards.update( - {agent_id: 
decision_batch.reward[i] for i, agent_id in enumerate(decision_id)} + { + agent_id: termination_batch.reward[i] + for i, agent_id in enumerate(termination_id) + } ) cumulative_rewards = {k: v for k, v in rewards.items()} infos = {} @@ -63,8 +69,8 @@ def _unwrap_batch_steps(batch_steps, behavior_name): infos[agent_id]["behavior_name"] = behavior_name infos[agent_id]["group_id"] = termination_batch.group_id[i] infos[agent_id]["group_reward"] = termination_batch.group_reward[i] - infos[agent_id]["interrupted"] = termination_batch.interrupted[i] - truncated = termination_batch.interrupted[i] + truncated = bool(termination_batch.interrupted[i]) + infos[agent_id]["interrupted"] = truncated truncations[agent_id] = truncated terminations[agent_id] = not truncated id_map = {agent_id: i for i, agent_id in enumerate(decision_id)} diff --git a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py index 85ce904f24..9121199e4a 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py @@ -50,8 +50,7 @@ def step(self, actions: Dict[str, Any]) -> Tuple: # Step environment self._step() - # Agent cleanup and sorting - self._cleanup_agents() + # Agent sorting self._live_agents.sort() # unnecessary, only for passing API test return ( diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py index 34fb60ca68..3c62885f12 100644 --- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py +++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -169,7 +169,7 @@ def _process_action(self, current_agent, action): if action.continuous is not None: self._current_action[current_behavior].continuous[ current_index - ] = action.continuous[0] + ] = action.continuous if action.discrete is not None: self._current_action[current_behavior].discrete[ current_index @@ -186,7 +186,33 @@ def _process_action(self, current_agent, action): def _step(self): for behavior_name, actions in self._current_action.items(): self._env.set_actions(behavior_name, actions) - self._env.step() + + def step_and_return_steps(behavior_name): + self._env.step() + decision_steps, termination_steps = self._env.get_steps(behavior_name) + return decision_steps, termination_steps + + # DecisionSteps are assumed come in synchronously at every `DecisionPeriod` frame, + # but TerminationSteps can be sent inbetween. Therefore, to collect step information about all agents, + # we need to continue stepping the environment. + # NOTE: This can lead to returning TerminationSteps and subsequent DecisionSteps at the same time for an agent + # (but this was also possible before). 
+ for behavior_name in self._env.behavior_specs.keys(): + decision_steps, termination_steps = step_and_return_steps(behavior_name) + collected_decision_steps = decision_steps + collected_termination_steps = termination_steps + while not len(set(collected_decision_steps.agent_id)) >= len(self._agents): + decision_steps, termination_steps = step_and_return_steps(behavior_name) + if len(decision_steps) > 0: + collected_decision_steps += decision_steps + if len(termination_steps) > 0: + collected_termination_steps += termination_steps + + self._env._env_state[behavior_name] = ( + collected_decision_steps, + collected_termination_steps, + ) + self._reset_states() for behavior_name in self._env.behavior_specs.keys(): terminations, truncations, rewards, cumulative_rewards = self._batch_update( @@ -311,7 +337,7 @@ def truncations(self): @property def agents(self): - return sorted(self._live_agents) + return sorted(self._agents) @property def rewards(self):
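
For reference, a minimal usage sketch of the gymnasium-style single-agent API that these patches introduce; the executable path "UnityBuild" is a placeholder for a built Unity environment, and the wrapper's pre-existing `action_space` property and `close()` method are assumed unchanged by the patches above:

```python
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

# "UnityBuild" is a placeholder path to a built Unity environment executable.
unity_env = UnityEnvironment("UnityBuild")
env = UnityToGymWrapper(unity_env)

# reset() now returns (observation, info) and takes keyword-only seed/options.
observation, info = env.reset(seed=0)

terminated = truncated = False
while not (terminated or truncated):
    action = env.action_space.sample()
    # step() now returns a gymnasium-style 5-tuple; terminated/truncated
    # replace the single done flag (truncated mirrors TerminalSteps.interrupted).
    observation, reward, terminated, truncated, info = env.step(action)

env.close()
```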