diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 08a8e33e72..7d1fffd938 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,9 +56,13 @@ repos:
(?x)^(
.*cs.meta|
.*.css|
- .*.meta
+ .*.meta|
+ .*.asset|
+ .*.prefab|
+ .*.unity|
+ .*.json
)$
- args: [--fix=lf]
+ args: [--fix=crlf]
- id: trailing-whitespace
name: trailing-whitespace-markdown
diff --git a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb
index e5d3d45c8b..83aad09aba 100644
--- a/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb
+++ b/colab/Colab_UnityEnvironment_4_SB3VectorEnv.ipynb
@@ -161,8 +161,8 @@
"from pathlib import Path\n",
"from typing import Callable, Any\n",
"\n",
- "import gym\n",
- "from gym import Env\n",
+ "import gymnasium as gym\n",
+ "from gymnasium import Env\n",
"\n",
"from stable_baselines3 import PPO\n",
"from stable_baselines3.common.vec_env import VecMonitor, VecEnv, SubprocVecEnv\n",
diff --git a/docs/Python-Gym-API-Documentation.md b/docs/Python-Gym-API-Documentation.md
index b35771fc46..e92edce5e0 100644
--- a/docs/Python-Gym-API-Documentation.md
+++ b/docs/Python-Gym-API-Documentation.md
@@ -59,18 +59,22 @@ Environment initialization
#### reset
```python
- | reset() -> Union[List[np.ndarray], np.ndarray]
+ | reset(*, seed: int | None = None, options: dict[str, Any] | None = None) -> Tuple[np.ndarray, Dict]
```
-Resets the state of the environment and returns an initial observation.
-Returns: observation (object/list): the initial observation of the
-space.
+Resets the state of the environment and returns an initial observation and info.
+
+**Returns**:
+
+- `observation` _object/list_ - the initial observation of the
+ space.
+- `info` _dict_ - contains auxiliary diagnostic information.
#### step
```python
- | step(action: List[Any]) -> GymStepResult
+ | step(action: Any) -> GymStepResult
```
Run one timestep of the environment's dynamics. When end of
@@ -86,14 +90,15 @@ Accepts an action and returns a tuple (observation, reward, done, info).
- `observation` _object/list_ - agent's observation of the current environment
reward (float/list) : amount of reward returned after previous action
-- `done` _boolean/list_ - whether the episode has ended.
+- `terminated` _boolean/list_ - whether the episode has ended by termination.
+- `truncated` _boolean/list_ - whether the episode has ended by truncation.
- `info` _dict_ - contains auxiliary diagnostic information.
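+
+A minimal stepping loop against the updated gymnasium-style API might look as
+follows (a sketch only: it assumes a Unity executable built at `./3DBall` and
+uses random actions in place of a trained policy):
+
+```python
+from mlagents_envs.environment import UnityEnvironment
+from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper
+
+unity_env = UnityEnvironment("./3DBall")  # path to your build (assumption)
+env = UnityToGymWrapper(unity_env)
+
+observation, info = env.reset()
+terminated = truncated = False
+while not (terminated or truncated):
+    action = env.action_space.sample()  # replace with your policy
+    observation, reward, terminated, truncated, info = env.step(action)
+env.close()
+```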
#### render
```python
- | render(mode="rgb_array")
+ | render()
```
Return the latest visual observations.
diff --git a/docs/Python-Gym-API.md b/docs/Python-Gym-API.md
index 97869899ce..59ce44eeb6 100644
--- a/docs/Python-Gym-API.md
+++ b/docs/Python-Gym-API.md
@@ -93,7 +93,7 @@ observation, a single discrete action and a single Agent in the scene.
Add the following code to the `train_unity.py` file:
```python
-import gym
+import gymnasium as gym
from baselines import deepq
from baselines import logger
diff --git a/docs/Python-PettingZoo-API-Documentation.md b/docs/Python-PettingZoo-API-Documentation.md
index 233e45e805..423a09d64e 100644
--- a/docs/Python-PettingZoo-API-Documentation.md
+++ b/docs/Python-PettingZoo-API-Documentation.md
@@ -21,7 +21,6 @@
* [action\_space](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.action_space)
* [side\_channel](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.side_channel)
* [reset](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.reset)
- * [seed](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.seed)
* [render](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.render)
* [close](#mlagents_envs.envs.unity_pettingzoo_base_env.UnityPettingzooBaseEnv.close)
@@ -137,7 +136,7 @@ Initializes a Unity Parallel environment wrapper.
#### reset
```python
- | reset() -> Dict[str, Any]
+ | reset(seed: int | None = None, options: dict | None = None) -> Tuple[Dict[str, Any], Dict[str, Dict]]
```
Resets the environment.
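+
+A rough sketch of the updated parallel API (assuming a built
+`StrikersVsGoalie` executable and a user-supplied `policy` function):
+
+```python
+from mlagents_envs.environment import UnityEnvironment
+from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv
+
+unity_env = UnityEnvironment("StrikersVsGoalie")
+env = UnityParallelEnv(unity_env)
+
+observations, infos = env.reset()
+for _ in range(200):  # step a fixed number of times for illustration
+    # policy() is a placeholder for your own action selection
+    actions = {agent: policy(observations[agent], agent) for agent in env.agents}
+    observations, rewards, terminations, truncations, infos = env.step(actions)
+env.close()
+```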
@@ -207,32 +206,24 @@ of an environment with `env.side_channel[]`.
#### reset
```python
- | reset()
+ | reset(seed: int | None = None, options: dict | None = None) -> Any
```
Resets the environment.
-
-#### seed
-
-```python
- | seed(seed=None)
-```
-
-Reseeds the environment (making the resulting environment deterministic).
-`reset()` must be called after `seed()`, and before `step()`.
-
#### render
```python
- | render(mode="human")
+ | render()
```
NOT SUPPORTED.
-Displays a rendered frame from the environment, if supported.
-Alternate render modes in the default environments are `'rgb_array'`
+Renders the environment as specified by self.render_mode, if supported.
+
+Render mode can be `human` to display a window.
+Other render modes in the default environments are `'rgb_array'`
which returns a numpy array and is supported by all environments outside of classic,
and `'ansi'` which returns the strings printed (specific to classic environments).
diff --git a/docs/Python-PettingZoo-API.md b/docs/Python-PettingZoo-API.md
index 2c62ed8415..651932a9e4 100644
--- a/docs/Python-PettingZoo-API.md
+++ b/docs/Python-PettingZoo-API.md
@@ -25,13 +25,13 @@ Here's an example of interacting with wrapped environment:
```python
from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.envs import UnityToPettingZooWrapper
+from mlagents_envs.envs.unity_aec_env import UnityAECEnv
unity_env = UnityEnvironment("StrikersVsGoalie")
-env = UnityToPettingZooWrapper(unity_env)
+env = UnityAECEnv(unity_env)
env.reset()
for agent in env.agent_iter():
- observation, reward, done, info = env.last()
+ observation, reward, terminated, truncated, info = env.last()
action = policy(observation, agent)
env.step(action)
```
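+
+`last()` now follows the PettingZoo >= 1.22 five-tuple convention. If you keep
+iterating after an agent's episode has ended, the usual PettingZoo pattern is to
+send `None` for agents that are already terminated or truncated (a sketch of the
+loop above with that guard added):
+
+```python
+for agent in env.agent_iter():
+    observation, reward, terminated, truncated, info = env.last()
+    action = None if terminated or truncated else policy(observation, agent)
+    env.step(action)
+```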
diff --git a/ml-agents-envs/README.md b/ml-agents-envs/README.md
index 4db68723d2..b6b94f7a18 100644
--- a/ml-agents-envs/README.md
+++ b/ml-agents-envs/README.md
@@ -12,14 +12,6 @@ The LLAPI is used by the trainer implementation in `mlagents`.
`mlagents_envs` can be used independently of `mlagents` for Python
communication.
-## Installation
-
-Install the `mlagents_envs` package with:
-
-```sh
-python -m pip install mlagents_envs==1.1.0
-```
-
## Usage & More Information
See
@@ -42,3 +34,32 @@ scene with the ML-Agents SDK, check out the main
- Communication between Unity and the Python `UnityEnvironment` is not secure.
- On Linux, ports are not released immediately after the communication closes.
As such, you cannot reuse ports right after closing a `UnityEnvironment`.
+
+## Development and publishing (Wargaming Artifactory)
+
+Since this package no longer appears to be maintained by the original developers, we have forked it to the Wargaming GitLab and maintain it there.
+Publishing is done via the [Wargaming Artifactory](https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple).
+
+To contribute to the `mlagents_envs` package, please work on a branch and create a merge request to `master` once ready.
+Once the merge request is approved and merged into `master`, a GitLab pipeline automatically creates a new git tag and publishes the new version to the Wargaming Artifactory.
+
+## Installation (Wargaming Artifactory)
+
+Since publishing is done via the Wargaming Artifactory, you can use this package as a dependency by adding the following to your `pyproject.toml`:
+
+```toml
+[tool.poetry.dependencies]
+mlagents-envs = { version = "^0.1", source = "artifactory" }
+
+[[tool.poetry.source]]
+name = "artifactory"
+url = "https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple"
+priority = "explicit"
+```
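+
+The same source can usually be configured from the Poetry CLI instead of editing
+`pyproject.toml` by hand (assumes Poetry >= 1.5):
+
+```bash
+poetry source add artifactory https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple --priority explicit
+poetry add mlagents-envs --source artifactory
+```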
+
+Alternatively, you can install the `mlagents_envs` package from the Wargaming Artifactory using pip:
+
+```bash
+pip install mlagents-envs --extra-index-url https://ed.artifactory.wgdp.io:443/artifactory/api/pypi/mlopsbi-pypi/simple
+```
diff --git a/ml-agents-envs/mlagents_envs/base_env.py b/ml-agents-envs/mlagents_envs/base_env.py
index 67deb26e85..eb3acdfff2 100644
--- a/ml-agents-envs/mlagents_envs/base_env.py
+++ b/ml-agents-envs/mlagents_envs/base_env.py
@@ -138,6 +138,30 @@ def __getitem__(self, agent_id: AgentId) -> DecisionStep:
def __iter__(self) -> Iterator[Any]:
yield from self.agent_id
+ def __add__(self, other: "DecisionSteps") -> "DecisionSteps":
+ assert isinstance(other, DecisionSteps)
+
+ combined_decision_steps = DecisionSteps(
+ list(np.hstack([self.obs, other.obs])),
+ np.hstack([self.reward, other.reward]),
+ np.hstack([self.agent_id, other.agent_id]),
+ list(np.hstack([self.action_mask, other.action_mask]))
+ if self.action_mask or other.action_mask
+ else None,
+ np.hstack([self.group_id, other.group_id]),
+ np.hstack([self.group_reward, other.group_reward]),
+ )
+ combined_decision_steps._agent_id_to_index = {
+ **self.agent_id_to_index,
+ # shift the indices of the appended decision steps
+ **{
+ agent_id: index + len(self)
+ for agent_id, index in other.agent_id_to_index.items()
+ },
+ }
+
+ return combined_decision_steps
+
@staticmethod
def empty(spec: "BehaviorSpec") -> "DecisionSteps":
"""
@@ -245,6 +269,28 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
def __iter__(self) -> Iterator[Any]:
yield from self.agent_id
+ def __add__(self, other: "TerminalSteps") -> "TerminalSteps":
+ assert isinstance(other, TerminalSteps)
+
+ combined_terminal_steps = TerminalSteps(
+ list(np.hstack([self.obs, other.obs])),
+ np.hstack([self.reward, other.reward]),
+ np.hstack([self.interrupted, other.interrupted]),
+ np.hstack([self.agent_id, other.agent_id]),
+ np.hstack([self.group_id, other.group_id]),
+ np.hstack([self.group_reward, other.group_reward]),
+ )
+ combined_terminal_steps._agent_id_to_index = {
+ **self.agent_id_to_index,
+ # shift the indices of the appended termination steps
+ **{
+ agent_id: index + len(self)
+ for agent_id, index in other.agent_id_to_index.items()
+ },
+ }
+
+ return combined_terminal_steps
+
@staticmethod
def empty(spec: "BehaviorSpec") -> "TerminalSteps":
"""
diff --git a/ml-agents-envs/mlagents_envs/envs/env_helpers.py b/ml-agents-envs/mlagents_envs/envs/env_helpers.py
index 768e670603..7d8ef10687 100644
--- a/ml-agents-envs/mlagents_envs/envs/env_helpers.py
+++ b/ml-agents-envs/mlagents_envs/envs/env_helpers.py
@@ -17,7 +17,11 @@ def _unwrap_batch_steps(batch_steps, behavior_name):
termination_id = [
_behavior_to_agent_id(behavior_name, i) for i in termination_batch.agent_id
]
- agents = decision_id + termination_id
+    agents = list(decision_id)  # copy, so decision_id itself is not mutated below
+    for agent_id in termination_id:
+        if agent_id not in agents:
+            agents.append(agent_id)
+
obs = {
agent_id: [batch_obs[i] for batch_obs in termination_batch.obs]
for i, agent_id in enumerate(termination_id)
@@ -40,30 +44,46 @@ def _unwrap_batch_steps(batch_steps, behavior_name):
}
)
obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()}
- dones = {agent_id: True for agent_id in termination_id}
- dones.update({agent_id: False for agent_id in decision_id})
rewards = {
- agent_id: termination_batch.reward[i]
- for i, agent_id in enumerate(termination_id)
+ agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id)
}
rewards.update(
- {agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id)}
+ {
+ agent_id: termination_batch.reward[i]
+ for i, agent_id in enumerate(termination_id)
+ }
)
cumulative_rewards = {k: v for k, v in rewards.items()}
infos = {}
+ terminations = {}
+ truncations = {}
for i, agent_id in enumerate(decision_id):
infos[agent_id] = {}
infos[agent_id]["behavior_name"] = behavior_name
infos[agent_id]["group_id"] = decision_batch.group_id[i]
infos[agent_id]["group_reward"] = decision_batch.group_reward[i]
+ truncations[agent_id] = False
+ terminations[agent_id] = False
for i, agent_id in enumerate(termination_id):
infos[agent_id] = {}
infos[agent_id]["behavior_name"] = behavior_name
infos[agent_id]["group_id"] = termination_batch.group_id[i]
infos[agent_id]["group_reward"] = termination_batch.group_reward[i]
- infos[agent_id]["interrupted"] = termination_batch.interrupted[i]
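+        # Unity flags an episode as "interrupted" when it was cut short (e.g. by
+        # reaching max_step) rather than ending naturally; map that onto
+        # gymnasium's `truncated` and everything else onto `terminated`.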
+ truncated = bool(termination_batch.interrupted[i])
+ infos[agent_id]["interrupted"] = truncated
+ truncations[agent_id] = truncated
+ terminations[agent_id] = not truncated
id_map = {agent_id: i for i, agent_id in enumerate(decision_id)}
- return agents, obs, dones, rewards, cumulative_rewards, infos, id_map
+ return (
+ agents,
+ obs,
+ terminations,
+ truncations,
+ rewards,
+ cumulative_rewards,
+ infos,
+ id_map,
+ )
def _parse_behavior(full_behavior):
diff --git a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py
index 4bb6fdf390..d7dea3fc10 100644
--- a/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py
+++ b/ml-agents-envs/mlagents_envs/envs/unity_aec_env.py
@@ -1,5 +1,5 @@
from typing import Any, Optional
-from gym import error
+from gymnasium import error
from mlagents_envs.base_env import BaseEnv
from pettingzoo import AECEnv
@@ -53,7 +53,8 @@ def observe(self, agent_id):
return (
self._observations[agent_id],
self._cumm_rewards[agent_id],
- self._dones[agent_id],
+ self._terminations[agent_id],
+ self._truncations[agent_id],
self._infos[agent_id],
)
@@ -61,8 +62,10 @@ def last(self, observe=True):
"""
returns observation, cumulative reward, done, info for the current agent (specified by self.agent_selection)
"""
- obs, reward, done, info = self.observe(self._agents[self._agent_index])
- return obs if observe else None, reward, done, info
+ obs, cumm_rewards, terminated, truncated, info = self.observe(
+ self._agents[self._agent_index]
+ )
+ return obs if observe else None, cumm_rewards, terminated, truncated, info
@property
def agent_selection(self):
diff --git a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py
index df29a95c9a..f4209ba837 100644
--- a/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py
+++ b/ml-agents-envs/mlagents_envs/envs/unity_gym_env.py
@@ -3,8 +3,8 @@
import numpy as np
from typing import Any, Dict, List, Optional, Tuple, Union
-import gym
-from gym import error, spaces
+import gymnasium as gym
+from gymnasium import error, spaces
from mlagents_envs.base_env import ActionTuple, BaseEnv
from mlagents_envs.base_env import DecisionSteps, TerminalSteps
@@ -20,7 +20,7 @@ class UnityGymException(error.Error):
logger = logging_util.get_logger(__name__)
-GymStepResult = Tuple[np.ndarray, float, bool, Dict]
+GymStepResult = Tuple[np.ndarray, float, bool, bool, Dict]
class UnityToGymWrapper(gym.Env):
@@ -151,11 +151,16 @@ def __init__(
else:
self._observation_space = list_spaces[0] # only return the first one
- def reset(self) -> Union[List[np.ndarray], np.ndarray]:
- """Resets the state of the environment and returns an initial observation.
- Returns: observation (object/list): the initial observation of the
+ def reset(
+ self, *, seed: int | None = None, options: dict[str, Any] | None = None
+ ) -> Tuple[np.ndarray, Dict]:
+ """Resets the state of the environment and returns an initial observation and info.
+ Returns:
+ observation (object/list): the initial observation of the
space.
+ info (dict): contains auxiliary diagnostic information.
"""
+ super().reset(seed=seed, options=options)
self._env.reset()
decision_step, _ = self._env.get_steps(self.name)
n_agents = len(decision_step)
@@ -163,9 +168,9 @@ def reset(self) -> Union[List[np.ndarray], np.ndarray]:
self.game_over = False
res: GymStepResult = self._single_step(decision_step)
- return res[0]
+ return res[0], res[4]
- def step(self, action: List[Any]) -> GymStepResult:
+ def step(self, action: Any) -> GymStepResult:
"""Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling `reset()`
to reset this environment's state.
@@ -175,14 +180,15 @@ def step(self, action: List[Any]) -> GymStepResult:
Returns:
observation (object/list): agent's observation of the current environment
reward (float/list) : amount of reward returned after previous action
- done (boolean/list): whether the episode has ended.
+ terminated (boolean/list): whether the episode has ended by termination.
+ truncated (boolean/list): whether the episode has ended by truncation.
info (dict): contains auxiliary diagnostic information.
"""
if self.game_over:
raise UnityGymException(
"You are calling 'step()' even though this environment has already "
- "returned done = True. You must always call 'reset()' once you "
- "receive 'done = True'."
+ "returned `terminated` or `truncated` as True. You must always call 'reset()' once you "
+ "receive `terminated` or `truncated` as True."
)
if self._flattener is not None:
# Translate action into list
@@ -227,9 +233,19 @@ def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResu
visual_obs = self._get_vis_obs_list(info)
self.visual_obs = self._preprocess_single(visual_obs[0][0])
- done = isinstance(info, TerminalSteps)
+ if isinstance(info, TerminalSteps):
+ interrupted = info.interrupted[0]
+ terminated, truncated = not interrupted, interrupted
+ else:
+ terminated, truncated = False, False
- return (default_observation, info.reward[0], done, {"step": info})
+ return (
+ default_observation,
+ info.reward[0],
+ terminated,
+ truncated,
+ {"step": info},
+ )
def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray:
if self.uint8_visual:
@@ -276,7 +292,7 @@ def _get_vec_obs_size(self) -> int:
result += obs_spec.shape[0]
return result
- def render(self, mode="rgb_array"):
+ def render(self):
"""
Return the latest visual observations.
Note that it will not render a new frame of the environment.
diff --git a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py
index 09398d27fa..9121199e4a 100644
--- a/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py
+++ b/ml-agents-envs/mlagents_envs/envs/unity_parallel_env.py
@@ -1,5 +1,5 @@
from typing import Optional, Dict, Any, Tuple
-from gym import error
+from gymnasium import error
from mlagents_envs.base_env import BaseEnv
from pettingzoo import ParallelEnv
@@ -20,13 +20,17 @@ def __init__(self, env: BaseEnv, seed: Optional[int] = None):
"""
super().__init__(env, seed)
- def reset(self) -> Dict[str, Any]:
+ def reset(
+ self,
+ seed: int | None = None,
+ options: dict | None = None,
+ ) -> Tuple[Dict[str, Any], Dict[str, Dict]]:
"""
Resets the environment.
"""
- super().reset()
+ super().reset(seed=seed, options=options)
- return self._observations
+ return self._observations, self._infos
def step(self, actions: Dict[str, Any]) -> Tuple:
self._assert_loaded()
@@ -46,8 +50,13 @@ def step(self, actions: Dict[str, Any]) -> Tuple:
# Step environment
self._step()
- # Agent cleanup and sorting
- self._cleanup_agents()
+ # Agent sorting
self._live_agents.sort() # unnecessary, only for passing API test
- return self._observations, self._rewards, self._dones, self._infos
+ return (
+ self._observations,
+ self._rewards,
+ self._terminations,
+ self._truncations,
+ self._infos,
+ )
diff --git a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
index 3457f18c88..3c62885f12 100644
--- a/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
+++ b/ml-agents-envs/mlagents_envs/envs/unity_pettingzoo_base_env.py
@@ -1,7 +1,7 @@
import atexit
from typing import Optional, List, Set, Dict, Any, Tuple
import numpy as np
-from gym import error, spaces
+from gymnasium import error, spaces
from mlagents_envs.base_env import BaseEnv, ActionTuple
from mlagents_envs.envs.env_helpers import _agent_id_to_behavior, _unwrap_batch_steps
@@ -32,7 +32,8 @@ def __init__(
self._possible_agents: Set[str] = set() # all agents that have ever appear
self._agent_id_to_index: Dict[str, int] = {} # agent_id: index in decision step
self._observations: Dict[str, np.ndarray] = {} # agent_id: obs
- self._dones: Dict[str, bool] = {} # agent_id: done
+ self._terminations: Dict[str, bool] = {} # agent_id: terminated
+ self._truncations: Dict[str, bool] = {} # agent_id: truncated
self._rewards: Dict[str, float] = {} # agent_id: reward
self._cumm_rewards: Dict[str, float] = {} # agent_id: reward
self._infos: Dict[str, Dict] = {} # agent_id: info
@@ -45,7 +46,7 @@ def __init__(
if not self._env.behavior_specs:
self._env.step()
for behavior_name in self._env.behavior_specs.keys():
- _, _, _ = self._batch_update(behavior_name)
+ _, _, _, _ = self._batch_update(behavior_name)
self._update_observation_spaces()
self._update_action_spaces()
@@ -132,7 +133,7 @@ def _update_action_spaces(self) -> None:
continue
if act_spec.continuous_size > 0:
c_space = spaces.Box(
- -1, 1, (act_spec.continuous_size,), dtype=np.int32
+ -1, 1, (act_spec.continuous_size,), dtype=np.float64
)
if self._seed is not None:
c_space.seed(self._seed)
@@ -162,13 +163,13 @@ def _process_action(self, current_agent, action):
else:
action = ActionTuple(action, None)
- if not self._dones[current_agent]:
+ if not (self._terminations[current_agent] or self._truncations[current_agent]):
current_behavior = _agent_id_to_behavior(current_agent)
current_index = self._agent_id_to_index[current_agent]
if action.continuous is not None:
self._current_action[current_behavior].continuous[
current_index
- ] = action.continuous[0]
+ ] = action.continuous
if action.discrete is not None:
self._current_action[current_behavior].discrete[
current_index
@@ -176,7 +177,8 @@ def _process_action(self, current_agent, action):
else:
self._live_agents.remove(current_agent)
del self._observations[current_agent]
- del self._dones[current_agent]
+ del self._terminations[current_agent]
+ del self._truncations[current_agent]
del self._rewards[current_agent]
del self._cumm_rewards[current_agent]
del self._infos[current_agent]
@@ -184,18 +186,51 @@ def _process_action(self, current_agent, action):
def _step(self):
for behavior_name, actions in self._current_action.items():
self._env.set_actions(behavior_name, actions)
- self._env.step()
+
+ def step_and_return_steps(behavior_name):
+ self._env.step()
+ decision_steps, termination_steps = self._env.get_steps(behavior_name)
+ return decision_steps, termination_steps
+
+ # DecisionSteps are assumed to arrive synchronously every `DecisionPeriod` frames,
+ # but TerminationSteps can be sent in between. Therefore, to collect step information for all agents,
+ # we need to continue stepping the environment.
+ # NOTE: This can lead to returning TerminationSteps and subsequent DecisionSteps at the same time for an agent
+ # (but this was also possible before).
+ for behavior_name in self._env.behavior_specs.keys():
+ decision_steps, termination_steps = step_and_return_steps(behavior_name)
+ collected_decision_steps = decision_steps
+ collected_termination_steps = termination_steps
+ while len(set(collected_decision_steps.agent_id)) < len(self._agents):
+ decision_steps, termination_steps = step_and_return_steps(behavior_name)
+ if len(decision_steps) > 0:
+ collected_decision_steps += decision_steps
+ if len(termination_steps) > 0:
+ collected_termination_steps += termination_steps
+
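+            # Write the aggregated steps back into the cached environment state
+            # (the private `_env_state` of UnityEnvironment) so that the subsequent
+            # _batch_update()/get_steps() calls see every agent collected above.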
+ self._env._env_state[behavior_name] = (
+ collected_decision_steps,
+ collected_termination_steps,
+ )
+
self._reset_states()
for behavior_name in self._env.behavior_specs.keys():
- dones, rewards, cumulative_rewards = self._batch_update(behavior_name)
- self._dones.update(dones)
+ terminations, truncations, rewards, cumulative_rewards = self._batch_update(
+ behavior_name
+ )
+ self._terminations.update(terminations)
+ self._truncations.update(truncations)
self._rewards.update(rewards)
self._cumm_rewards.update(cumulative_rewards)
self._agent_index = 0
def _cleanup_agents(self):
- for current_agent, done in self.dones.items():
- if done:
+ for current_agent, terminated in self.terminations.items():
+ if terminated:
+ self._live_agents.remove(current_agent)
+
+ for current_agent, truncated in self.truncations.items():
+ if truncated:
self._live_agents.remove(current_agent)
@property
@@ -226,25 +261,33 @@ def _reset_states(self):
self._live_agents = []
self._agents = []
self._observations = {}
- self._dones = {}
+ self._terminations = {}
+ self._truncations = {}
self._rewards = {}
self._cumm_rewards = {}
self._infos = {}
self._agent_id_to_index = {}
- def reset(self):
+ def reset(
+ self,
+ seed: int | None = None,
+ options: dict | None = None,
+ ) -> Any:
"""
Resets the environment.
"""
+ self._seed = seed
+
self._assert_loaded()
self._agent_index = 0
self._reset_states()
self._possible_agents = set()
self._env.reset()
for behavior_name in self._env.behavior_specs.keys():
- _, _, _ = self._batch_update(behavior_name)
+ _, _, _, _ = self._batch_update(behavior_name)
self._live_agents.sort() # unnecessary, only for passing API test
- self._dones = {agent: False for agent in self._agents}
+ self._terminations = {agent: False for agent in self._agents}
+ self._truncations = {agent: False for agent in self._agents}
self._rewards = {agent: 0 for agent in self._agents}
self._cumm_rewards = {agent: 0 for agent in self._agents}
@@ -256,7 +299,8 @@ def _batch_update(self, behavior_name):
(
agents,
obs,
- dones,
+ terminations,
+ truncations,
rewards,
cumulative_rewards,
infos,
@@ -268,33 +312,32 @@ def _batch_update(self, behavior_name):
self._infos.update(infos)
self._agent_id_to_index.update(id_map)
self._possible_agents.update(agents)
- return dones, rewards, cumulative_rewards
-
- def seed(self, seed=None):
- """
- Reseeds the environment (making the resulting environment deterministic).
- `reset()` must be called after `seed()`, and before `step()`.
- """
- self._seed = seed
+ return terminations, truncations, rewards, cumulative_rewards
- def render(self, mode="human"):
+ def render(self):
"""
NOT SUPPORTED.
- Displays a rendered frame from the environment, if supported.
- Alternate render modes in the default environments are `'rgb_array'`
+ Renders the environment as specified by self.render_mode, if supported.
+
+ Render mode can be `human` to display a window.
+ Other render modes in the default environments are `'rgb_array'`
which returns a numpy array and is supported by all environments outside of classic,
and `'ansi'` which returns the strings printed (specific to classic environments).
"""
pass
@property
- def dones(self):
- return dict(self._dones)
+ def terminations(self):
+ return dict(self._terminations)
+
+ @property
+ def truncations(self):
+ return dict(self._truncations)
@property
def agents(self):
- return sorted(self._live_agents)
+ return sorted(self._agents)
@property
def rewards(self):
diff --git a/ml-agents-envs/pyproject.toml b/ml-agents-envs/pyproject.toml
new file mode 100644
index 0000000000..104caf7a78
--- /dev/null
+++ b/ml-agents-envs/pyproject.toml
@@ -0,0 +1,38 @@
+[tool.poetry]
+name = "mlagents_envs"
+version = "None"
+description = "Unity Machine Learning Agents Interface"
+homepage = "https://github.com/Unity-Technologies/ml-agents"
+authors = ["Unity Technologies "]
+classifiers=[
+ "Intended Audience :: Developers",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+]
+readme = "README.md"
+
+packages = [
+ { include = "mlagents_envs", from = "." },
+]
+include = ["mlagents_envs/*"]
+exclude = ["*.tests", "*.tests.*", "tests.*", "tests", "colabs", "*.ipynb"]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+grpcio = ">=1.11.0,<=1.48.2"
+Pillow = ">=4.2.1"
+protobuf = ">=3.6,<3.21"
+pyyaml = ">=3.1.0"
+gymnasium = ">=0.25.0"
+pettingzoo = ">=1.22.0"
+numpy = ">=1.23.5,<2.0"
+filelock = ">=3.4.0"
+cloudpickle = "*"
+
+[build-system]
+requires = ["poetry-core>=1.9.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py
index fcbee96151..2eb0a0401d 100644
--- a/ml-agents-envs/setup.py
+++ b/ml-agents-envs/setup.py
@@ -4,7 +4,7 @@
from setuptools.command.install import install
import mlagents_envs
-VERSION = mlagents_envs.__version__
+VERSION = (None,)
EXPECTED_TAG = mlagents_envs.__release_tag__
here = os.path.abspath(os.path.dirname(__file__))
@@ -35,7 +35,7 @@ def run(self):
setup(
name="mlagents_envs",
- version=VERSION,
+ version=None,
description="Unity Machine Learning Agents Interface",
long_description=long_description,
long_description_content_type="text/markdown",
@@ -58,12 +58,12 @@ def run(self):
"Pillow>=4.2.1",
"protobuf>=3.6,<3.21",
"pyyaml>=3.1.0",
- "gym>=0.21.0",
- "pettingzoo==1.15.0",
- "numpy>=1.23.5,<1.24.0",
+ "gymnasium>=0.25.0",
+ "pettingzoo>=1.22.0",
+ "numpy>=1.23.5,<2.0",
"filelock>=3.4.0",
],
- python_requires=">=3.10.1,<=3.10.12",
+ python_requires=">=3.9,<4",
# TODO: Remove this once mypy stops having spurious setuptools issues.
cmdclass={"verify": VerifyVersionCommand}, # type: ignore
)
diff --git a/ml-agents-envs/tests/test_gym.py b/ml-agents-envs/tests/test_gym.py
index 4fc2bf548c..21afdc0c9f 100644
--- a/ml-agents-envs/tests/test_gym.py
+++ b/ml-agents-envs/tests/test_gym.py
@@ -2,7 +2,7 @@
import pytest
import numpy as np
-from gym import spaces
+from gymnasium import spaces
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper
from mlagents_envs.base_env import (
diff --git a/ml-agents/mlagents/trainers/subprocess_env_manager.py b/ml-agents/mlagents/trainers/subprocess_env_manager.py
index 43d468f2bc..8f767e23d0 100644
--- a/ml-agents/mlagents/trainers/subprocess_env_manager.py
+++ b/ml-agents/mlagents/trainers/subprocess_env_manager.py
@@ -12,7 +12,7 @@
UnityCommunicatorStoppedException,
)
from multiprocessing import Process, Pipe, Queue
-from multiprocessing.connection import Connection
+from multiprocessing.connection import Connection, PipeConnection
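+# NOTE: multiprocessing.connection.PipeConnection only exists on Windows; on POSIX
+# both ends of a Pipe are plain Connection objects, so this import assumes a
+# Windows host.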
from queue import Empty as EmptyQueueException
from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec
from mlagents_envs import logging_util
@@ -77,7 +77,7 @@ class StepResponse(NamedTuple):
class UnityEnvWorker:
- def __init__(self, process: Process, worker_id: int, conn: Connection):
+ def __init__(self, process: Process, worker_id: int, conn: PipeConnection):
self.process = process
self.worker_id = worker_id
self.conn = conn
diff --git a/utils/generate_markdown_docs.py b/utils/generate_markdown_docs.py
index 7566b1bdc7..5ce432b3a2 100755
--- a/utils/generate_markdown_docs.py
+++ b/utils/generate_markdown_docs.py
@@ -6,7 +6,6 @@
import argparse
import hashlib
-
# pydoc-markdown -I . -m module_name --render_toc > doc.md
@@ -52,8 +51,8 @@ def remove_trailing_whitespace(filename):
# compare source and destination and write only if changed
if source_file != destination_file:
num_changed += 1
- with open(filename, "wb") as f:
- f.write(destination_file.encode())
+ with open(filename, "w", newline="\r\n") as f:
+ f.write(destination_file)
if __name__ == "__main__":
@@ -84,7 +83,7 @@ def remove_trailing_whitespace(filename):
for submodule in submodules:
module_args.append("-m")
module_args.append(f"{module_name}.{submodule}")
- with open(output_file_name, "w") as output_file:
+ with open(output_file_name, "wb") as output_file:
subprocess_args = [
"pydoc-markdown",
"-I",