@@ -59,8 +59,8 @@ class AMP_PPO:
         Maximum gradient norm for clipping gradients during backpropagation.
     use_clipped_value_loss : bool, default=True
         Flag indicating whether to use a clipped value loss, as in the original PPO implementation.
-    use_smooth_clamping : bool, default=False
-        Flag indicating whether to use exponential clamping on the value loos.
+    use_smooth_ratio_clipping : bool, default=False
+        Flag indicating whether to apply smooth (exponential) clipping to the PPO policy ratio.
     schedule : str, default="fixed"
         Learning rate schedule mode ("fixed" or "adaptive" based on KL divergence).
     desired_kl : float, default=0.01
@@ -92,7 +92,7 @@ def __init__(
         schedule: str = "fixed",
         desired_kl: float = 0.01,
         amp_replay_buffer_size: int = 100000,
-        use_smooth_clamping: bool = False,
+        use_smooth_ratio_clipping: bool = False,
         device: str = "cpu",
     ) -> None:
         # Set device and learning hyperparameters
@@ -149,7 +149,7 @@ def __init__(
         self.lam: float = lam
         self.max_grad_norm: float = max_grad_norm
         self.use_clipped_value_loss: bool = use_clipped_value_loss
-        self.use_smooth_clamped_loss = use_smooth_clamped_loss
+        self.use_smooth_ratio_clipping: bool = use_smooth_ratio_clipping
 
     def init_storage(
         self,
@@ -460,7 +460,8 @@ def update(self) -> Tuple[float, float, float, float, float, float, float, float
             min_ = 1.0 - self.clip_param
             max_ = 1.0 + self.clip_param
 
-            if self.use_smooth_clamping:
+            # Smooth clamping for the ratio if enabled.
+            if self.use_smooth_ratio_clipping:
                 clipped_ratio = (
                     1
                     / (1 + torch.exp((-(ratio - min_) / (max_ - min_) + 0.5) * 4))
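For reference, the sketch below lifts the sigmoid expression from the last hunk into a standalone helper so it can be compared against a hard `torch.clamp`. This is a minimal illustration, not code from the repository: the name `smooth_clip_ratio` is made up, and the final rescale back to `[1 - clip_param, 1 + clip_param]` is an assumption, since the hunk is truncated right after the exponential term.

```python
import torch


def smooth_clip_ratio(ratio: torch.Tensor, clip_param: float = 0.2) -> torch.Tensor:
    """Sigmoid-based smooth clipping of the PPO probability ratio (illustrative sketch)."""
    min_ = 1.0 - clip_param
    max_ = 1.0 + clip_param
    # Normalise the ratio over the clip window and squash it with a sigmoid of
    # slope 4, mirroring the expression in the diff above.
    squashed = 1.0 / (1.0 + torch.exp((-(ratio - min_) / (max_ - min_) + 0.5) * 4.0))
    # Assumed tail: rescale the sigmoid output back into (min_, max_).
    return squashed * (max_ - min_) + min_


ratio = torch.tensor([0.5, 0.9, 1.0, 1.1, 1.5])
print(smooth_clip_ratio(ratio))          # stays strictly inside (0.8, 1.2); 1.0 maps to 1.0
print(torch.clamp(ratio, 0.8, 1.2))      # hard clamp for comparison
```

Unlike a hard clamp, whose gradient is zero outside the clip window, the sigmoid keeps a small non-zero gradient everywhere and has slope 1 at ratio = 1, so it behaves like the identity near the centre of the trust region.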