1 file changed: +6 −0 lines
@@ -30,6 +30,7 @@
 )
 from megatron.core.utils import (
     get_torch_version,
+    is_te_min_version,
     is_torch_min_version,
 )
 from megatron.training.activations import squared_relu
@@ -619,6 +620,11 @@ def validate_args(args, defaults={}):
         assert os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1", \
             'FSDP always requires CUDA_DEVICE_MAX_CONNECTIONS value large than one'
 
+        if args.fp8_param_gather and is_te_min_version("2.0.0"):
+            args.fp8_param_gather = False
+            warnings.warn('FSDP2 FP8 param gather is not supported yet in TE 2.0, will fall back '
+                          'to bf16 all_gather instead, turning off fp8_param_gather')
+
     if args.overlap_param_gather_with_optimizer_step:
         assert args.use_distributed_optimizer, \
             '--overlap-param-gather-with-optimizer-step only supported with distributed optimizer'
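For readers skimming the change outside the repository, below is a minimal, self-contained sketch of the fallback the second hunk adds. The real check calls megatron.core.utils.is_te_min_version and lives inline in validate_args(); the stand-in helper here (which takes the installed Transformer Engine version as an explicit argument) and the apply_fp8_param_gather_fallback wrapper are hypothetical names introduced only so the snippet runs without Megatron or TE installed.

# Sketch of the version-gated fallback added above (hypothetical wrapper,
# not the actual Megatron API).
import warnings
from types import SimpleNamespace

from packaging.version import Version


def is_te_min_version(minimum, installed="2.0.0"):
    # Stand-in for megatron.core.utils.is_te_min_version: True when the
    # (assumed) installed Transformer Engine version is at least `minimum`.
    return Version(installed) >= Version(minimum)


def apply_fp8_param_gather_fallback(args):
    # Mirrors the added validate_args() logic: with TE >= 2.0, FSDP2 cannot
    # yet all-gather params in FP8, so turn the flag off and warn that bf16
    # all-gather will be used instead.
    if args.fp8_param_gather and is_te_min_version("2.0.0"):
        args.fp8_param_gather = False
        warnings.warn('FSDP2 FP8 param gather is not supported yet in TE 2.0, will fall back '
                      'to bf16 all_gather instead, turning off fp8_param_gather')
    return args


args = SimpleNamespace(fp8_param_gather=True)
apply_fp8_param_gather_fallback(args)
print(args.fp8_param_gather)  # False when the assumed installed TE is >= 2.0.0

The design choice is to degrade gracefully rather than error out: the flag is flipped off and a warning tells the user that bf16 all-gather is used instead.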