SynthAether
diff --git a/.gitmodules b/.gitmodules
Lines changed: 3 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 12 additions & 1 deletion
diff --git a/src/f5_tts/api.py b/src/f5_tts/api.py
Lines changed: 9 additions & 14 deletions
diff --git a/src/f5_tts/eval/eval_infer_batch.py b/src/f5_tts/eval/eval_infer_batch.py
Lines changed: 36 additions & 30 deletions
diff --git a/src/f5_tts/eval/utils_eval.py b/src/f5_tts/eval/utils_eval.py
Lines changed: 11 additions & 3 deletions
diff --git a/src/f5_tts/infer/infer_cli.py b/src/f5_tts/infer/infer_cli.py
Lines changed: 18 additions & 11 deletions
@@ -0,0 +1,3 @@
+[submodule "src/third_party/BigVGAN"]
+	path = src/third_party/BigVGAN
+	url = https://github.com/NVIDIA/BigVGAN.git
@@ -46,7 +46,18 @@ cd F5-TTS
 pip install -e .
 ```
 
-### 3. Docker usage
+### 3. Init submodule (optional, only if you want to change the vocoder from vocos to bigvgan)
+
+```bash
+git submodule update --init --recursive
+```
+After that, edit `src/third_party/BigVGAN/bigvgan.py` and add the following code at the beginning of the file, so that the directory containing `bigvgan.py` is on Python's module search path.
+```python
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+```
+
+### 4. Docker usage
 ```bash
 # Build from Dockerfile
 docker build -t f5tts:v1 .
 
@@ -1,24 +1,18 @@
 import random
 import sys
-import tqdm
 from importlib.resources import files
 
 import soundfile as sf
 import torch
+import tqdm
 from cached_path import cached_path
 
+from f5_tts.infer.utils_infer import (hop_length, infer_process, load_model,
+                                      load_vocoder, preprocess_ref_audio_text,
+                                      remove_silence_for_generated_wav,
+                                      save_spectrogram, target_sample_rate)
 from f5_tts.model import DiT, UNetT
 from f5_tts.model.utils import seed_everything
-from f5_tts.infer.utils_infer import (
-    load_vocoder,
-    load_model,
-    infer_process,
-    remove_silence_for_generated_wav,
-    save_spectrogram,
-    preprocess_ref_audio_text,
-    target_sample_rate,
-    hop_length,
-)
 
 
 class F5TTS:
@@ -29,6 +23,7 @@ def __init__(
         vocab_file="",
         ode_method="euler",
         use_ema=True,
+        vocoder_name="vocos",
         local_path=None,
         device=None,
     ):
@@ -44,11 +39,11 @@ def __init__(
         )
 
         # Load models
-        self.load_vocoder_model(local_path)
+        self.load_vocoder_model(vocoder_name, local_path)
         self.load_ema_model(model_type, ckpt_file, vocab_file, ode_method, use_ema)
 
-    def load_vocoder_model(self, local_path):
-        self.vocoder = load_vocoder(local_path is not None, local_path, self.device)
+    def load_vocoder_model(self, vocoder_name, local_path):
+        self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device)
 
     def load_ema_model(self, model_type, ckpt_file, vocab_file, ode_method, use_ema):
         if model_type == "F5-TTS":
 
@@ -1,26 +1,23 @@
-import sys
 import os
+import sys
 
 sys.path.append(os.getcwd())
 
-import time
-from tqdm import tqdm
 import argparse
+import time
 from importlib.resources import files
 
 import torch
 import torchaudio
 from accelerate import Accelerator
-from vocos import Vocos
+from tqdm import tqdm
 
-from f5_tts.model import CFM, UNetT, DiT
+from f5_tts.eval.utils_eval import (get_inference_prompt,
+                                    get_librispeech_test_clean_metainfo,
+                                    get_seedtts_testset_metainfo)
+from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
+from f5_tts.model import CFM, DiT, UNetT
 from f5_tts.model.utils import get_tokenizer
-from f5_tts.infer.utils_infer import load_checkpoint
-from f5_tts.eval.utils_eval import (
-    get_seedtts_testset_metainfo,
-    get_librispeech_test_clean_metainfo,
-    get_inference_prompt,
-)
 
 accelerator = Accelerator()
 device = f"cuda:{accelerator.process_index}"
@@ -31,8 +28,12 @@
 target_sample_rate = 24000
 n_mel_channels = 100
 hop_length = 256
+win_length = 1024
+n_fft = 1024
+extract_backend = "bigvgan"  # 'vocos' or 'bigvgan'
 target_rms = 0.1
 
+
 tokenizer = "pinyin"
 rel_path = str(files("f5_tts").joinpath("../../"))
 
@@ -123,14 +124,11 @@ def main():
 
     # Vocoder model
     local = False
-    if local:
-        vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
-        vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
-        state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", weights_only=True, map_location=device)
-        vocos.load_state_dict(state_dict)
-        vocos.eval()
-    else:
-        vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+    if extract_backend == "vocos":
+        vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
+    elif extract_backend == "bigvgan":
+        vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
+    vocoder = load_vocoder(vocoder_name=extract_backend, is_local=local, local_path=vocoder_local_path)
 
     # Tokenizer
     vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
@@ -139,17 +137,21 @@ def main():
     model = CFM(
         transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
         mel_spec_kwargs=dict(
-            target_sample_rate=target_sample_rate,
-            n_mel_channels=n_mel_channels,
+            n_fft=n_fft,
             hop_length=hop_length,
+            win_length=win_length,
+            n_mel_channels=n_mel_channels,
+            target_sample_rate=target_sample_rate,
+            extract_backend=extract_backend,
         ),
         odeint_kwargs=dict(
             method=ode_method,
         ),
         vocab_char_map=vocab_char_map,
     ).to(device)
 
-    model = load_checkpoint(model, ckpt_path, device, use_ema=use_ema)
+    dtype = torch.float16 if extract_backend == "vocos" else torch.float32
+    model = load_checkpoint(model, ckpt_path, device, dtype, use_ema=use_ema)
 
     if not os.path.exists(output_dir) and accelerator.is_main_process:
         os.makedirs(output_dir)
@@ -178,14 +180,18 @@ def main():
                     no_ref_audio=no_ref_audio,
                     seed=seed,
                 )
-            # Final result
-            for i, gen in enumerate(generated):
-                gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
-                gen_mel_spec = gen.permute(0, 2, 1)
-                generated_wave = vocos.decode(gen_mel_spec.cpu())
-                if ref_rms_list[i] < target_rms:
-                    generated_wave = generated_wave * ref_rms_list[i] / target_rms
-                torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)
+                # Final result
+                for i, gen in enumerate(generated):
+                    gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
+                    gen_mel_spec = gen.permute(0, 2, 1)
+                    if extract_backend == "vocos":
+                        generated_wave = vocoder.decode(gen_mel_spec.cpu())
+                    elif extract_backend == "bigvgan":
+                        generated_wave = vocoder(gen_mel_spec)
+
+                    if ref_rms_list[i] < target_rms:
+                        generated_wave = generated_wave * ref_rms_list[i] / target_rms
+                    torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave.squeeze(0).cpu(), target_sample_rate)
 
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
 
@@ -2,15 +2,15 @@
 import os
 import random
 import string
-from tqdm import tqdm
 
 import torch
 import torch.nn.functional as F
 import torchaudio
+from tqdm import tqdm
 
+from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
 from f5_tts.model.modules import MelSpec
 from f5_tts.model.utils import convert_char_to_pinyin
-from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
 
 
 # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
@@ -74,8 +74,11 @@ def get_inference_prompt(
     tokenizer="pinyin",
     polyphone=True,
     target_sample_rate=24000,
+    n_fft=1024,
+    win_length=1024,
     n_mel_channels=100,
     hop_length=256,
+    extract_backend="bigvgan",
     target_rms=0.1,
     use_truth_duration=False,
     infer_batch_size=1,
@@ -94,7 +97,12 @@ def get_inference_prompt(
     )
 
     mel_spectrogram = MelSpec(
-        target_sample_rate=target_sample_rate, n_mel_channels=n_mel_channels, hop_length=hop_length
+        n_fft=n_fft,
+        hop_length=hop_length,
+        win_length=win_length,
+        n_mel_channels=n_mel_channels,
+        target_sample_rate=target_sample_rate,
+        extract_backend=extract_backend,
     )
 
     for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
 
@@ -2,23 +2,18 @@
 import codecs
 import os
 import re
-from pathlib import Path
 from importlib.resources import files
+from pathlib import Path
 
 import numpy as np
 import soundfile as sf
 import tomli
 from cached_path import cached_path
 
+from f5_tts.infer.utils_infer import (infer_process, load_model, load_vocoder,
+                                      preprocess_ref_audio_text,
+                                      remove_silence_for_generated_wav)
 from f5_tts.model import DiT, UNetT
-from f5_tts.infer.utils_infer import (
-    load_vocoder,
-    load_model,
-    preprocess_ref_audio_text,
-    infer_process,
-    remove_silence_for_generated_wav,
-)
-
 
 parser = argparse.ArgumentParser(
     prog="python3 infer-cli.py",
@@ -70,6 +65,7 @@
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument("--vocoder_name", type=str, default="vocos", choices=["vocos", "bigvgan"], help="vocoder name")
 parser.add_argument(
     "--load_vocoder_from_local",
     action="store_true",
@@ -111,9 +107,14 @@
 speed = args.speed
 wave_path = Path(output_dir) / "infer_cli_out.wav"
 # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
-vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
+if args.vocoder_name == "vocos":
+    vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
+elif args.vocoder_name == "bigvgan":
+    vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
 
-vocoder = load_vocoder(is_local=args.load_vocoder_from_local, local_path=vocos_local_path)
+vocoder = load_vocoder(
+    vocoder_name=args.vocoder_name, is_local=args.load_vocoder_from_local, local_path=vocoder_local_path
+)
 
 
 # load models
@@ -136,6 +137,12 @@
         ckpt_step = 1200000
         ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
         # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors; local path
+    elif args.vocoder_name == "bigvgan":  # TODO: need to test
+        repo_name = "F5-TTS"
+        exp_name = "F5TTS_Base_bigvgan"
+        ckpt_step = 1250000
+        ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
+
 
 print(f"Using {model}...")
 ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "src/third_party/BigVGAN"]`
	`2`	`+ path = src/third_party/BigVGAN`
	`3`	`+ url = https://github.com/NVIDIA/BigVGAN.git`