Refactor vocabulary loading and clean up tokenizer methods (#136)

thewh1teagle · web-flow · commit 46417b406956 · 2025-03-29T20:54:39.000+03:00
* Refactor vocabulary loading and clean up tokenizer methods

* Refactor example scripts and clean up code formatting

* vocab

* Fix vocab loading to return the entire JSON object instead of just the 'vocab' key

* Add voice configuration download and update vocab loading from config.json

* Refactor config file handling and improve code formatting

* Simplify GitHub Actions workflow by removing codespell step and renaming job

* beta

* Add configuration file for istftnet and update .gitignore to include it

* Bump version to 0.4.6-beta.1 in pyproject.toml

* Enhance Kokoro initialization by adding vocab_config parameter and update examples to include config.json

* Update Chinese example to use new voice and adjust vocab_config handling in Kokoro class

* tests

* config

* add 1.1 in Chinese misaki example
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -9,10 +9,9 @@ on:
     branches: [main]
   workflow_dispatch:
 jobs:
-  codespell_and_ruff:
+  ruff:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v4
-    - uses: codespell-project/actions-codespell@v2
-    - uses: astral-sh/ruff-action@v3
-    - run: ruff format --diff
+      - uses: actions/checkout@v4
+      - uses: astral-sh/ruff-action@v3
+      - run: ruff format --diff
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,7 @@ wheels/
 *.wav
 *.json
 !.vscode/*.json
+!src/kokoro_onnx/config.json
 espeak-ng-data/
 *.tar.gz
 *.dylib
diff --git a/examples/chinese.py b/examples/chinese.py
@@ -12,6 +12,7 @@
     Download these files
     https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx
     https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin
+    https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json
 4. Run
     uv run main.py
 """
@@ -21,11 +22,11 @@
 from misaki import zh
 
 # Misaki G2P with espeak-ng fallback
-g2p = zh.ZHG2P()
+g2p = zh.ZHG2P(version="1.1")
 
 text = "千里之行，始于足下。"
-voice = "af_maple"
-kokoro = Kokoro("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin")
+voice = "zf_001"
+kokoro = Kokoro("kokoro-v1.1-zh.onnx", "voices-v1.1-zh.bin", vocab_config="config.json")
 phonemes, _ = g2p(text)
 samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
 sf.write("audio.wav", samples, sample_rate)
diff --git a/examples/french.py b/examples/french.py
@@ -19,12 +19,12 @@
 
 import soundfile as sf
 from kokoro_onnx import Kokoro
-from misaki import en, espeak
+from misaki import espeak
 from misaki.espeak import EspeakG2P
 
 # Misaki G2P with espeak-ng fallback
 fallback = espeak.EspeakFallback(british=False)
-g2p = EspeakG2P(language='fr-fr')
+g2p = EspeakG2P(language="fr-fr")
 
 # Kokoro
 kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
diff --git a/examples/hindi.py b/examples/hindi.py
@@ -19,12 +19,12 @@
 
 import soundfile as sf
 from kokoro_onnx import Kokoro
-from misaki import en, espeak
+from misaki import espeak
 from misaki.espeak import EspeakG2P
 
 # Misaki G2P with espeak-ng fallback
 fallback = espeak.EspeakFallback(british=False)
-g2p = EspeakG2P(language='hi')
+g2p = EspeakG2P(language="hi")
 
 # Kokoro
 kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
diff --git a/examples/italian.py b/examples/italian.py
@@ -19,12 +19,12 @@
 
 import soundfile as sf
 from kokoro_onnx import Kokoro
-from misaki import en, espeak
+from misaki import espeak
 from misaki.espeak import EspeakG2P
 
 # Misaki G2P with espeak-ng fallback
 fallback = espeak.EspeakFallback(british=False)
-g2p = EspeakG2P(language='it')
+g2p = EspeakG2P(language="it")
 
 # Kokoro
 kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
diff --git a/examples/japanse.py b/examples/japanse.py
@@ -12,6 +12,7 @@
     Download these files
     https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/kokoro-v1.1-zh.onnx
     https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.1/voices-v1.1-zh.bin
+    https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/raw/main/config.json
 4. Run
     uv run main.py
 """
@@ -25,7 +26,7 @@
 
 text = "「人生を夢見るな。夢を生きろ。」"
 voice = "jf_alpha"
-kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
+kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin", vocab_config="config.json")
 phonemes, _ = g2p(text)
 samples, sample_rate = kokoro.create(phonemes, voice=voice, speed=1.0, is_phonemes=True)
 sf.write("audio.wav", samples, sample_rate)
diff --git a/examples/portuguese.py b/examples/portuguese.py
@@ -19,12 +19,12 @@
 
 import soundfile as sf
 from kokoro_onnx import Kokoro
-from misaki import en, espeak
+from misaki import espeak
 from misaki.espeak import EspeakG2P
 
 # Misaki G2P with espeak-ng fallback
 fallback = espeak.EspeakFallback(british=False)
-g2p = EspeakG2P(language='pt-br')
+g2p = EspeakG2P(language="pt-br")
 
 # Kokoro
 kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
diff --git a/examples/spanish.py b/examples/spanish.py
@@ -19,12 +19,12 @@
 
 import soundfile as sf
 from kokoro_onnx import Kokoro
-from misaki import en, espeak
+from misaki import espeak
 from misaki.espeak import EspeakG2P
 
 # Misaki G2P with espeak-ng fallback
 fallback = espeak.EspeakFallback(british=False)
-g2p = EspeakG2P(language='es')
+g2p = EspeakG2P(language="es")
 
 # Kokoro
 kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "kokoro-onnx"
-version = "0.4.5"
+version = "0.4.6-beta.1"
 description = "TTS with kokoro and onnx runtime"
 readme = "README.md"
 authors = [
diff --git a/scripts/fetch_voices.py b/scripts/fetch_voices.py
@@ -21,6 +21,7 @@
 import torch
 import os
 from tqdm import tqdm
+from pathlib import Path
 
 config = {
     "Kokoro-82M-v1.1-zh": {
@@ -39,11 +40,22 @@
 
 def get_voice_names(api_url):
     resp = requests.get(api_url)
+    resp.raise_for_status()
     data = resp.json()
     names = [voice["path"][7:-3] for voice in data]
     return names
 
 
+def download_config():
+    resp = requests.get(
+        "https://huggingface.co/hexgrad/Kokoro-82M/raw/main/config.json", timeout=30
+    )
+    resp.raise_for_status()  # fail loudly instead of writing an error page to disk
+    content = resp.content
+    with open(Path(__file__).parent / "../src/kokoro_onnx/config.json", "wb") as fp:
+        fp.write(content)
+
+
 def download_voices(voice_url: str, names: list[str], npz_path: str):
     count = len(names)
 
@@ -77,6 +89,7 @@ def main():
         )
         voice_names = get_voice_names(api_url)
         download_voices(voice_url, voice_names, npz_path)
+        download_config()
 
 
 main()
diff --git a/src/kokoro_onnx/__init__.py b/src/kokoro_onnx/__init__.py
@@ -11,6 +11,7 @@
 import numpy as np
 import onnxruntime as rt
 from numpy.typing import NDArray
+import json
 
 from .config import (
     MAX_PHONEME_LENGTH,
@@ -29,6 +30,7 @@ def __init__(
         model_path: str,
         voices_path: str,
         espeak_config: EspeakConfig | None = None,
+        vocab_config: dict | str = None,
     ):
         # Show useful information for bug reports
         log.debug(
@@ -53,7 +55,17 @@ def __init__(
         log.debug(f"Providers: {providers}")
         self.sess = rt.InferenceSession(model_path, providers=providers)
         self.voices: np.ndarray = np.load(voices_path)
-        self.tokenizer = Tokenizer(espeak_config)
+
+        vocab = None
+
+        if isinstance(vocab_config, str):  # path to a config.json file
+            with open(vocab_config, "r", encoding="utf-8") as fp:
+                config = json.load(fp)
+                vocab = config["vocab"]
+        elif isinstance(vocab_config, dict):  # already-parsed config object
+            vocab = vocab_config["vocab"]
+
+        self.tokenizer = Tokenizer(espeak_config, vocab=vocab)
 
     @classmethod
     def from_session(
@@ -87,24 +99,21 @@ def _create_audio(
 
         voice = voice[len(tokens)]
         tokens = [[0, *tokens, 0]]
-        if 'input_ids' in [i.name for i in self.sess.get_inputs()]:
+        if "input_ids" in [i.name for i in self.sess.get_inputs()]:
             # Newer export versions
             inputs = {
-                'input_ids': tokens,
-                'style': np.array(voice, dtype=np.float32),
-                'speed': np.array([speed], dtype=np.int32)
+                "input_ids": tokens,
+                "style": np.array(voice, dtype=np.float32),
+                "speed": np.array([speed], dtype=np.int32),
             }
         else:
             inputs = {
-                'tokens': tokens,
-                'style': voice,
-                'speed': np.ones(1, dtype=np.float32) * speed
+                "tokens": tokens,
+                "style": voice,
+                "speed": np.ones(1, dtype=np.float32) * speed,
             }
-        
-        audio = self.sess.run(
-            None,
-            inputs
-        )[0]
+
+        audio = self.sess.run(None, inputs)[0]
         audio_duration = len(audio) / SAMPLE_RATE
         create_duration = time.time() - start_t
         rtf = create_duration / audio_duration
diff --git a/src/kokoro_onnx/config.json b/src/kokoro_onnx/config.json
diff --git a/src/kokoro_onnx/config.py b/src/kokoro_onnx/config.py
diff --git a/src/kokoro_onnx/tokenizer.py b/src/kokoro_onnx/tokenizer.py
diff --git a/uv.lock b/uv.lock