Scale almost properly on ONNX export

nshmyrev · nshmyrev · commit dcaf8c9424ba · 2024-01-25T22:57:27.000+01:00
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="vosk-tts",
-    version="0.3.53",
+    version="0.3.54",
     author="Alpha Cephei Inc",
     author_email="contact@alphacephei.com",
     description="Offline text to speech synthesis",
diff --git a/training/stft.py b/training/stft.py
@@ -253,6 +253,9 @@ def inverse(self, magnitude, phase):
             stride=self.hop_length,
             padding=0)
 
+        # scale by the hop ratio (filter_length / hop_length)
+        inverse_transform *= float(self.filter_length) / self.hop_length
+
         inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
         inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):]
 
diff --git a/vosk_tts/synth.py b/vosk_tts/synth.py
@@ -15,7 +15,7 @@ def audio_float_to_int16(self,
         audio: np.ndarray, max_wav_value: float = 32767.0
     ) -> np.ndarray:
         """Normalize audio and convert to int16 range"""
-        audio_norm = audio * max_wav_value * 3.0
+        audio_norm = audio * max_wav_value
         audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
         audio_norm = audio_norm.astype("int16")
         return audio_norm