fix pops in tts-1-hd

2025-06-26 18:16:32 +00:00 · 2024-03-20 18:11:11 -04:00
parent 1a553f18d7
commit 48fa91ec93
2 changed files with 9 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ Details:

 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.

-Version: 0.7.2, 2024-03-20
+Version: 0.7.3, 2024-03-20

 * Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
-* Quality: Fix xtts sample rate (24000 vs. 22050 for piper)
+* Quality: Fix xtts sample rate (24000 vs. 22050 for piper) and audio pops in `tts-1-hd` output
 * use CUDA 12.2-base in Dockerfile

 API Documentation
--- a/main.py
+++ b/main.py
@@ -71,9 +71,12 @@ class GenerateSpeechRequest(BaseModel):
    response_format: str = "mp3" # mp3, opus, aac, flac
    speed: float = 1.0 # 0.25 - 4.0

-def build_ffmpeg_args(sample_rate, response_format):
+def build_ffmpeg_args(response_format, input_format, sample_rate):
    # Convert the output to the desired format using ffmpeg
-    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", sample_rate, "-ac", "1", "-i", "-"]
+    if input_format == 'raw':
+        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", sample_rate, "-ac", "1", "-i", "-"]
+    else:
+        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "WAV", "-i", "-"]
    
    if response_format == "mp3":
        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
@@ -123,7 +126,7 @@ async def generate_speech(request: GenerateSpeechRequest):
        tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
        tts_proc.stdin.close()
        tts_io_out = tts_proc.stdout
-        ffmpeg_args = build_ffmpeg_args("22050", response_format)
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="raw", sample_rate="22050")

    # Use xtts for tts-1-hd
    elif model == 'tts-1-hd':
@@ -138,8 +141,7 @@ async def generate_speech(request: GenerateSpeechRequest):

            xtts = xtts_wrapper(tts_model, device=args.xtts_device)

-        # input sample rate is 22050, output is 24000...
-        ffmpeg_args = build_ffmpeg_args("24000", response_format)
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")

        # tts speed doesn't seem to work well
        if speed < 0.5: