Fix xtts sample rate

This commit is contained in:
matatonic 2024-03-20 16:10:31 -04:00
parent 62c9d3caac
commit 3bc5fb5d21
4 changed files with 69 additions and 28 deletions

View File

@ -1,4 +1,7 @@
FROM nvidia/cuda:11.8.0-base-ubuntu22.04
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
@ -9,12 +12,11 @@ RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
COPY requirements.txt /app/
COPY *.txt /app/
RUN pip install -r requirements.txt
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY *.sh /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
ENV COQUI_TOS_AGREED=1
CMD python main.py --host 0.0.0.0 --port 8000 --preload xtts
CMD python main.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL

View File

@ -24,11 +24,11 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.7.0
Last update: 2024-03-20
Version: 0.7.2, 2024-03-20
* Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
* Quality: Fix xtts sample rate (24000 vs. 22050 for piper)
* use CUDA 12.2-base in Dockerfile
API Documentation
-----------------
@ -56,16 +56,20 @@ Usage
-----
```
usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload_xtts] [-P PORT] [-H HOST]
usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload_xtts Preload the xtts model. By default it's loaded on first use. (default: False)
Set the device for the xtts model. The special value of 'none' will use
piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
```
@ -124,3 +128,31 @@ docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13GB), see: Dockerfile.min. You can edit the `docker-compose.yml` to change this.
Custom Voices Howto
-------------------
Custom voices should be mono 22050 hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds; they can be longer, but longer doesn't always produce better results.
You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
```shell
# convert a multi-channel audio file to mono, set sample rate to 22050 hz, trim to 6 seconds, and output as WAV file.
ffmpeg -i input.mp3 -ac 1 -ar 22050 -t 6 -y me.wav
# use a simple noise filter to clean up audio, and select a start time for sampling.
ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
# A more complex noise reduction setup with volume adjustment
ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
```
Once your WAV file is prepared, save it in the voices/ folder and update the `voice_to_speaker.yaml` file with the new file name.
For example:
```yaml
...
tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts versions
speaker: voices/me.wav # this could be you
```

View File

@ -1,5 +1,4 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
model="xtts" # others are possible, ex. xtts_v2.0.2
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
./download_samples.sh

34
main.py
View File

@ -71,6 +71,21 @@ class GenerateSpeechRequest(BaseModel):
response_format: str = "mp3" # mp3, opus, aac, flac
speed: float = 1.0 # 0.25 - 4.0
def build_ffmpeg_args(sample_rate, response_format):
    """Build the ffmpeg argv that converts raw s16le mono PCM on stdin
    into the requested audio format on stdout.

    :param sample_rate: input PCM sample rate in Hz (str or int), e.g.
        "22050" for piper output or "24000" for xtts output.
    :param response_format: one of "mp3", "opus", "aac", "flac"; any other
        value returns only the input-side args (no output format selected,
        matching the original fall-through behavior).
    :return: list of ffmpeg command-line argument strings.
    """
    # Every argv element must be a string for subprocess; str() also
    # lets callers pass the sample rate as an int.
    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le",
                   "-ar", str(sample_rate), "-ac", "1", "-i", "-"]
    # Dispatch table: output muxer/codec options per response format.
    format_args = {
        "mp3":  ["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"],
        "opus": ["-f", "ogg", "-c:a", "libopus"],
        "aac":  ["-f", "adts", "-c:a", "aac", "-ab", "64k"],
        "flac": ["-f", "flac", "-c:a", "flac"],
    }
    ffmpeg_args.extend(format_args.get(response_format, []))
    return ffmpeg_args
@app.post("/v1/audio/speech", response_class=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
global xtts, args
@ -90,18 +105,7 @@ async def generate_speech(request: GenerateSpeechRequest):
elif response_format == "flac":
media_type = "audio/x-flac"
# Convert the output to the desired format using ffmpeg
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]
if response_format == "mp3":
ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
elif response_format == "opus":
ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
elif response_format == "aac":
ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
elif response_format == "flac":
ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
ffmpeg_args = None
tts_io_out = None
# Use piper for tts-1, and if xtts_device == none use for all models.
@ -119,6 +123,7 @@ async def generate_speech(request: GenerateSpeechRequest):
tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
tts_proc.stdin.close()
tts_io_out = tts_proc.stdout
ffmpeg_args = build_ffmpeg_args("22050", response_format)
# Use xtts for tts-1-hd
elif model == 'tts-1-hd':
@ -128,6 +133,9 @@ async def generate_speech(request: GenerateSpeechRequest):
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
# XXX probably should GC/torch cleanup here
# input sample rate is 22050, output is 24000...
ffmpeg_args = build_ffmpeg_args("24000", response_format)
# tts speed doesn't seem to work well
if speed < 0.5:
speed = speed / 0.5
@ -153,7 +161,7 @@ if __name__ == "__main__":
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts'). By default it's loaded on first use.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")