reorder docker, allow different xtts model versions

matatonic 2024-03-20 12:33:32 -04:00
parent 3fe07873f7
commit b4756dc1db
5 changed files with 26 additions and 23 deletions


@@ -6,13 +6,15 @@ RUN apt-get update && \
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 # default clone of the default voice is really bad, use a better default
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
+COPY requirements.txt /app/
 RUN pip install -r requirements.txt
+COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh
 ENV COQUI_TOS_AGREED=1
-CMD python main.py --host 0.0.0.0 --port 8000 --preload_xtts
+CMD python main.py --host 0.0.0.0 --port 8000 --preload xtts
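The reorder appears to follow the usual Docker layer-caching pattern: requirements.txt is copied and pip install run before the rest of the sources are copied, so editing the application files no longer invalidates the dependency layer.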


@@ -9,7 +9,6 @@ RUN pip install piper-tts
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
-COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 RUN ./download_voices_tts-1.sh


@@ -1,5 +1,5 @@
 #!/bin/sh
 export COQUI_TOS_AGREED=1
-model="tts_models/multilingual/multi-dataset/xtts_v2"
+model="xtts" # others are possible, ex. xtts_v2.0.2
 python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
 ./download_samples.sh
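Pinning a specific xtts release only means changing the model variable above; a minimal Python equivalent of the script's one-liner, using the version from the comment as an example:

    from TTS.utils.manage import ModelManager
    # "xtts" (as in the script) appears to select the default xtts release;
    # an explicit version string such as "xtts_v2.0.2" pins it instead
    ModelManager().download_model("xtts_v2.0.2")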

main.py

@@ -17,9 +17,10 @@ args = None
 app = FastAPI()
 class xtts_wrapper():
-    def __init__(self, model_name):
+    def __init__(self, model_name, device):
         global args
-        self.xtts = TTS(model_name=model_name, progress_bar=False).to(args.xtts_device)
+        self.model_name = model_name
+        self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
     def tts(self, text, speaker_wav, speed):
         tf, file_path = tempfile.mkstemp(suffix='.wav')
@@ -119,13 +120,14 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.close()
         tts_io_out = tts_proc.stdout
-    # Use xtts_v2 for tts-1-hd
+    # Use xtts for tts-1-hd
     elif model == 'tts-1-hd':
-        if not xtts:
-            xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
-        model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
+        tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
+        if not xtts or xtts.model_name != tts_model:
+            xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+            # XXX probably should GC/torch cleanup here
         # tts speed doesn't seem to work well
         if speed < 0.5:
             speed = speed / 0.5
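The new XXX comment leaves that cleanup as a TODO. A rough sketch of what it could look like when switching model versions, not code from this commit (load_xtts is a hypothetical helper; it assumes xtts is a module-level variable initialised to None and that torch is importable):

    import gc
    import torch

    def load_xtts(tts_model, device):
        # reuse the already-loaded model when the requested version matches
        global xtts
        if xtts is not None and xtts.model_name == tts_model:
            return xtts
        # drop the last reference to the old wrapper and release GPU memory
        xtts = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        xtts = xtts_wrapper(tts_model, device=device)  # xtts_wrapper as defined above
        return xtts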
@@ -151,7 +153,7 @@ if __name__ == "__main__":
     parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
     parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
-    parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
+    parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts'). By default it's loaded on first use.")
     parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
     parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
@@ -160,7 +162,7 @@ if __name__ == "__main__":
     if args.xtts_device != "none":
         from TTS.api import TTS
-        if args.preload_xtts:
-            xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+        if args.preload:
+            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
-    uvicorn.run(app, host=args.host, port=args.port) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
+    uvicorn.run(app, host=args.host, port=args.port)
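With this change --preload takes a model name instead of acting as a boolean flag; the Dockerfile above now starts the server with --preload xtts, and since the value is passed straight to xtts_wrapper, a pinned version such as xtts_v2.0.2 should work there as well.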


@@ -25,26 +25,26 @@ tts-1:
     speaker: 163
 tts-1-hd:
   alloy:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/alloy-alt.wav
   alloy-orig:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/alloy.wav # it's REALLY BAD
   echo:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/echo.wav
   fable:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/fable.wav
   onyx:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/onyx.wav
   nova:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/nova.wav
   shimmer:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/shimmer.wav
   me:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
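map_voice_to_speaker itself is not part of this diff, so the following is only a sketch of how a lookup over this mapping could work; the file name and return shape are assumptions based on how the result is unpacked in main.py above:

    import yaml

    def map_voice_to_speaker(voice, model):
        # e.g. ('xtts_v2.0.2', 'voices/me.wav') for voice 'me' under 'tts-1-hd'
        with open('voice_to_speaker.yaml') as f:  # file name assumed
            config = yaml.safe_load(f)
        entry = config[model][voice]
        return entry['model'], entry['speaker']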