0.6.0 rc1

matatonic 2023-11-27 23:25:44 -05:00
parent 603ee9d54e
commit ba08f6e8f3
6 changed files with 19 additions and 16 deletions

Dockerfile

@@ -1,12 +1,13 @@
FROM nvidia/cuda:11.8.0-base-ubuntu22.04
RUN apt-get update && \
apt-get install --no-install-recommends -y python-is-python3 python3-pip ffmpeg && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
apt-get clean && rm -rf /var/lib/apt/lists/*
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
RUN pip install -r requirements.txt

Dockerfile.min

@@ -1,8 +1,7 @@
FROM ubuntu:22.04
# tts-1 only
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip install piper-tts
@@ -10,6 +9,7 @@ RUN pip install piper-tts
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
RUN ./download_voices_tts-1.sh

README.md

@@ -40,8 +40,8 @@ Installation instructions
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg
sudo apt install ffmpeg
# install ffmpeg & curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
bash download_voices_tts-1.sh
@@ -120,4 +120,4 @@ docker compose build
docker compose up
```
If you want a minimal docker image with piper only (see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
If you want a minimal docker image with piper support only (900MB vs. 13GB), see `Dockerfile.min`; you can edit the `docker-compose.yml` to change this.
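
Once the server is up (for example after `docker compose up`), it can be exercised with a short client request. A minimal sketch, assuming the OpenAI-compatible `POST /v1/audio/speech` route on port 8000 (the port mapped in `docker-compose.yml`) and the standard `model`/`input`/`voice` payload; the exact field names are assumptions, not taken from this commit:

```python
# Hypothetical smoke test for the running server; route, port and payload
# fields are assumptions based on the OpenAI speech API this project mimics.
import requests

resp = requests.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "tts-1",                       # "tts-1-hd" routes to xtts_v2
        "input": "Hello from openedai-speech.",
        "voice": "alloy",
    },
)
resp.raise_for_status()

with open("speech.mp3", "wb") as out:
    out.write(resp.content)                     # body is the generated audio
```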

docker-compose.yml

@@ -3,14 +3,14 @@ services:
server:
build:
context: .
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
stdin_open: true
tty: true
ports:
- "8000:8000"
volumes:
- .:/app/
# volumes:
# - .:/app/
# Below can be removed if not using GPU
runtime: nvidia
deploy:

main.py

@@ -10,7 +10,6 @@ from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel
from TTS.api import TTS
xtts = None
args = None
@@ -31,7 +30,7 @@ class xtts_wrapper():
speed=speed,
file_path=file_path,
)
os.unlink(file_path)
return tf
@@ -104,7 +103,7 @@ async def generate_speech(request: GenerateSpeechRequest):
tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
tts_proc.stdin.close()
tts_io_out = tts_proc.stdout
# Use xtts_v2 for tts-1-hd
elif model == 'tts-1-hd':
if not xtts:
@@ -115,13 +114,13 @@
# tts speed doesn't seem to work well
if speed < 0.5:
speed = speed / 0.5
ffmpeg_args.extend(["-af", f"atempo=0.5"])
ffmpeg_args.extend(["-af", "atempo=0.5"])
if speed > 1.0:
ffmpeg_args.extend(["-af", f"atempo={speed}"])
speed = 1.0
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
# Pipe the output from piper/xtts to the input of ffmpeg
ffmpeg_args.extend(["-"])
ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
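
The speed handling in the hunk above splits the requested playback rate between the xtts `speed` parameter and ffmpeg's `atempo` filter, since (per the code comment) the xtts speed alone doesn't behave well across the whole range. A minimal sketch of that factor splitting, written as a hypothetical helper rather than code from main.py:

```python
# Illustrative helper (not part of main.py): split a requested playback rate
# into the part xtts renders and the part ffmpeg's atempo filter applies.
def split_speed(speed: float):
    """Return (xtts_speed, atempo_factor or None) for a requested rate."""
    if speed < 0.5:
        # xtts renders at twice the target rate, atempo=0.5 halves it back
        return speed / 0.5, 0.5
    if speed > 1.0:
        # xtts renders at normal rate, atempo speeds the audio up
        return 1.0, speed
    return speed, None

# 0.5 * 0.5 == 0.25 overall; 1.0 * 2.0 == 2.0 overall
assert split_speed(0.25) == (0.5, 0.5)
assert split_speed(2.0) == (1.0, 2.0)
assert split_speed(0.8) == (0.8, None)
```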
@@ -134,7 +133,7 @@ if __name__ == "__main__":
prog='main.py',
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
@@ -143,6 +142,9 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.xtts_device != "none":
from TTS.api import TTS
if args.preload_xtts:
xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
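
The last hunk defers `from TTS.api import TTS` until `--xtts_device` is set to something other than `none`, so piper-only deployments (e.g. Dockerfile.min) never import the heavy Coqui TTS stack. A sketch of the same lazy-load pattern with illustrative names (`load_xtts` is not a function in main.py; the `TTS(...).to(device)` call follows Coqui's documented usage as I understand it):

```python
# Illustrative lazy-load pattern: import and build the expensive xtts model
# only on first use, then reuse the cached instance.
_xtts = None

def load_xtts(device: str = "cuda"):
    """Import Coqui TTS and construct xtts_v2 on first call."""
    global _xtts
    if _xtts is None:
        from TTS.api import TTS  # deferred import: skipped on piper-only setups
        _xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return _xtts
```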

test_voices.sh Normal file → Executable file