This commit is contained in:
matatonic 2024-04-26 20:42:33 -04:00
parent a2a3d2b3eb
commit 6864cf03b1
16 changed files with 260 additions and 70 deletions

127
.github/workflows/build-docker.yml vendored Normal file
View File

@ -0,0 +1,127 @@
# Builds and publishes two Docker images to GHCR:
#   - build-and-push-image:     full image (Dockerfile, xtts support)
#   - build-and-push-alt-image: minimal piper-only image (Dockerfile.min)
# Triggered manually, on pushes to main (-> :latest), and on published
# releases / tag pushes (-> :<tag>).
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): the `meta` outputs (steps.meta.outputs.tags/labels) are
      # never consumed below — tags/labels are written by hand instead.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

  build-and-push-alt-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-speech-min
      TAG: ${{ github.sha }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): outputs unused here as well — see note in the first job.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

4
.gitignore vendored
View File

@ -1,4 +1,8 @@
voices/ voices/
.env
speech.env
config/pre_process_map.yaml
config/voice_to_speaker.yaml
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

View File

@ -1,24 +1,17 @@
FROM python:3.11-slim FROM python:3.11-slim
ENV COQUI_TOS_AGREED=1 ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
RUN apt-get update && \ RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg apt-get install --no-install-recommends -y curl git ffmpeg
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app WORKDIR /app
COPY *.txt /app/ COPY *.txt /app/
RUN pip install --no-cache -r requirements.txt RUN pip install --no-cache -r requirements.txt
COPY *.sh /app/ COPY *.sh *.py *.yaml *.md LICENSE config /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
RUN apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND

View File

@ -3,15 +3,13 @@ FROM python:3.11-slim
RUN apt-get update && \ RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl apt-get install --no-install-recommends -y ffmpeg curl
RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/ COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app WORKDIR /app
RUN ./download_voices_tts-1.sh
RUN apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND

View File

@ -25,6 +25,12 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults. If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.10.0, 2024-04-26
* Better upgrades: Reorganize config files under config, voice models under voices
* * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml` you need to move them to the `config/` folder.**
* default listen host to 0.0.0.0
Version: 0.9.0, 2024-04-23 Version: 0.9.0, 2024-04-23
* Fix bug with yaml and loading UTF-8 * Fix bug with yaml and loading UTF-8
@ -54,45 +60,47 @@ API Documentation
Installation instructions Installation instructions
------------------------- -------------------------
You can run the server via docker like so (**recommended**): 1) Download the models & voices
```shell ```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
```
2a) Docker (**recommended**): You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up docker compose up
``` ```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this. If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
Manual instructions: 2b) Manual instructions:
```shell ```shell
# Install the Python requirements # Install the Python requirements
pip install -r requirements.txt pip install -r requirements.txt
# install ffmpeg and curl # install ffmpeg and curl
sudo apt install ffmpeg curl sudo apt install ffmpeg curl
# Download the voice models: python speech.py
# for tts-1
bash download_voices_tts-1.sh
# and for tts-1-hd
bash download_voices_tts-1-hd.sh
``` ```
Usage Usage
----- -----
``` ```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
[-H HOST]
OpenedAI Speech API Server OpenedAI Speech API Server
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, --piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE --xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
piper for all models. (default: cuda) --preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000) -P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost) -H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
``` ```
Sample API Usage Sample API Usage
@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API. Also see the `say.py` sample application for an example of how to use the openai-python API.
``` ```
$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound' $ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file. $ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
``` ```
```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```
Custom Voices Howto Custom Voices Howto
------------------- -------------------

View File

@ -2,7 +2,7 @@
some_other_voice_name_you_want: some_other_voice_name_you_want:
model: voices/choose your own model.onnx model: voices/choose your own model.onnx
speaker: set your own speaker speaker: set your own speaker
alloy: alloy:
model: voices/en_US-libritts_r-medium.onnx model: voices/en_US-libritts_r-medium.onnx
speaker: 79 # 64, 79, 80, 101, 130 speaker: 79 # 64, 79, 80, 101, 130
echo: echo:
@ -24,7 +24,7 @@
model: voices/en_US-libritts_r-medium.onnx model: voices/en_US-libritts_r-medium.onnx
speaker: 163 speaker: 163
tts-1-hd: tts-1-hd:
alloy: alloy:
model: xtts model: xtts
speaker: voices/alloy-alt.wav speaker: voices/alloy-alt.wav
alloy-orig: alloy-orig:

View File

@ -1,16 +1,17 @@
services: services:
server: server:
build: build:
context: .
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"] image: ghcr.io/matatonic/openedai-speech
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"] #image: ghcr.io/matatonic/openedai-speech-min
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min env_file: speech.env
ports: ports:
- "8000:8000" - "8000:8000"
# volumes: volumes:
# - .:/app/ - ./voices:/app/voices
- ./config:/app/config
#restart: unless-stopped # install as a service
# Below can be removed if not using GPU # Below can be removed if not using GPU
runtime: nvidia runtime: nvidia
deploy: deploy:

View File

@ -1,4 +1,4 @@
#!/bin/sh #!/bin/sh
for i in alloy echo fable onyx nova shimmer; do for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav [ ! -e "voices/$i.wav" ] && curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
done done

View File

@ -1,4 +1,9 @@
#!/bin/sh #!/bin/sh
export COQUI_TOS_AGREED=1 export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')" export TTS_HOME=voices
MODELS=${*:-xtts}
for model in $MODELS; do
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
done
./download_samples.sh ./download_samples.sh

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null

View File

@ -3,8 +3,8 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse from fastapi.responses import PlainTextResponse
class OpenAIStub(FastAPI): class OpenAIStub(FastAPI):
def __init__(self) -> None: def __init__(self, **kwargs) -> None:
super().__init__() super().__init__(**kwargs)
self.models = {} self.models = {}
self.add_middleware( self.add_middleware(

View File

@ -1,7 +1,7 @@
fastapi fastapi
uvicorn uvicorn
# piper-tts # piper-tts
piper-tts piper-tts==1.2.0
onnxruntime-gpu onnxruntime-gpu
# xtts # xtts
TTS TTS

6
sample.env Normal file
View File

@ -0,0 +1,6 @@
# Sample environment file — copy to speech.env and edit as needed.
# Point TTS (Coqui) and HuggingFace model caches at the voices/ directory
# (matches `export TTS_HOME=voices` in the download script).
TTS_HOME=voices
HF_HOME=voices
# Optional: model to preload at startup; referenced by CLI_COMMAND below.
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
# Optional: override the container's default start command.
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only

52
say.py
View File

@ -2,6 +2,7 @@
import sys import sys
import os import os
import atexit
import tempfile import tempfile
import argparse import argparse
@ -20,19 +21,23 @@ import openai
def parse_args(argv): def parse_args(argv):
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser(
parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"]) description='Text to speech using the OpenAI API',
parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"]) formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"]) )
parser.add_argument("-s", "--speed", type=float, default=1.0) parser.add_argument("-m", "--model", type=str, default="tts-1", help="The model to use")#, choices=["tts-1", "tts-1-hd"])
parser.add_argument("-i", "--input", type=str) parser.add_argument("-v", "--voice", type=str, default="alloy", help="The voice of the speaker")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"], help="The output audio format")
parser.add_argument("-s", "--speed", type=float, default=1.0, help="playback speed, 0.25-4.0")
parser.add_argument("-t", "--text", type=str, default=None, help="Provide text to read on the command line")
parser.add_argument("-i", "--input", type=str, default=None, help="Read text from a file (default is to read from stdin)")
if playsound is None: if playsound is None:
parser.add_argument("-o", "--output", type=str) # required parser.add_argument("-o", "--output", type=str, help="The filename to save the output to") # required
parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound") parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
else: else:
parser.add_argument("-o", "--output", type=str, default=None) # not required parser.add_argument("-o", "--output", type=str, default=None, help="The filename to save the output to") # not required
parser.add_argument("-p", "--playsound", action="store_true") parser.add_argument("-p", "--playsound", action="store_true", help="Play the audio")
args = parser.parse_args(argv) args = parser.parse_args(argv)
@ -50,6 +55,17 @@ if __name__ == "__main__":
print("Must select one of playsound (-p) or output file name (-o)") print("Must select one of playsound (-p) or output file name (-o)")
sys.exit(1) sys.exit(1)
if args.input is None and args.text is None:
text = sys.stdin.read()
elif args.text:
text = args.text
elif args.input:
if os.path.exists(args.input):
with open(args.input, 'r') as f:
text = f.read()
else:
print(f"Warning! File not found: {args.input}\nFalling back to old behavior for -i")
text = args.input
client = openai.OpenAI( client = openai.OpenAI(
# This part is not needed if you set these environment variables before import openai # This part is not needed if you set these environment variables before import openai
@ -60,21 +76,21 @@ if __name__ == "__main__":
) )
if args.playsound and args.output is None: if args.playsound and args.output is None:
tf, args.output = file_path = tempfile.mkstemp(suffix='.wav') _, args.output = tempfile.mkstemp(suffix='.wav')
else:
tf = None def cleanup():
os.unlink(args.output)
atexit.register(cleanup)
with client.audio.speech.with_streaming_response.create( with client.audio.speech.with_streaming_response.create(
model=args.model, model=args.model,
voice=args.voice, voice=args.voice,
speed=args.speed, speed=args.speed,
response_format=args.format, response_format=args.format,
input=args.input, input=text,
) as response: ) as response:
response.stream_to_file(args.output) response.stream_to_file(args.output)
if args.playsound: if args.playsound:
playsound(args.output) playsound(args.output)
if tf:
os.unlink(args.output)

View File

@ -65,9 +65,18 @@ class parler_tts():
return tf return tf
def default_exists(filename: str):
if not os.path.exists(filename):
basename, ext = os.path.splitext(filename)
default = f"{basename}.default{ext}"
with open(default, 'r') as from_file:
with open(filename, 'w') as to_file:
to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server # Read pre process map on demand so it can be changed without restarting the server
def preprocess(raw_input): def preprocess(raw_input):
with open('pre_process_map.yaml', 'r', encoding='utf8') as file: default_exists('config/pre_process_map.yaml')
with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
pre_process_map = yaml.safe_load(file) pre_process_map = yaml.safe_load(file)
for a, b in pre_process_map: for a, b in pre_process_map:
raw_input = re.sub(a, b, raw_input) raw_input = re.sub(a, b, raw_input)
@ -75,9 +84,10 @@ def preprocess(raw_input):
# Read voice map on demand so it can be changed without restarting the server # Read voice map on demand so it can be changed without restarting the server
def map_voice_to_speaker(voice: str, model: str): def map_voice_to_speaker(voice: str, model: str):
with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file: default_exists('config/voice_to_speaker.yaml')
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file) voice_map = yaml.safe_load(file)
return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'])
class GenerateSpeechRequest(BaseModel): class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd" model: str = "tts-1" # or "tts-1-hd"
@ -197,7 +207,7 @@ if __name__ == "__main__":
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.") parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.") parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port") parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0") parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
args = parser.parse_args() args = parser.parse_args()