0.10.0

2025-06-26 18:16:32 +00:00 · 2024-04-26 20:42:33 -04:00 · 2024-04-26 20:42:33 -04:00 · 6864cf03b1
commit 6864cf03b1
parent a2a3d2b3eb
16 changed files with 260 additions and 70 deletions
--- a/.github/workflows/build-docker.yml
+++ b/.github/workflows/build-docker.yml
@ -0,0 +1,127 @@
+# Builds the project's Docker images and publishes them to the GitHub Container Registry.
+name: Build and Publish Docker Image
+
+# Runs on manual dispatch, on pushes to main, and when a release is published.
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'main'
+  release:
+    types: [published]
+
+jobs:
+  # Builds the image described by `Dockerfile` and pushes it to
+  # ghcr.io/<github.repository>, tagged `latest` for the main branch or with
+  # the ref name for tag refs. Any other ref builds nothing.
+  build-and-push-image:
+    runs-on: ubuntu-latest
+
+    # `packages: write` lets the GITHUB_TOKEN used at login push to the registry.
+    permissions:
+      contents: read
+      packages: write
+
+    env:
+      # Set up environment variables for the job
+      DOCKER_REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}
+      # NOTE(review): TAG is not referenced by any step in this job.
+      TAG: ${{ github.sha }}
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          install: true
+
+      # Log in to the GitHub Container Registry. The step is unconditional; this
+      # workflow has no pull_request trigger, so it runs on every event.
+      - name: Login to Docker Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not used
+      # by the build steps below, which set their tags and labels by hand.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      # Build and push the Docker image to GHCR as `latest` when the ref is the main branch
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
+
+  # Builds the image described by `Dockerfile.min` and pushes it to
+  # ghcr.io/matatonic/openedai-speech-min, tagged `latest` for the main branch
+  # or with the ref name for tag refs. Any other ref builds nothing.
+  build-and-push-alt-image:
+    runs-on: ubuntu-latest
+
+    # `packages: write` lets the GITHUB_TOKEN used at login push to the registry.
+    permissions:
+      contents: read
+      packages: write
+
+    env:
+      # Set up environment variables for the job
+      DOCKER_REGISTRY: ghcr.io
+      # NOTE(review): hard-coded owner/name rather than derived from
+      # github.repository, so a fork would try to push to this exact package.
+      IMAGE_NAME: matatonic/openedai-speech-min
+      # NOTE(review): TAG is not referenced by any step in this job.
+      TAG: ${{ github.sha }}
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          install: true
+
+      # Log in to the GitHub Container Registry. The step is unconditional; this
+      # workflow has no pull_request trigger, so it runs on every event.
+      - name: Login to Docker Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not used
+      # by the build steps below, which set their tags and labels by hand.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      # Build and push the Docker image to GHCR as `latest` when the ref is the main branch
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile.min
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: Dockerfile.min
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
+
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,8 @@
 voices/
+.env
+speech.env
+config/pre_process_map.yaml
+config/voice_to_speaker.yaml

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/13
+++ b/13
@ -1,24 +1,17 @@
 FROM python:3.11-slim

 ENV COQUI_TOS_AGREED=1
-ENV PRELOAD_MODEL=xtts
-# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1

 RUN apt-get update && \
    apt-get install --no-install-recommends -y curl git ffmpeg

-#RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-# default clone of the default voice is really bad, use a better default
-COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 COPY *.txt /app/
 RUN pip install --no-cache -r requirements.txt
-COPY *.sh /app/
-RUN ./download_voices_tts-1.sh
-RUN ./download_voices_tts-1-hd.sh
-COPY *.py *.yaml *.md LICENSE /app/
+# Copy the application files into /app. COPY with a directory source copies the
+# directory's contents, not the directory itself, so `config` gets its own
+# destination; speech.py opens config/*.default.yaml relative to /app.
+COPY *.sh *.py *.yaml *.md LICENSE /app/
+COPY config /app/config/

 RUN apt-get clean && rm -rf /var/lib/apt/lists/*

-CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
+# Shell-form CMD: the command line comes from CLI_COMMAND, so it can be
+# replaced at run time by setting that environment variable.
+ENV CLI_COMMAND="python speech.py"
+CMD $CLI_COMMAND
--- a/Dockerfile.min
+++ b/Dockerfile.min
@ -3,15 +3,13 @@ FROM python:3.11-slim
 RUN apt-get update && \
    apt-get install --no-install-recommends -y ffmpeg curl

-RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
+RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn

-#RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
+# speech.py opens config/*.default.yaml relative to /app, so ship the config
+# directory in the image (a directory source needs its own destination).
+COPY config /app/config/
 WORKDIR /app

-RUN ./download_voices_tts-1.sh
-
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*

-CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none
+ENV CLI_COMMAND="python speech.py --xtts_device none"
+CMD $CLI_COMMAND
--- a/README.md
+++ b/README.md
@ -25,6 +25,12 @@ Details:
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.


+Version: 0.10.0, 2024-04-26
+
+* Better upgrades: Reorganize config files under config, voice models under voices
+* * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml` you need to move them to the `config/` folder.**
+* default listen host to 0.0.0.0
+
 Version: 0.9.0, 2024-04-23

 * Fix bug with yaml and loading UTF-8
@ -54,45 +60,47 @@ API Documentation
 Installation instructions
 -------------------------

-You can run the server via docker like so (**recommended**):
+1) Download the models & voices
 ```shell
+# for tts-1 / piper
+bash download_voices_tts-1.sh
+# and for tts-1-hd / xtts
+bash download_voices_tts-1-hd.sh
+```
+
+2a) Docker (**recommended**): You can run the server via docker like so:
+```shell
+cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
 docker compose up
 ```
-If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
+If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min), you can edit the `docker-compose.yml` to easily change this.

-Manual instructions:
+2b) Manual instructions:
 ```shell
 # Install the Python requirements
 pip install -r requirements.txt
 # install ffmpeg and curl
 sudo apt install ffmpeg curl
-# Download the voice models:
-# for tts-1
-bash download_voices_tts-1.sh
-# and for tts-1-hd
-bash download_voices_tts-1-hd.sh
+python speech.py
 ```

 Usage
 -----

 ```
-usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
-                 [-H HOST]
+usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]

 OpenedAI Speech API Server

 options:
  -h, --help            show this help message and exit
-  --piper_cuda          Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
-                        but cpu is fast enough (default: False)
+  --piper_cuda          Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
  --xtts_device XTTS_DEVICE
-                        Set the device for the xtts model. The special value of 'none' will use
-                        piper for all models. (default: cuda)
-  --preload PRELOAD     Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
-                        first use. (default: None)
+                        Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
+  --preload PRELOAD     Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
  -P PORT, --port PORT  Server tcp port (default: 8000)
-  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: localhost)
+  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
+
 ```

 Sample API Usage
@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
 Also see the `say.py` sample application for an example of how to use the openai-python API.

 ```
-$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
-$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
+$ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
+$ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
 ```

+```
+usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
+
+Text to speech using the OpenAI API
+
+options:
+  -h, --help            show this help message and exit
+  -m MODEL, --model MODEL
+                        The model to use (default: tts-1)
+  -v VOICE, --voice VOICE
+                        The voice of the speaker (default: alloy)
+  -f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
+                        The output audio format (default: mp3)
+  -s SPEED, --speed SPEED
+                        playback speed, 0.25-4.0 (default: 1.0)
+  -t TEXT, --text TEXT  Provide text to read on the command line (default: None)
+  -i INPUT, --input INPUT
+                        Read text from a file (default is to read from stdin) (default: None)
+  -o OUTPUT, --output OUTPUT
+                        The filename to save the output to (default: None)
+  -p, --playsound       Play the audio (default: False)
+```

 Custom Voices Howto
 -------------------
--- a/config/pre_process_map.default.yaml
+++ b/config/pre_process_map.default.yaml
--- a/config/voice_to_speaker.default.yaml
+++ b/config/voice_to_speaker.default.yaml
@ -2,7 +2,7 @@
  some_other_voice_name_you_want:
    model: voices/choose your own model.onnx
    speaker: set your own speaker
-  alloy: 
+  alloy:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 79 # 64, 79, 80, 101, 130
  echo:
@ -24,7 +24,7 @@
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
 tts-1-hd:
-  alloy: 
+  alloy:
    model: xtts
    speaker: voices/alloy-alt.wav
  alloy-orig: 
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,16 +1,17 @@
 services:
  server:
    build:
-      context: .
      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
-    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000",  "--preload", "parler-tts/parler_tts_mini_v0.1"]
-    command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
-    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
+    image: ghcr.io/matatonic/openedai-speech
+    #image: ghcr.io/matatonic/openedai-speech-min
+    env_file: speech.env
    ports:
      - "8000:8000"
-#    volumes:
-#      - .:/app/
+    volumes:
+      - ./voices:/app/voices
+      - ./config:/app/config
+    #restart: unless-stopped # install as a service
    # Below can be removed if not using GPU
    runtime: nvidia
    deploy:
--- a/download_samples.sh
+++ b/download_samples.sh
@ -1,4 +1,4 @@
 #!/bin/sh
 for i in alloy echo fable onyx nova shimmer; do
-	curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
+	# Only fetch samples that are not already present. An explicit `if` keeps the
+	# exit status at 0 when a file is skipped; a `[ ... ] && cmd` list would make
+	# the loop, and so the script, return failure if the last voice already exists.
+	if [ ! -e "voices/$i.wav" ]; then
+		curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
+	fi
 done
--- a/download_voices_tts-1-hd.sh
+++ b/download_voices_tts-1-hd.sh
@ -1,4 +1,9 @@
 #!/bin/sh
+# Download the model(s) for tts-1-hd, then run download_samples.sh.
+# Usage: download_voices_tts-1-hd.sh [model ...]   (defaults to "xtts")
 export COQUI_TOS_AGREED=1
-python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
+# NOTE(review): presumably makes the TTS library store its models under voices/ - confirm.
+export TTS_HOME=voices
+
+# Use the command-line arguments as the model list, or "xtts" when none are given.
+MODELS=${*:-xtts}
+for model in $MODELS; do
+	python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
+done
 ./download_samples.sh
--- a/download_voices_tts-1.sh
+++ b/download_voices_tts-1.sh
@ -1,5 +1,5 @@
 #!/bin/sh
-models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high 
+models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
 piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
 for i in $models ; do
 	piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
--- a/openedai.py
+++ b/openedai.py
@ -3,8 +3,8 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import PlainTextResponse

 class OpenAIStub(FastAPI):
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
        self.models = {}
            
        self.add_middleware(
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
 fastapi
 uvicorn
 # piper-tts
-piper-tts
+piper-tts==1.2.0
 onnxruntime-gpu
 # xtts
 TTS
--- a/sample.env
+++ b/sample.env
@ -0,0 +1,6 @@
+# Sample environment file: copy it to speech.env, which docker-compose.yml
+# loads through `env_file`, and edit it to suit your environment.
+TTS_HOME=voices
+HF_HOME=voices
+# Optional model name, referenced by the first CLI_COMMAND example below.
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+# CLI_COMMAND replaces the default set by `ENV CLI_COMMAND` in the Dockerfiles,
+# which the image runs via `CMD $CLI_COMMAND`.
+# NOTE(review): confirm $PRELOAD_MODEL is expanded when this file is loaded; the
+# shell expands $CLI_COMMAND once and does not re-expand variables in its value.
+#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
+#CLI_COMMAND="python speech.py --xtts_device none" # for piper only
--- a/say.py
+++ b/say.py
@ -2,6 +2,7 @@

 import sys
 import os
+import atexit
 import tempfile
 import argparse

@ -20,19 +21,23 @@ import openai


 def parse_args(argv):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"])
-    parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
-    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"])
-    parser.add_argument("-s", "--speed", type=float, default=1.0)
-    parser.add_argument("-i", "--input", type=str)
+    parser = argparse.ArgumentParser(
+        description='Text to speech using the OpenAI API',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("-m", "--model", type=str, default="tts-1", help="The model to use")#, choices=["tts-1", "tts-1-hd"])
+    parser.add_argument("-v", "--voice", type=str, default="alloy", help="The voice of the speaker")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
+    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"], help="The output audio format")
+    parser.add_argument("-s", "--speed", type=float, default=1.0, help="playback speed, 0.25-4.0")
+    parser.add_argument("-t", "--text", type=str, default=None, help="Provide text to read on the command line")
+    parser.add_argument("-i", "--input", type=str, default=None, help="Read text from a file (default is to read from stdin)")
    
    if playsound is None:
-        parser.add_argument("-o", "--output", type=str) # required
+        parser.add_argument("-o", "--output", type=str, help="The filename to save the output to") # required
        parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
    else:
-        parser.add_argument("-o", "--output", type=str, default=None) # not required
-        parser.add_argument("-p", "--playsound", action="store_true")
+        parser.add_argument("-o", "--output", type=str, default=None, help="The filename to save the output to") # not required
+        parser.add_argument("-p", "--playsound", action="store_true", help="Play the audio")

    args = parser.parse_args(argv)

@ -50,6 +55,17 @@ if __name__ == "__main__":
        print("Must select one of playsound (-p) or output file name (-o)")
        sys.exit(1)

+    # Choose the text to speak: -t/--text first, then the file named by
+    # -i/--input, and otherwise whatever arrives on stdin.
+    if args.text:
+        text = args.text
+    elif args.input:
+        if os.path.exists(args.input):
+            with open(args.input, 'r') as f:
+                text = f.read()
+        else:
+            # -i used to carry the text itself; keep accepting that form with a warning.
+            print(f"Warning! File not found: {args.input}\nFalling back to old behavior for -i")
+            text = args.input
+    else:
+        # Also reached for empty -t/-i values, which previously matched no branch
+        # and left `text` unassigned (NameError at the API call).
+        text = sys.stdin.read()

    client = openai.OpenAI(
        # This part is not needed if you set these environment variables before import openai
@ -60,21 +76,21 @@ if __name__ == "__main__":
    )

    if args.playsound and args.output is None:
-        tf, args.output = file_path = tempfile.mkstemp(suffix='.wav')
-    else:
-        tf = None
+        # mkstemp returns an open OS-level descriptor together with the path. Only
+        # the path is needed here, so close the descriptor instead of leaking it.
+        fd, args.output = tempfile.mkstemp(suffix='.wav')
+        os.close(fd)
+
+        def cleanup():
+            # Remove the temporary audio file when the interpreter exits.
+            os.unlink(args.output)
+
+        atexit.register(cleanup)

    with client.audio.speech.with_streaming_response.create(
        model=args.model,
        voice=args.voice,
        speed=args.speed,
        response_format=args.format,
-        input=args.input,
+        input=text,
    ) as response:
        response.stream_to_file(args.output)

-    if args.playsound:
-        playsound(args.output)
-    
-    if tf:
-        os.unlink(args.output)
+        if args.playsound:
+            playsound(args.output)
--- a/speech.py
+++ b/speech.py
@ -65,9 +65,18 @@ class parler_tts():
        return tf


+def default_exists(filename: str):
+    """Create `filename` from its `.default` sibling if it does not exist yet.
+
+    For `config/foo.yaml` the template is `config/foo.default.yaml`. An
+    existing `filename` is left untouched. Raises FileNotFoundError (from
+    `open`) when `filename` is missing and the template is missing too.
+    """
+    if not os.path.exists(filename):
+        basename, ext = os.path.splitext(filename)
+        default = f"{basename}.default{ext}"
+        # Copy bytes rather than text: the callers read these files as UTF-8, and
+        # a text-mode copy would decode and re-encode with the platform default.
+        with open(default, 'rb') as from_file:
+            with open(filename, 'wb') as to_file:
+                to_file.write(from_file.read())
+
 # Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
-    with open('pre_process_map.yaml', 'r', encoding='utf8') as file:
+    default_exists('config/pre_process_map.yaml')
+    with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
        pre_process_map = yaml.safe_load(file)
        for a, b in pre_process_map:
            raw_input = re.sub(a, b, raw_input)
@ -75,9 +84,10 @@ def preprocess(raw_input):

 # Read voice map on demand so it can be changed without restarting the server
 def map_voice_to_speaker(voice: str, model: str):
-    with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
+    """Look up the backend model and speaker configured for `voice` under `model`.
+
+    Reads config/voice_to_speaker.yaml on every call, after default_exists() has
+    been asked to create it from the .default file if needed.
+    Returns a (model, speaker) tuple taken from voice_map[model][voice].
+    NOTE(review): an unknown model or voice presumably surfaces as KeyError - confirm.
+    """
+    default_exists('config/voice_to_speaker.yaml')
+    with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
         voice_map = yaml.safe_load(file)
-        return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], 
+        return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'])

 class GenerateSpeechRequest(BaseModel):
    model: str = "tts-1" # or "tts-1-hd"
@ -197,7 +207,7 @@ if __name__ == "__main__":
    parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
    parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
    parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
-    parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
+    parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")

    args = parser.parse_args()