Merge pull request #21 from matatonic/dev

0.13.0
matatonic 2024-06-25 17:28:02 -04:00 committed by GitHub
commit 65c03e3448
20 changed files with 492 additions and 161 deletions


@@ -55,6 +55,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -66,8 +67,9 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-and-push-alt-image:
build-and-push-min-image:
runs-on: ubuntu-latest
permissions:
@@ -113,6 +115,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -124,4 +127,70 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-and-push-rocm-image:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
env:
# Set up environment variables for the job
USE_ROCM: 1
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}-rocm
TAG: ${{ github.sha }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true
# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
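
Once this workflow runs on `main`, the ROCm image should be pullable from GHCR (a sketch; the name follows the `IMAGE_NAME` pattern above and the tag mirrors the other jobs):

```shell
# hypothetical pull of the image produced by the job above
docker pull ghcr.io/matatonic/openedai-speech-rocm:latest
```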


@@ -1,23 +1,28 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg git && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN mkdir -p voices config
COPY requirements.txt /app/
ARG USE_ROCM
ENV USE_ROCM=${USE_ROCM}
COPY requirements*.txt /app/
RUN if [ "${USE_ROCM}" = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
ARG PRELOAD_MODEL
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
ENV COQUI_TOS_AGREED=1
CMD bash startup.sh
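
To reproduce a multi-arch build like the workflow's on a local machine, something like this should work (a sketch, assuming Docker Buildx is installed; the `-t` tags are illustrative):

```shell
# build for both platforms listed in the workflow; add --push to publish
docker buildx build --platform linux/amd64,linux/arm64 -t openedai-speech:local .
# pass the new build-arg for the ROCm variant
docker buildx build --build-arg USE_ROCM=1 -t openedai-speech:rocm-local .
```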


@@ -1,20 +1,20 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN mkdir -p voices config
RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn loguru numpy\<2
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY requirements*.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
CMD bash startup.min.sh

README.md

@@ -19,6 +19,7 @@ Details:
* Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
* 🌐 [Multilingual](#multilingual) support with XTTS voices
* [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml`
@@ -26,6 +27,16 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
## Recent Changes
Version 0.13.0, 2024-06-25
* Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi; MPS is not supported in XTTS/torch), thanks @JakeStevenson, @hchasens
* Initial attempt at AMD GPU (ROCm 5.7) support
* Parler-tts support removed
* Move the *.default.yaml to the root folder
* Run the docker as a service by default (`restart: unless-stopped`)
* Added `audio_reader.py` for streaming text input and reading long texts
Version 0.12.3, 2024-06-17
* Additional logging details for BadRequests (400)
@@ -75,23 +86,24 @@ Version: 0.7.3, 2024-03-20
## Installation instructions
1) Copy the `sample.env` to `speech.env` (customize if needed)
### Create a `speech.env` environment file
Copy the `sample.env` to `speech.env` (customize if needed)
```bash
cp sample.env speech.env
```
2. Option: Docker (**recommended**) (prebuilt images are available)
Run the server:
```shell
docker compose up
#### Defaults
```bash
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#EXTRA_ARGS=--log-level DEBUG
#USE_ROCM=1
```
For a minimal docker image with only piper support (<1GB vs. 8GB), use `docker compose -f docker-compose.min.yml up`
To install the docker image as a service, edit the `docker-compose.yml` and uncomment `restart: unless-stopped`, then start the service with: `docker compose up -d`
2. Option: Manual installation:
### Option A: Manual installation
```shell
# install curl and ffmpeg
sudo apt install curl ffmpeg
@@ -99,38 +111,43 @@ sudo apt install curl ffmpeg
python -m venv .venv
source .venv/bin/activate
# Install the Python requirements
# - use requirements-rocm.txt for AMD GPU (ROCm support)
# - use requirements-min.txt for piper only (CPU only)
pip install -r requirements.txt
# run the server
bash startup.sh
```
> On first run, the voice models will be downloaded automatically. This might take a while depending on your network connection.
## Usage
### Option B: Docker Image (*recommended*)
```
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the log level (default: INFO)
#### Nvidia GPU (cuda)
```shell
docker compose up
```
## API Documentation
#### AMD GPU (ROCm support)
* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
```shell
docker compose -f docker-compose.rocm.yml up
```
#### ARM64 (Apple M-series, Raspberry Pi)
> XTTS only has CPU support here and will be very slow; you can use the Nvidia image for XTTS on CPU (slow), or use the piper-only image (recommended)
#### CPU only, No GPU (piper only)
> For a minimal docker image with only piper support (<1GB vs. 8GB).
```shell
docker compose -f docker-compose.min.yml up
```
### Sample API Usage
## Sample Usage
You can use it like this:
@@ -147,7 +164,7 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -
Or just like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
"input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```
@@ -175,33 +192,24 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.
```shell
python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
# play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -p
# save to a file in flac format
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac
```
You can also try the included `audio_reader.py` for listening to longer text and streamed input.
Example usage:
```bash
python audio_reader.py -s 2 < LICENSE # read the software license - fast
```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
## OpenAI API Documentation and Guide
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
```
## Custom Voices Howto
@@ -251,13 +259,13 @@ For example:
...
tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts versions
model: xtts
speaker: voices/me.wav # this could be you
```
## Multilingual
Multilingual support was added in version 0.11.0 and is available only with the XTTS v2 model.
Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper, simply download a language-specific voice.
Coqui XTTSv2 has support for 16 languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Japanese (`ja`), Hungarian (`hu`) and Korean (`ko`).
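
The `add_voice.py` helper added in this commit can register a cloned voice for a specific language (a sketch; `voices/marie.wav` is a hypothetical sample file):

```shell
# adds a 'marie' entry with 'language: fr' under tts-1-hd
# in config/voice_to_speaker.yaml
python add_voice.py voices/marie.wav -n marie --language fr
```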
@@ -284,3 +292,24 @@ Remove:
These lines were added to the `config/pre_process_map.yaml` config file by default before version 0.11.0:
4) Your new multi-lingual speaker voice is ready to use!
## Custom Fine-Tuned Model Support
Adding a custom xtts model is simple. Here is an example of how to add a custom fine-tuned 'halo' XTTS model.
1) Save the model folder under `voices/` (all 4 files are required, including the vocab.json from the model)
```
openedai-speech$ ls voices/halo/
config.json vocab.json model.pth sample.wav
```
2) Add the custom voice entry under the `tts-1-hd` section of `config/voice_to_speaker.yaml`:
```yaml
tts-1-hd:
...
halo:
model: halo # This name is required to be unique
speaker: voices/halo/sample.wav # voice sample is required
model_path: voices/halo
```
3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
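
Once loaded, the custom voice is used like any other (a sketch following the curl examples above, with `halo` from step 2):

```shell
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
  "model": "tts-1-hd",
  "voice": "halo",
  "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```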

add_voice.py Executable file

@@ -0,0 +1,63 @@
#!/usr/bin/env python
import argparse
import os
import shutil
import yaml
print("!! WARNING EXPERIMENTAL !! - THIS TOOL WILL ERASE ALL COMMENTS FROM THE CONFIG FILES .. OR WORSE!!")
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('sample', action='store', help="Set the wav sample file")
parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)")
parser.add_argument('-l', '--language', action='store', default="en", help="Set the language for the voice",
choices=['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'])
parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice")
parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)")
parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model")
parser.add_argument('--config-path', action='store', default="config/voice_to_speaker.yaml", help="Set the config file path")
parser.add_argument('--voice-path', action='store', default="voices", help="Set the default voices file path")
parser.add_argument('--default-path', action='store', default="voice_to_speaker.default.yaml", help="Set the default config file path")
args = parser.parse_args()
basename = os.path.basename(args.sample)
name_noext, ext = os.path.splitext(basename)
if not args.name:
args.name = name_noext
else:
basename = f"{args.name}.wav"
dest_file = os.path.join(args.voice_path, basename)
if args.sample != dest_file:
shutil.copy2(args.sample, dest_file)
if not os.path.exists(args.config_path):
shutil.copy2(args.default_path, args.config_path)
with open(args.config_path, 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
model_conf = voice_map.get(args.openai_model, {})
model_conf[args.name] = {
'model': args.xtts_model,
'speaker': os.path.join(args.voice_path, basename),
'language': args.language,
}
if args.model_path:
model_conf[args.name]['model_path'] = args.model_path
voice_map[args.openai_model] = model_conf
with open(args.config_path, 'w', encoding='utf8') as ofile:
yaml.safe_dump(voice_map, ofile, default_flow_style=False, allow_unicode=True)
print(f"Updated: {args.config_path}")
print(f"Added voice: {args.openai_model}/{args.name}")
print(f"Added section:")
print(f"{args.openai_model}:")
print(f" {args.name}:")
print(f" model: {model_conf[args.name]['model']}")
print(f" speaker: {model_conf[args.name]['speaker']}")
print(f" language: {model_conf[args.name]['language']}")

audio_reader.py Executable file

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
try:
import dotenv
dotenv.load_dotenv()
except ImportError:
pass
import argparse
import os
import pysbd
import queue
import sys
import tempfile
import threading
import shutil
import contextlib
import openai
try:
from playsound import playsound
except ImportError:
print("Error: missing required package 'playsound'. !pip install playsound")
sys.exit(1)
@contextlib.contextmanager
def tempdir():
path = tempfile.mkdtemp()
try:
yield path
finally:
try:
shutil.rmtree(path)
except IOError:
sys.stderr.write('Failed to clean up temp dir {}'.format(path))
class SimpleAudioPlayer:
def __init__(self):
self._queue = queue.Queue()
self.running = True
self._thread = threading.Thread(target=self.__play_audio_loop, daemon=True)
self._thread.start()
def put(self, file):
self._queue.put(file)
def stop(self):
self.running = False
self._thread.join()
try:
while True:
file = self._queue.get_nowait()
if os.path.exists(file):
os.unlink(file)
except queue.Empty as e:
pass
def __play_audio_loop(self):
while self.running:
try:
while True:
file = self._queue.get(block=True, timeout=0.01)
try:
playsound(file)
finally:
os.unlink(file)
except queue.Empty as e:
continue
class OpenAI_tts:
def __init__(self, model, voice, speed, base_dir):
self.base_dir = base_dir
self.openai_client = openai.OpenAI(
# export OPENAI_API_KEY=sk-11111111111
# export OPENAI_BASE_URL=http://localhost:8000/v1
api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
)
self.params = {
'model': model,
'voice': voice,
'speed': speed
}
def speech_to_file(self, text: str) -> str:
with self.openai_client.audio.speech.with_streaming_response.create(
input=text, response_format='opus', **self.params
) as response:
tf, output_filename = tempfile.mkstemp(suffix='.wav', prefix="audio_reader_", dir=self.base_dir)
response.stream_to_file(output_filename)
return output_filename
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Text to speech player',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--model', action='store', default="tts-1", help="The OpenAI model")
parser.add_argument('-v', '--voice', action='store', default="alloy", help="The voice to use")
parser.add_argument('-s', '--speed', action='store', default=1.0, type=float, help="How fast to read the audio")
args = parser.parse_args()
try:
with tempdir() as base_dir:
player = SimpleAudioPlayer()
reader = OpenAI_tts(voice=args.voice, model=args.model, speed=args.speed, base_dir=base_dir)
seg = pysbd.Segmenter(language='en', clean=True) # text is dirty, clean it up.
for raw_line in sys.stdin:
for line in seg.segment(raw_line):
if not line:
continue
print(line)
player.put(reader.speech_to_file(line))
player.stop()
except KeyboardInterrupt:
pass
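
Besides the LICENSE example in the README, any text stream works (a sketch; options per the argument parser above):

```shell
# read a long document aloud with the hd model and the onyx voice
python audio_reader.py -m tts-1-hd -v onyx < README.md
```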


@@ -10,4 +10,4 @@ services:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
#restart: unless-stopped
restart: unless-stopped

docker-compose.rocm.yml Normal file

@@ -0,0 +1,27 @@
services:
server:
build:
dockerfile: Dockerfile
args:
- USE_ROCM=1
image: ghcr.io/matatonic/openedai-speech-rocm
env_file: speech.env
ports:
- "8000:8000"
volumes:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
restart: unless-stopped
# For AMD GPU (ROCm) Support
cap_add:
- SYS_PTRACE
devices:
- /dev/kfd
- /dev/dri
security_opt:
- seccomp=unconfined
group_add:
- video
- audio
ipc: host
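
To start the service defined above (a sketch; `up -d` keeps it running in the background, matching `restart: unless-stopped`):

```shell
docker compose -f docker-compose.rocm.yml up -d
```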


@@ -10,9 +10,7 @@ services:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
#restart: unless-stopped
# Set nvidia runtime if it's not the default
#runtime: nvidia
restart: unless-stopped
deploy:
resources:
reservations:


@@ -1,4 +1,4 @@
# regex pairs to clean the text before speaking
# regex pairs to clean the text before speaking
- - ([^.])\.$
- \1
- - '&amp;'

requirements-min.txt Normal file

@@ -0,0 +1,6 @@
pyyaml
fastapi
uvicorn
loguru
numpy<2
piper-tts==1.2.0

requirements-rocm.txt Normal file

@@ -0,0 +1,14 @@
fastapi
uvicorn
loguru
# piper-tts
piper-tts==1.2.0
# xtts
TTS
# XXX, 3.8+ has some issue for now
spacy==3.7.4
# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
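
A note on the two torch lines above: placing `--index-url` inside the `;` environment-marker section is not standard requirements-file syntax, so pip may reject these lines. If installation fails, a more conventional sketch is to install the ROCm wheels explicitly first (assuming Linux with ROCm 5.7):

```shell
pip install torch==2.2.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.7
# then the rest (with the two torch lines removed from requirements-rocm.txt)
pip install -r requirements-rocm.txt
```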


@@ -5,10 +5,15 @@ loguru
piper-tts==1.2.0
# xtts
TTS
# Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2
# XXX, 3.8+ has some issue for now
spacy==3.7.4
# parler-tts
git+https://github.com/huggingface/parler-tts.git
# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2; sys_platform != "darwin"
torchaudio; sys_platform != "darwin"
# for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio
torch==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
# ROCm (Linux only) - use requirements-rocm.txt


@@ -2,4 +2,4 @@ TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
#EXTRA_ARGS=--log-level DEBUG
#USE_ROCM=1

speech.py

@@ -11,71 +11,52 @@ import uvicorn
from pydantic import BaseModel
from loguru import logger
# for parler
try:
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, logging
import torch
import soundfile as sf
logging.set_verbosity_error()
has_parler_tts = True
except ImportError:
logger.info("No parler support found")
has_parler_tts = False
from openedai import OpenAIStub, BadRequestError
from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError
xtts = None
args = None
app = OpenAIStub()
class xtts_wrapper():
def __init__(self, model_name, device):
def __init__(self, model_name, device, model_path=None):
self.model_name = model_name
self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
logger.info(f"Loading model {self.model_name} to {device}")
if model_path: # custom model # and config_path
config_path=os.path.join(model_path, 'config.json')
self.xtts = TTS(model_path=model_path, config_path=config_path).to(device)
else:
self.xtts = TTS(model_name=model_name).to(device)
def tts(self, text, speaker_wav, speed, language):
tf, file_path = tempfile.mkstemp(suffix='.wav')
tf, file_path = tempfile.mkstemp(suffix='.wav', prefix='openedai-speech-')
file_path = self.xtts.tts_to_file(
text=text,
language=language,
speaker_wav=speaker_wav,
speed=speed,
file_path=file_path,
)
try:
# TODO: support speaker= as voice id instead of just wav
file_path = self.xtts.tts_to_file(
text=text,
language=language,
speaker_wav=speaker_wav,
speed=speed,
file_path=file_path,
)
finally:
os.unlink(file_path)
os.unlink(file_path)
return tf
class parler_tts():
def __init__(self, model_name, device):
self.model_name = model_name
self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def tts(self, text, description):
input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
tf, file_path = tempfile.mkstemp(suffix='.wav')
sf.write(file_path, audio_arr, self.model.config.sampling_rate)
os.unlink(file_path)
return tf
def default_exists(filename: str):
if not os.path.exists(filename):
basename, ext = os.path.splitext(filename)
fpath, ext = os.path.splitext(filename)
basename = os.path.basename(fpath)
default = f"{basename}.default{ext}"
logger.info(f"{filename} does not exist, setting defaults from {default}")
with open(default, 'r') as from_file:
with open(filename, 'w') as to_file:
with open(default, 'r', encoding='utf8') as from_file:
with open(filename, 'w', encoding='utf8') as to_file:
to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server
@@ -97,14 +78,10 @@ def map_voice_to_speaker(voice: str, model: str):
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
try:
m = voice_map[model][voice]['model']
s = voice_map[model][voice]['speaker']
l = voice_map[model][voice].get('language', 'en')
return voice_map[model][voice]
except KeyError as e:
raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
return (m, s, l)
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
@@ -162,7 +139,15 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use piper for tts-1, and if xtts_device == none use for all models.
if model == 'tts-1' or args.xtts_device == 'none':
piper_model, speaker, not_used_language = map_voice_to_speaker(voice, 'tts-1')
voice_map = map_voice_to_speaker(voice, 'tts-1')
try:
piper_model = voice_map['model']
except KeyError as e:
raise ServiceUnavailableError(f"Configuration error: tts-1 voice '{voice}' is missing 'model:' setting. KeyError: {e}")
speaker = voice_map.get('speaker', None)
tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
if speaker:
tts_args.extend(["--speaker", str(speaker)])
@@ -177,7 +162,16 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use xtts for tts-1-hd
elif model == 'tts-1-hd':
tts_model, speaker, language = map_voice_to_speaker(voice, 'tts-1-hd')
voice_map = map_voice_to_speaker(voice, 'tts-1-hd')
try:
tts_model = voice_map['model']
speaker = voice_map['speaker']
except KeyError as e:
raise ServiceUnavailableError(f"Configuration error: tts-1-hd voice '{voice}' is missing setting. KeyError: {e}")
language = voice_map.get('language', 'en')
tts_model_path = voice_map.get('model_path', None)
if xtts is not None and xtts.model_name != tts_model:
import torch, gc
@@ -186,20 +180,9 @@
gc.collect()
torch.cuda.empty_cache()
if 'parler-tts' in tts_model and has_parler_tts:
if xtts is None:
xtts = parler_tts(tts_model, device=args.xtts_device)
ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
if speed != 1:
ffmpeg_args.extend(["-af", f"atempo={speed}"])
tts_io_out = xtts.tts(text=input_text, description=speaker)
else:
if xtts is None:
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path)
ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
@@ -221,13 +204,21 @@
return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
# We return 'mps' but currently XTTS will not work with mps devices as the cuda support is incomplete
def auto_torch_device():
try:
import torch
return 'cuda' if torch.cuda.is_available() else 'mps' if ( torch.backends.mps.is_available() and torch.backends.mps.is_built() ) else 'cpu'
except:
return 'none'
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--xtts_device', action='store', default=auto_torch_device(), help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
@@ -235,6 +226,9 @@ if __name__ == "__main__":
args = parser.parse_args()
default_exists('config/pre_process_map.yaml')
default_exists('config/voice_to_speaker.yaml')
logger.remove()
logger.add(sink=sys.stderr, level=args.log_level)
@@ -242,10 +236,7 @@
from TTS.api import TTS
if args.preload:
if 'parler-tts' in args.preload:
xtts = parler_tts(args.preload, device=args.xtts_device)
else:
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
app.register_model('tts-1')
app.register_model('tts-1-hd')


@@ -5,4 +5,4 @@ set /p < speech.env
call download_voices_tts-1.bat
call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %OPENEDAI_LOG_LEVEL:+--log-level %OPENEDAI_LOG_LEVEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %EXTRA_ARGS%


@@ -4,4 +4,4 @@
bash download_voices_tts-1.sh
python speech.py --xtts_device none ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL}
python speech.py --xtts_device none $EXTRA_ARGS $@


@@ -2,7 +2,9 @@
[ -f speech.env ] && . speech.env
echo "First startup may download 2GB of speech models. Please wait."
bash download_voices_tts-1.sh
bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL} $@
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@


@@ -48,9 +48,3 @@ tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts version
speaker: voices/me.wav # this could be you
parler:
model: parler-tts/parler_tts_mini_v0.1
speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
parler2:
model: parler-tts/parler_tts_mini_v0.1
speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.