0.11.0 - Multilingual, new startup & dockerfiles, Fixes: #5, #6, #8, #9

matatonic 2024-05-29 17:01:11 -04:00
parent 676f3f38c8
commit 2fcb7cef0f
17 changed files with 192 additions and 79 deletions

@ -77,7 +77,7 @@ jobs:
env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: matatonic/openedai-speech-min
IMAGE_NAME: ${{ github.repository }}-min
TAG: ${{ github.sha }}
steps:

@ -1,17 +1,22 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg git && \
apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN mkdir -p voices config
COPY requirements.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
ARG PRELOAD_MODEL
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV COQUI_TOS_AGREED=1
RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg
RUN mkdir -p /app/voices
WORKDIR /app
COPY *.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
COPY *.sh *.py *.yaml *.md LICENSE config /app/
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND
CMD bash startup.sh

@ -1,15 +1,19 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl
apt-get install --no-install-recommends -y curl ffmpeg && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app
RUN mkdir -p voices config
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn
ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
ENV TTS_HOME=voices
ENV HF_HOME=voices
CMD bash startup.min.sh

README.md

@ -14,17 +14,26 @@ Full Compatibility:
* speed 0.25-4.0 (and more)
Details:
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Can be run without TTS/xtts_v2, entirely on cpu
* Custom cloned voices can be used for tts-1-hd, just save a WAV file in the `/voices/` directory
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via the `voice_to_speaker.yaml` configuration file
* Model `tts-1` via [piper tts](https://github.com/rhasspy/piper) (very fast, runs on cpu)
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) via the `voice_to_speaker.yaml` configuration file
* Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
* 🌐 [Multilingual](#multilingual) support with XTTS voices
* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml`
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
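The `pre_process_map.yaml` shown later in this diff stores `[regex, replacement]` pairs. A minimal sketch of how such pairs can be applied to input text (the `apply_pre_process` helper is illustrative, not the server's actual code):

```python
import re

# Illustrative helper: apply [pattern, replacement] pairs to the input
# text in order -- the same shape as config/pre_process_map.yaml entries.
def apply_pre_process(text, pairs):
    for pattern, replacement in pairs:
        text = re.sub(pattern, replacement, text)
    return text

# Two pairs mirroring entries from the default config.
pairs = [
    [r'([0-9]+)-([0-9]+)', r'\1 to \2'],  # speak number ranges as "X to Y"
    [r'\*\*\*', '*'],
]
print(apply_pre_process('pages 3-5 ***important***', pairs))
# → pages 3 to 5 *important*
```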
## Recent Changes
Version 0.11.0, 2024-05-29
* 🌐 [Multilingual](#multilingual) support (16 languages) with XTTS
* Remove high Unicode filtering from the default `config/pre_process_map.yaml`
* Update Docker build & app startup. thanks @justinh-rahb
* Fix: "Plan failed with a cudnnException"
* Remove piper cuda support
Version: 0.10.1, 2024-05-05
* Remove `runtime: nvidia` from docker-compose.yml; this assumes an nvidia/cuda compatible runtime is available by default. thanks @jmtatsch
@ -53,59 +62,45 @@ Version: 0.7.3, 2024-03-20
## Installation instructions
1) Download the models & voices
```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
1) Copy the `sample.env` to `speech.env` (customize if needed)
```bash
cp sample.env speech.env
```
If you have different models which you want to use, both of the download scripts accept arguments for which models to download.
2. Option: Docker (**recommended**) (prebuilt images are available)
Example:
Run the server:
```shell
# Download en_US-ryan-high too
bash download_voices_tts-1.sh en_US-libritts_r-medium en_GB-northern_english_male-medium en_US-ryan-high
# Download xtts (latest) and xtts_v2.0.2
bash download_voices_tts-1-hd.sh xtts xtts_v2.0.2
```
2a) Option 1: Docker (**recommended**) (prebuilt images are available)
You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up
```
If you want a minimal docker image with piper support only (<1GB vs. 8GB, see: Dockerfile.min), you can edit the `docker-compose.yml` to easily change this.
To install the docker image as a service, edit the `docker-compose.yml` and uncomment `restart: unless-stopped`, then start the service with: `docker compose up -d`.
For a minimal docker image with only piper support (<1GB vs. 8GB), use `docker compose -f docker-compose.min.yml up`
To install the docker image as a service, edit the `docker-compose.yml` and uncomment `restart: unless-stopped`, then start the service with: `docker compose up -d`
2b) Option 2: Manual instructions:
2. Option: Manual installation:
```shell
# install ffmpeg and curl
sudo apt install ffmpeg curl
# Create & activate a new virtual environment
# install curl and ffmpeg
sudo apt install curl ffmpeg
# Create & activate a new virtual environment (optional but recommended)
python -m venv .venv
source .venv/bin/activate
# Install the Python requirements
pip install -r requirements.txt
# run the server
python speech.py
startup.sh
```
## Usage
```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
@ -194,9 +189,34 @@ options:
## Custom Voices Howto
Custom voices should be mono 22050 Hz sample rate WAV files with low noise (no background music, etc.) and should not contain any partial words. Sample voices for xtts should be at least 6 seconds long, but they can be longer. However, longer samples do not always produce better results.
### Piper
You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
1. Select the piper voice and model from the [piper samples](https://rhasspy.github.io/piper-samples/)
2. Update the `config/voice_to_speaker.yaml` with a new section for the voice, for example:
```yaml
...
tts-1:
ryan:
model: voices/en_US-ryan-high.onnx
speaker: # default speaker
```
3. New models will be downloaded as needed, or you can download them in advance with `download_voices_tts-1.sh`. For example:
```shell
bash download_voices_tts-1.sh en_US-ryan-high
```
### Coqui XTTS v2
Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio. To create a custom voice clone, you must prepare a WAV file sample of the voice.
#### Guidelines for preparing good sample files for Coqui XTTS v2
* Mono (single channel) 22050 Hz WAV file
* 6-30 seconds long - longer isn't always better (I've had some good results with as little as 4 seconds)
* low noise (no hiss or hum)
* No partial words, breathing, music, or background sounds
* An even speaking pace with a variety of words is best, like in interviews or audiobooks.
You can use FFmpeg to prepare your audio files, here are some examples:
```shell
# convert a multi-channel audio file to mono, set the sample rate to 22050 Hz, trim to 6 seconds, and output as a WAV file.
@ -207,7 +227,7 @@ ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:
ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
```
Once your WAV file is prepared, save it in the `/voices/` directory and update the `voice_to_speaker.yaml` file with the new file name.
Once your WAV file is prepared, save it in the `/voices/` directory and update the `config/voice_to_speaker.yaml` file with the new file name.
For example:
@ -218,3 +238,33 @@ tts-1-hd:
model: xtts_v2.0.2 # you can specify different xtts versions
speaker: voices/me.wav # this could be you
```
## Multilingual
Multilingual support was added in version 0.11.0 and is available only with the XTTS v2 model.
Coqui XTTSv2 has support for 16 languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Japanese (`ja`), Hungarian (`hu`) and Korean (`ko`).
Unfortunately, the OpenAI API does not support a language parameter, but you can create your own custom speaker voice and set the language for it.
1) Create the WAV file for your speaker, as in [Custom Voices Howto](#custom-voices-howto)
2) Add the voice to `config/voice_to_speaker.yaml` and include the correct Coqui `language` code for the speaker. For example:
```yaml
xunjiang:
model: xtts
speaker: voices/xunjiang.wav
language: zh-cn
```
3) Make sure your `config/pre_process_map.yaml` is not filtering out high Unicode characters! If it contains the following lines, remove them:
```yaml
- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
- ''
```
These lines were included by default in the `config/pre_process_map.yaml` config file before version 0.11.0.
4) Your new multi-lingual speaker voice is ready to use!
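As the diff to `speech.py` below shows, `map_voice_to_speaker` now returns a third value, the voice's `language`, defaulting to `en` when the key is absent. A minimal sketch of that lookup against an in-memory map (YAML loading elided):

```python
# Sketch of the three-value lookup in speech.py: voices without an
# explicit `language` key fall back to English via .get('language', 'en').
voice_map = {
    'tts-1-hd': {
        'xunjiang': {'model': 'xtts', 'speaker': 'voices/xunjiang.wav',
                     'language': 'zh-cn'},
        'me': {'model': 'xtts_v2.0.2', 'speaker': 'voices/me.wav'},
    }
}

def map_voice_to_speaker(voice, model):
    entry = voice_map[model][voice]
    return entry['model'], entry['speaker'], entry.get('language', 'en')

print(map_voice_to_speaker('xunjiang', 'tts-1-hd'))
# → ('xtts', 'voices/xunjiang.wav', 'zh-cn')
print(map_voice_to_speaker('me', 'tts-1-hd'))
# → ('xtts_v2.0.2', 'voices/me.wav', 'en')
```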

@ -31,8 +31,6 @@
- ' F.Y. '
- - ([0-9]+)-([0-9]+)
- \1 to \2
- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
- ''
- - '\*\*\*'
- '*'
- - '\*\*'

docker-compose.min.yml Normal file

@ -0,0 +1,13 @@
services:
server:
build:
dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
image: ghcr.io/matatonic/openedai-speech-min
env_file: speech.env
ports:
- "8000:8000"
volumes:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
#restart: unless-stopped

@ -1,10 +1,8 @@
services:
server:
build:
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~8GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
dockerfile: Dockerfile
image: ghcr.io/matatonic/openedai-speech
#image: ghcr.io/matatonic/openedai-speech-min
env_file: speech.env
ports:
- "8000:8000"
@ -15,7 +13,6 @@ services:
#restart: unless-stopped
# Set nvidia runtime if it's not the default
#runtime: nvidia
# The deploy section can be removed if not using GPU
deploy:
resources:
reservations:

download_samples.bat Normal file

@ -0,0 +1,6 @@
@echo off
for %%i in (alloy echo fable onyx nova shimmer) do (
if not exist "voices\%%i.wav" (
curl -s https://cdn.openai.com/API/docs/audio/%%i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices\%%i.wav
)
)

@ -0,0 +1,11 @@
@echo off
set COQUI_TOS_AGREED=1
set TTS_HOME=voices
set MODELS=%*
if "%MODELS%" == "" set MODELS=xtts
for %%i in (%MODELS%) do (
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('%%i')"
)
call download_samples.bat

@ -0,0 +1,8 @@
@echo off
set models=%*
if "%models%" == "" set models=en_GB-northern_english_male-medium en_US-libritts_r-medium
piper --update-voices --data-dir voices --download-dir voices --model x 2> nul
for %%i in (%models%) do (
if not exist "voices\%%i.onnx" piper --data-dir voices --download-dir voices --model %%i > nul
)

@ -2,5 +2,5 @@
models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
[ ! -e "voices/$i.onnx" ] && piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
done

@ -2,9 +2,11 @@ fastapi
uvicorn
# piper-tts
piper-tts==1.2.0
onnxruntime-gpu
# xtts
TTS
# Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2
# XXX, 3.8+ has some issue for now
spacy==3.7.4
# parler-tts

@ -2,5 +2,4 @@ TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only
#PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1

@ -32,12 +32,12 @@ class xtts_wrapper():
self.model_name = model_name
self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
def tts(self, text, speaker_wav, speed):
def tts(self, text, speaker_wav, speed, language):
tf, file_path = tempfile.mkstemp(suffix='.wav')
file_path = self.xtts.tts_to_file(
text,
language='en',
text=text,
language=language,
speaker_wav=speaker_wav,
speed=speed,
file_path=file_path,
@ -87,7 +87,7 @@ def map_voice_to_speaker(voice: str, model: str):
default_exists('config/voice_to_speaker.yaml')
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'])
return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], voice_map[model][voice].get('language', 'en'))
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
@ -138,10 +138,8 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use piper for tts-1, and if xtts_device == none use for all models.
if model == 'tts-1' or args.xtts_device == 'none':
piper_model, speaker = map_voice_to_speaker(voice, 'tts-1')
piper_model, speaker, not_used_language = map_voice_to_speaker(voice, 'tts-1')
tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
if args.piper_cuda:
tts_args.extend(["--cuda"])
if speaker:
tts_args.extend(["--speaker", str(speaker)])
if speed != 1.0:
@ -155,7 +153,7 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use xtts for tts-1-hd
elif model == 'tts-1-hd':
tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
tts_model, speaker, language = map_voice_to_speaker(voice, 'tts-1-hd')
if xtts is not None and xtts.model_name != tts_model:
import torch, gc
@ -189,7 +187,7 @@ async def generate_speech(request: GenerateSpeechRequest):
ffmpeg_args.extend(["-af", f"atempo={speed}"])
speed = 1.0
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed, language=language)
# Pipe the output from piper/xtts to the input of ffmpeg
ffmpeg_args.extend(["-"])
@ -203,7 +201,6 @@ if __name__ == "__main__":
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")

startup.bat Normal file

@ -0,0 +1,8 @@
@echo off
rem load KEY=VALUE pairs from speech.env, skipping # comment lines
for /f "usebackq eol=# delims=" %%i in ("speech.env") do set "%%i"
call download_voices_tts-1.bat
call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
if defined PRELOAD_MODEL (python speech.py --preload %PRELOAD_MODEL%) else (python speech.py)

startup.min.sh Executable file

@ -0,0 +1,7 @@
#!/bin/bash
[ -f speech.env ] && . speech.env
bash download_voices_tts-1.sh
python speech.py --xtts_device none

startup.sh Executable file

@ -0,0 +1,8 @@
#!/bin/bash
[ -f speech.env ] && . speech.env
bash download_voices_tts-1.sh
bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $@
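The `${PRELOAD_MODEL:+--preload $PRELOAD_MODEL}` form used above is bash's "alternate value" expansion: it produces the flag pair only when the variable is set and non-empty, so an empty `--preload` is never passed. A quick illustration:

```shell
#!/bin/bash
# ${VAR:+word} expands to `word` only if VAR is set and non-empty.
PRELOAD_MODEL=xtts
echo "args:${PRELOAD_MODEL:+ --preload $PRELOAD_MODEL}"
# → args: --preload xtts

unset PRELOAD_MODEL
echo "args:${PRELOAD_MODEL:+ --preload $PRELOAD_MODEL}"
# → args:
```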