This commit is contained in:
matatonic 2024-04-26 20:42:33 -04:00
parent a2a3d2b3eb
commit 6864cf03b1
16 changed files with 260 additions and 70 deletions

127
.github/workflows/build-docker.yml vendored Normal file
View File

@ -0,0 +1,127 @@
# Builds and publishes two Docker images to GHCR:
#   - build-and-push-image:     full image (Dockerfile, xtts support)
#   - build-and-push-alt-image: minimal piper-only image (Dockerfile.min)
# Triggered manually, on pushes to main (-> :latest), and on published
# releases / tag pushes (-> :<tag>).
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): the `meta` outputs (steps.meta.outputs.tags/labels) are
      # never consumed below — tags/labels are written by hand instead.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

  build-and-push-alt-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-speech-min
      TAG: ${{ github.sha }}
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): outputs unused here as well — see note in the first job.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

4
.gitignore vendored
View File

@ -1,4 +1,8 @@
voices/ voices/
.env
speech.env
config/pre_process_map.yaml
config/voice_to_speaker.yaml
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

View File

@ -1,24 +1,17 @@
FROM python:3.11-slim FROM python:3.11-slim
ENV COQUI_TOS_AGREED=1 ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
RUN apt-get update && \ RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg apt-get install --no-install-recommends -y curl git ffmpeg
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app WORKDIR /app
COPY *.txt /app/ COPY *.txt /app/
RUN pip install --no-cache -r requirements.txt RUN pip install --no-cache -r requirements.txt
COPY *.sh /app/ COPY *.sh *.py *.yaml *.md LICENSE config /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
RUN apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND

View File

@ -3,15 +3,13 @@ FROM python:3.11-slim
RUN apt-get update && \ RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl apt-get install --no-install-recommends -y ffmpeg curl
RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/ COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app WORKDIR /app
RUN ./download_voices_tts-1.sh
RUN apt-get clean && rm -rf /var/lib/apt/lists/* RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND

View File

@ -25,6 +25,12 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults. If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.10.0, 2024-04-26
* Better upgrades: Reorganize config files under config, voice models under voices
* * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml` you need to move them to the `config/` folder.**
* default listen host to 0.0.0.0
Version: 0.9.0, 2024-04-23 Version: 0.9.0, 2024-04-23
* Fix bug with yaml and loading UTF-8 * Fix bug with yaml and loading UTF-8
@ -54,45 +60,47 @@ API Documentation
Installation instructions Installation instructions
------------------------- -------------------------
You can run the server via docker like so (**recommended**): 1) Download the models & voices
```shell ```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
```
2a) Docker (**recommended**): You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up docker compose up
``` ```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this. If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
Manual instructions: 2b) Manual instructions:
```shell ```shell
# Install the Python requirements # Install the Python requirements
pip install -r requirements.txt pip install -r requirements.txt
# install ffmpeg and curl # install ffmpeg and curl
sudo apt install ffmpeg curl sudo apt install ffmpeg curl
# Download the voice models: python speech.py
# for tts-1
bash download_voices_tts-1.sh
# and for tts-1-hd
bash download_voices_tts-1-hd.sh
``` ```
Usage Usage
----- -----
``` ```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
[-H HOST]
OpenedAI Speech API Server OpenedAI Speech API Server
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, --piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE --xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
piper for all models. (default: cuda) --preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000) -P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost) -H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
``` ```
Sample API Usage Sample API Usage
@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API. Also see the `say.py` sample application for an example of how to use the openai-python API.
``` ```
$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound' $ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file. $ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
``` ```
```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```
Custom Voices Howto Custom Voices Howto
------------------- -------------------

View File

@ -2,7 +2,7 @@
some_other_voice_name_you_want: some_other_voice_name_you_want:
model: voices/choose your own model.onnx model: voices/choose your own model.onnx
speaker: set your own speaker speaker: set your own speaker
alloy: alloy:
model: voices/en_US-libritts_r-medium.onnx model: voices/en_US-libritts_r-medium.onnx
speaker: 79 # 64, 79, 80, 101, 130 speaker: 79 # 64, 79, 80, 101, 130
echo: echo:
@ -24,7 +24,7 @@
model: voices/en_US-libritts_r-medium.onnx model: voices/en_US-libritts_r-medium.onnx
speaker: 163 speaker: 163
tts-1-hd: tts-1-hd:
alloy: alloy:
model: xtts model: xtts
speaker: voices/alloy-alt.wav speaker: voices/alloy-alt.wav
alloy-orig: alloy-orig:

View File

@ -1,16 +1,17 @@
services: services:
server: server:
build: build:
context: .
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"] image: ghcr.io/matatonic/openedai-speech
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"] #image: ghcr.io/matatonic/openedai-speech-min
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min env_file: speech.env
ports: ports:
- "8000:8000" - "8000:8000"
# volumes: volumes:
# - .:/app/ - ./voices:/app/voices
- ./config:/app/config
#restart: unless-stopped # install as a service
# Below can be removed if not using GPU # Below can be removed if not using GPU
runtime: nvidia runtime: nvidia
deploy: deploy:

View File

@ -1,4 +1,4 @@
#!/bin/sh #!/bin/sh
for i in alloy echo fable onyx nova shimmer; do for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav [ ! -e "voices/$i.wav" ] && curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
done done

View File

@ -1,4 +1,9 @@
#!/bin/sh #!/bin/sh
export COQUI_TOS_AGREED=1 export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')" export TTS_HOME=voices
MODELS=${*:-xtts}
for model in $MODELS; do
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
done
./download_samples.sh ./download_samples.sh

View File

@ -1,5 +1,5 @@
#!/bin/sh #!/bin/sh
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null

View File

@ -3,8 +3,8 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse from fastapi.responses import PlainTextResponse
class OpenAIStub(FastAPI): class OpenAIStub(FastAPI):
def __init__(self) -> None: def __init__(self, **kwargs) -> None:
super().__init__() super().__init__(**kwargs)
self.models = {} self.models = {}
self.add_middleware( self.add_middleware(

View File

@ -1,7 +1,7 @@
fastapi fastapi
uvicorn uvicorn
# piper-tts # piper-tts
piper-tts piper-tts==1.2.0
onnxruntime-gpu onnxruntime-gpu
# xtts # xtts
TTS TTS

6
sample.env Normal file
View File

@ -0,0 +1,6 @@
# Sample environment file — copy to speech.env and edit as needed.
# Point TTS (Coqui) and HuggingFace model caches at the voices/ directory
# (matches `export TTS_HOME=voices` in the download script).
TTS_HOME=voices
HF_HOME=voices
# Optional: model to preload at startup; referenced by CLI_COMMAND below.
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
# Optional: override the container's default start command.
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only

52
say.py
View File

@ -2,6 +2,7 @@
import sys import sys
import os import os
import atexit
import tempfile import tempfile
import argparse import argparse
@ -20,19 +21,23 @@ import openai
def parse_args(argv): def parse_args(argv):
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser(
parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"]) description='Text to speech using the OpenAI API',
parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"]) formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"]) )
parser.add_argument("-s", "--speed", type=float, default=1.0) parser.add_argument("-m", "--model", type=str, default="tts-1", help="The model to use")#, choices=["tts-1", "tts-1-hd"])
parser.add_argument("-i", "--input", type=str) parser.add_argument("-v", "--voice", type=str, default="alloy", help="The voice of the speaker")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"], help="The output audio format")
parser.add_argument("-s", "--speed", type=float, default=1.0, help="playback speed, 0.25-4.0")
parser.add_argument("-t", "--text", type=str, default=None, help="Provide text to read on the command line")
parser.add_argument("-i", "--input", type=str, default=None, help="Read text from a file (default is to read from stdin)")
if playsound is None: if playsound is None:
parser.add_argument("-o", "--output", type=str) # required parser.add_argument("-o", "--output", type=str, help="The filename to save the output to") # required
parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound") parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
else: else:
parser.add_argument("-o", "--output", type=str, default=None) # not required parser.add_argument("-o", "--output", type=str, default=None, help="The filename to save the output to") # not required
parser.add_argument("-p", "--playsound", action="store_true") parser.add_argument("-p", "--playsound", action="store_true", help="Play the audio")
args = parser.parse_args(argv) args = parser.parse_args(argv)
@ -50,6 +55,17 @@ if __name__ == "__main__":
print("Must select one of playsound (-p) or output file name (-o)") print("Must select one of playsound (-p) or output file name (-o)")
sys.exit(1) sys.exit(1)
if args.input is None and args.text is None:
text = sys.stdin.read()
elif args.text:
text = args.text
elif args.input:
if os.path.exists(args.input):
with open(args.input, 'r') as f:
text = f.read()
else:
print(f"Warning! File not found: {args.input}\nFalling back to old behavior for -i")
text = args.input
client = openai.OpenAI( client = openai.OpenAI(
# This part is not needed if you set these environment variables before import openai # This part is not needed if you set these environment variables before import openai
@ -60,21 +76,21 @@ if __name__ == "__main__":
) )
if args.playsound and args.output is None: if args.playsound and args.output is None:
tf, args.output = file_path = tempfile.mkstemp(suffix='.wav') _, args.output = tempfile.mkstemp(suffix='.wav')
else:
tf = None def cleanup():
os.unlink(args.output)
atexit.register(cleanup)
with client.audio.speech.with_streaming_response.create( with client.audio.speech.with_streaming_response.create(
model=args.model, model=args.model,
voice=args.voice, voice=args.voice,
speed=args.speed, speed=args.speed,
response_format=args.format, response_format=args.format,
input=args.input, input=text,
) as response: ) as response:
response.stream_to_file(args.output) response.stream_to_file(args.output)
if args.playsound: if args.playsound:
playsound(args.output) playsound(args.output)
if tf:
os.unlink(args.output)

View File

@ -65,9 +65,18 @@ class parler_tts():
return tf return tf
def default_exists(filename: str):
if not os.path.exists(filename):
basename, ext = os.path.splitext(filename)
default = f"{basename}.default{ext}"
with open(default, 'r') as from_file:
with open(filename, 'w') as to_file:
to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server # Read pre process map on demand so it can be changed without restarting the server
def preprocess(raw_input): def preprocess(raw_input):
with open('pre_process_map.yaml', 'r', encoding='utf8') as file: default_exists('config/pre_process_map.yaml')
with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
pre_process_map = yaml.safe_load(file) pre_process_map = yaml.safe_load(file)
for a, b in pre_process_map: for a, b in pre_process_map:
raw_input = re.sub(a, b, raw_input) raw_input = re.sub(a, b, raw_input)
@ -75,9 +84,10 @@ def preprocess(raw_input):
# Read voice map on demand so it can be changed without restarting the server # Read voice map on demand so it can be changed without restarting the server
def map_voice_to_speaker(voice: str, model: str): def map_voice_to_speaker(voice: str, model: str):
with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file: default_exists('config/voice_to_speaker.yaml')
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file) voice_map = yaml.safe_load(file)
return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'])
class GenerateSpeechRequest(BaseModel): class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd" model: str = "tts-1" # or "tts-1-hd"
@ -197,7 +207,7 @@ if __name__ == "__main__":
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.") parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.") parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port") parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0") parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
args = parser.parse_args() args = parser.parse_args()