This commit is contained in:
matatonic 2024-04-26 20:42:33 -04:00
parent a2a3d2b3eb
commit 6864cf03b1
16 changed files with 260 additions and 70 deletions

127
.github/workflows/build-docker.yml vendored Normal file
View File

@ -0,0 +1,127 @@
# Builds the full and the minimal (piper-only) images and publishes them to GHCR.
#   * run whose ref is the main branch -> pushes the `:latest` tag
#   * run whose ref is a tag           -> pushes `:<tag name>`
# Runs on any other ref (e.g. a manual dispatch from another branch) log in
# but skip both build steps, so nothing is pushed.
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  # Full image, built from `Dockerfile`.
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      # NOTE(review): TAG is not referenced by any step in this job.
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry with the workflow token.
      # This workflow has no pull_request trigger and the step has no `if:`,
      # so the login runs on every event.
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not
      # referenced below; tags and labels are set by hand in the build steps.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push `:latest` when the run's ref is the main branch
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

  # Minimal image, built from `Dockerfile.min` and published under a fixed name.
  build-and-push-alt-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-speech-min
      # NOTE(review): TAG is not referenced by any step in this job.
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry with the workflow token.
      # This workflow has no pull_request trigger and the step has no `if:`,
      # so the login runs on every event.
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not
      # referenced below; tags and labels are set by hand in the build steps.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push `:latest` when the run's ref is the main branch
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

4
.gitignore vendored
View File

@ -1,4 +1,8 @@
voices/
.env
speech.env
config/pre_process_map.yaml
config/voice_to_speaker.yaml
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -1,24 +1,17 @@
FROM python:3.11-slim
ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
COPY *.txt /app/
RUN pip install --no-cache -r requirements.txt
COPY *.sh /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
# A directory given as a COPY source has only its *contents* copied, so listing
# `config` next to the files would flatten it into /app/. speech.py opens
# config/pre_process_map.yaml and config/voice_to_speaker.yaml relative to the
# working directory, so copy the directory to its own destination.
COPY *.sh *.py *.yaml *.md LICENSE /app/
COPY config /app/config/
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND

View File

@ -3,15 +3,13 @@ FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl
RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app
RUN ./download_voices_tts-1.sh
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none
ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND

View File

@ -25,6 +25,12 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.10.0, 2024-04-26
* Better upgrades: Reorganize config files under config, voice models under voices
* * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml` you need to move them to the `config/` folder.**
* default listen host to 0.0.0.0
Version: 0.9.0, 2024-04-23
* Fix bug with yaml and loading UTF-8
@ -54,45 +60,47 @@ API Documentation
Installation instructions
-------------------------
You can run the server via docker like so (**recommended**):
1) Download the models & voices
```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
```
2a) Docker (**recommended**): You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min), you can edit the `docker-compose.yml` to easily change this.
Manual instructions:
2b) Manual instructions:
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg and curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
bash download_voices_tts-1.sh
# and for tts-1-hd
bash download_voices_tts-1-hd.sh
python speech.py
```
Usage
-----
```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
but cpu is fast enough (default: False)
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use
piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
```
Sample API Usage
@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.
```
$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
$ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
```
```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```
Custom Voices Howto
-------------------

View File

@ -2,7 +2,7 @@
some_other_voice_name_you_want:
model: voices/choose your own model.onnx
speaker: set your own speaker
alloy:
alloy:
model: voices/en_US-libritts_r-medium.onnx
speaker: 79 # 64, 79, 80, 101, 130
echo:
@ -24,7 +24,7 @@
model: voices/en_US-libritts_r-medium.onnx
speaker: 163
tts-1-hd:
alloy:
alloy:
model: xtts
speaker: voices/alloy-alt.wav
alloy-orig:

View File

@ -1,16 +1,17 @@
services:
server:
build:
context: .
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
image: ghcr.io/matatonic/openedai-speech
#image: ghcr.io/matatonic/openedai-speech-min
env_file: speech.env
ports:
- "8000:8000"
# volumes:
# - .:/app/
volumes:
- ./voices:/app/voices
- ./config:/app/config
#restart: unless-stopped # install as a service
# Below can be removed if not using GPU
runtime: nvidia
deploy:

View File

@ -1,4 +1,4 @@
#!/bin/sh
for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
# Only fetch samples that are not already present. An `if` is used instead of
# `[ ... ] && cmd` so that an already-existing file does not leave the loop,
# and therefore the script, with a non-zero exit status.
if [ ! -e "voices/$i.wav" ]; then
  curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
fi
done

View File

@ -1,4 +1,9 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
export TTS_HOME=voices
MODELS=${*:-xtts}
for model in $MODELS; do
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
done
./download_samples.sh

View File

@ -1,5 +1,5 @@
#!/bin/sh
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high
models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null

View File

@ -3,8 +3,8 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
class OpenAIStub(FastAPI):
def __init__(self) -> None:
super().__init__()
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.models = {}
self.add_middleware(

View File

@ -1,7 +1,7 @@
fastapi
uvicorn
# piper-tts
piper-tts
piper-tts==1.2.0
onnxruntime-gpu
# xtts
TTS

6
sample.env Normal file
View File

@ -0,0 +1,6 @@
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only

52
say.py
View File

@ -2,6 +2,7 @@
import sys
import os
import atexit
import tempfile
import argparse
@ -20,19 +21,23 @@ import openai
def parse_args(argv):
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"])
parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"])
parser.add_argument("-s", "--speed", type=float, default=1.0)
parser.add_argument("-i", "--input", type=str)
parser = argparse.ArgumentParser(
description='Text to speech using the OpenAI API',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("-m", "--model", type=str, default="tts-1", help="The model to use")#, choices=["tts-1", "tts-1-hd"])
parser.add_argument("-v", "--voice", type=str, default="alloy", help="The voice of the speaker")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"], help="The output audio format")
parser.add_argument("-s", "--speed", type=float, default=1.0, help="playback speed, 0.25-4.0")
parser.add_argument("-t", "--text", type=str, default=None, help="Provide text to read on the command line")
parser.add_argument("-i", "--input", type=str, default=None, help="Read text from a file (default is to read from stdin)")
if playsound is None:
parser.add_argument("-o", "--output", type=str) # required
parser.add_argument("-o", "--output", type=str, help="The filename to save the output to") # required
parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
else:
parser.add_argument("-o", "--output", type=str, default=None) # not required
parser.add_argument("-p", "--playsound", action="store_true")
parser.add_argument("-o", "--output", type=str, default=None, help="The filename to save the output to") # not required
parser.add_argument("-p", "--playsound", action="store_true", help="Play the audio")
args = parser.parse_args(argv)
@ -50,6 +55,17 @@ if __name__ == "__main__":
print("Must select one of playsound (-p) or output file name (-o)")
sys.exit(1)
# Work out the text to speak: stdin when neither option was given, otherwise
# -t/--text takes precedence over -i/--input.
if args.input is None and args.text is None:
    text = sys.stdin.read()
elif args.text:
    text = args.text
elif args.input:
    if os.path.exists(args.input):
        with open(args.input, 'r') as f:
            text = f.read()
    else:
        # -i used to carry the text itself; keep accepting that usage.
        print(f"Warning! File not found: {args.input}\nFalling back to old behavior for -i")
        text = args.input
else:
    # Reached when -t and/or -i were passed as empty strings. Without this
    # branch `text` would be left unbound and fail later with a NameError.
    print("No text to read: -t/--text and -i/--input are empty")
    sys.exit(1)
client = openai.OpenAI(
# This part is not needed if you set these environment variables before import openai
@ -60,21 +76,21 @@ if __name__ == "__main__":
)
if args.playsound and args.output is None:
tf, args.output = file_path = tempfile.mkstemp(suffix='.wav')
else:
tf = None
_, args.output = tempfile.mkstemp(suffix='.wav')
def cleanup():
os.unlink(args.output)
atexit.register(cleanup)
with client.audio.speech.with_streaming_response.create(
model=args.model,
voice=args.voice,
speed=args.speed,
response_format=args.format,
input=args.input,
input=text,
) as response:
response.stream_to_file(args.output)
if args.playsound:
playsound(args.output)
if tf:
os.unlink(args.output)
if args.playsound:
playsound(args.output)

View File

@ -65,9 +65,18 @@ class parler_tts():
return tf
def default_exists(filename: str):
    """Create `filename` from its `.default` template if it does not exist yet.

    The template path is `filename` with `.default` inserted before the
    extension, e.g. `config/voice_to_speaker.yaml` is seeded from
    `config/voice_to_speaker.default.yaml`. An existing `filename` is never
    touched. If the template cannot be opened, the error from `open()`
    propagates and `filename` is not created.
    """
    if os.path.exists(filename):
        return

    basename, ext = os.path.splitext(filename)
    default = f"{basename}.default{ext}"

    # Copy the bytes verbatim. The callers read these files back as utf8, so a
    # text-mode copy through the platform's default encoding could fail or
    # alter non-ASCII content.
    with open(default, 'rb') as from_file, open(filename, 'wb') as to_file:
        to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server
def preprocess(raw_input):
with open('pre_process_map.yaml', 'r', encoding='utf8') as file:
default_exists('config/pre_process_map.yaml')
with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
pre_process_map = yaml.safe_load(file)
for a, b in pre_process_map:
raw_input = re.sub(a, b, raw_input)
@ -75,9 +84,10 @@ def preprocess(raw_input):
# Read voice map on demand so it can be changed without restarting the server
def map_voice_to_speaker(voice: str, model: str):
with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
default_exists('config/voice_to_speaker.yaml')
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'])
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
@ -197,7 +207,7 @@ if __name__ == "__main__":
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
args = parser.parse_args()