prebuilt docker, cleanup and docs.

This commit is contained in:
matatonic 2024-03-23 14:52:51 -04:00
parent b0a6072047
commit 98d1968af6
7 changed files with 197 additions and 50 deletions

90
.github/workflows/build-docker.yml vendored Normal file
View File

@ -0,0 +1,90 @@
name: Build and Publish Docker Image
on:
workflow_dispatch:
push:
branches:
- 'main'
release:
types: [published]
jobs:
build-and-push-image:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
env:
# Set up environment variables for the job
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
TAG: ${{ github.sha }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true
# Log in to the GitHub Container Registry using the workflow's GITHUB_TOKEN
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
# Build and push the minimal (piper-only) Docker image to GHCR for the main branch
- name: Build and Push Docker.min Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.min
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:min
labels: version=${{ github.run_id }}
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
# For tagged releases, build and push the minimal Docker image with the corresponding "-min" tag
- name: Build and Push Docker.min Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile.min
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}-min
labels: version=${{ github.run_id }}

View File

@ -1,11 +1,10 @@
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
FROM ubuntu:22.04
ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
apt-get clean && rm -rf /var/lib/apt/lists/*
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
@ -19,4 +18,6 @@ RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
CMD python main.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL

View File

@ -1,8 +1,7 @@
FROM ubuntu:22.04
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
apt-get clean && rm -rf /var/lib/apt/lists/*
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn
RUN pip install piper-tts
@ -13,4 +12,6 @@ WORKDIR /app
RUN ./download_voices_tts-1.sh
CMD python main.py --host 0.0.0.0 --port 8000 --xtts_device none
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none

View File

@ -5,8 +5,7 @@ An OpenAI API compatible text to speech server.
* Compatible with the OpenAI audio/speech API
* Serves the [/v1/audio/speech endpoint](https://platform.openai.com/docs/api-reference/audio/createSpeech)
* Does not connect to the OpenAI API and does not require an OpenAI API Key
* Not affiliated with OpenAI in any way
* Not affiliated with OpenAI in any way, does not require an OpenAI API Key
* A free, private, text-to-speech server with custom voice cloning
Full Compatibility:
@ -17,14 +16,18 @@ Full Compatibility:
Details:
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, uses almost 4GB GPU VRAM)
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Can be run without TTS/xtts_v2, entirely on cpu
* Custom cloned voices can be used for tts-1-hd, just save a WAV file in `/voices/`
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
* Sometimes certain words or symbols will sound bad, you can fix them with regex via `pre_process_map.yaml`
* Custom cloned voices can be used for tts-1-hd, just save a WAV file in the `/voices/` directory
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via the `voice_to_speaker.yaml` configuration file
* Occasionally, certain words or symbols may sound incorrect, you can fix them with regex via `pre_process_map.yaml`
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.8.0, 2024-03-23
* Pre-built docker images for :latest, :min and release versions
Version: 0.7.3, 2024-03-20
* Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
@ -41,10 +44,17 @@ API Documentation
Installation instructions
-------------------------
You can run the server via docker like so (**recommended**):
```shell
docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB), see: Dockerfile.min. You can edit the `docker-compose.yml` to easily change this.
Manual instructions:
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg & curl
# install ffmpeg and curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
@ -57,8 +67,8 @@ Usage
-----
```
usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
OpenedAI Speech API Server
@ -118,21 +128,10 @@ with client.audio.speech.with_streaming_response.create(
response.stream_to_file("speech.mp3")
```
Docker support
--------------
You can run the server via docker like so:
```shell
docker compose build
docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
Custom Voices Howto
-------------------
Custom voices should be mono 22050 hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds, but can be longer, but longer doesn't always produce better results.
Custom voices should be mono 22050 hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds long, but they can be longer. However, longer samples do not always produce better results.
You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
@ -141,11 +140,11 @@ You can use FFmpeg to process your audio files and prepare them for xtts, here a
ffmpeg -i input.mp3 -ac 1 -ar 22050 -t 6 -y me.wav
# use a simple noise filter to clean up audio, and select a start time for sampling.
ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
# A more complex noise reduction setup with volume adjustment
# A more complex noise reduction setup, including volume adjustment
ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
```
Once you WAV file is prepared, save it in the voices/ folder and update the `voice_to_speaker.yaml` file with the new file name.
Once your WAV file is prepared, save it in the `/voices/` directory and update the `voice_to_speaker.yaml` file with the new file name.
For example:

View File

@ -1,12 +1,12 @@
version: "3.3"
services:
server:
build:
context: .
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
stdin_open: true
tty: true
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
image: ghcr.io/matatonic/openedai-speech # :latest, :min, :<version>, :<version>-min
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
ports:
- "8000:8000"
# volumes:

66
openedai.py Normal file
View File

@ -0,0 +1,66 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
class OpenAIStub(FastAPI):
    """FastAPI application stub providing the common OpenAI-compatible
    boilerplate endpoints (root/health probes, model list/info, dummy
    billing usage) with CORS open to all origins.

    Servers subclass or instantiate this and call ``register_model`` for
    each model they serve; the root and ``/health`` endpoints report
    readiness based on whether any model is registered.
    """

    def __init__(self) -> None:
        super().__init__()
        # Maps a public model name to the backing model identifier
        # (or to itself when registered without an alias).
        self.models: dict = {}

        self.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"]
        )

        @self.get('/v1/billing/usage')
        @self.get('/v1/dashboard/billing/usage')
        async def handle_billing_usage():
            # Dummy billing endpoint for clients that poll usage.
            return { 'total_usage': 0 }

        @self.get("/", response_class=PlainTextResponse)
        @self.head("/", response_class=PlainTextResponse)
        @self.options("/", response_class=PlainTextResponse)
        async def root():
            # Readiness probe: 503 until at least one model is registered.
            return PlainTextResponse(content="", status_code=200 if self.models else 503)

        @self.get("/health")
        async def health():
            return {"status": "ok" if self.models else "unk" }

        @self.get("/v1/models")
        async def get_model_list():
            return self.model_list()

        @self.get("/v1/models/{model}")
        async def get_model_info(model: str):
            # BUGFIX: the parameter must be named `model` to match the
            # `{model}` path placeholder; it was previously `model_id`,
            # which FastAPI treated as a required query parameter, so the
            # path value was never bound and requests returned 422.
            return self.model_info(model)

    def register_model(self, name: str, model: str = None) -> None:
        """Register *name* as a served model, optionally aliased to *model*."""
        self.models[name] = model if model else name

    def deregister_model(self, name: str) -> None:
        """Remove *name* from the served models; a no-op when absent."""
        if name in self.models:
            del self.models[name]

    def model_info(self, model: str) -> dict:
        """Return an OpenAI-style model record for *model*."""
        result = {
            "id": model,
            "object": "model",
            "created": 0,
            "owned_by": "user"
        }
        return result

    def model_list(self) -> dict:
        """Return an OpenAI-style model list covering both registered
        names and their backing model identifiers (deduplicated)."""
        if not self.models:
            return {}

        result = {
            "object": "list",
            "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ]
        }

        return result

View File

@ -12,9 +12,11 @@ from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from pydantic import BaseModel
import openedai
xtts = None
args = None
app = FastAPI()
app = openedai.OpenAIStub()
class xtts_wrapper():
def __init__(self, model_name, device):
@ -50,20 +52,6 @@ def map_voice_to_speaker(voice: str, model: str):
voice_map = yaml.safe_load(file)
return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)
@app.get("/", response_class=PlainTextResponse)
@app.head("/", response_class=PlainTextResponse)
@app.options("/", response_class=PlainTextResponse)
async def root():
return PlainTextResponse(content="")
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
input: str
@ -162,7 +150,6 @@ async def generate_speech(request: GenerateSpeechRequest):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='main.py',
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -180,4 +167,7 @@ if __name__ == "__main__":
if args.preload:
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
app.register_model('tts-1')
app.register_model('tts-1-hd')
uvicorn.run(app, host=args.host, port=args.port)