mirror of
https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
prebuilt docker, cleanup and docs.
This commit is contained in:
parent b0a6072047
commit 98d1968af6
.github/workflows/build-docker.yml (vendored, new file, 90 lines)
@@ -0,0 +1,90 @@
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # Build and push the minimal (piper-only) Docker image to GHCR for the main branch
      - name: Build and Push Docker.min Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:min
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the minimal Docker image with the corresponding -min tag
      - name: Build and Push Docker.min Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}-min
          labels: version=${{ github.run_id }}
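Once this workflow has run, the published images should be pullable straight from GHCR; a minimal sketch, with tag names taken from the README and docker-compose.yml changes below:

```shell
# full image (piper + xtts_v2)
docker pull ghcr.io/matatonic/openedai-speech:latest
# minimal, piper-only image
docker pull ghcr.io/matatonic/openedai-speech:min
```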
Dockerfile
@@ -1,11 +1,10 @@
-FROM nvidia/cuda:12.2.0-base-ubuntu22.04
+FROM ubuntu:22.04
 
 ENV COQUI_TOS_AGREED=1
 ENV PRELOAD_MODEL=xtts
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
 
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
@@ -19,4 +18,6 @@ RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh
 COPY *.py *.yaml *.md LICENSE /app/
 
-CMD python main.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+
+CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
Dockerfile.min
@@ -1,8 +1,7 @@
 FROM ubuntu:22.04
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn
 
 RUN pip install piper-tts
 
@@ -13,4 +12,6 @@ WORKDIR /app
 
 RUN ./download_voices_tts-1.sh
 
-CMD python main.py --host 0.0.0.0 --port 8000 --xtts_device none
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+
+CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none
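If you would rather build locally than pull from GHCR, a sketch using these two Dockerfiles (the local image names are placeholders, not from this commit):

```shell
# full image: tts-1 via piper plus tts-1-hd via xtts_v2
docker build -f Dockerfile -t openedai-speech .
# minimal piper-only image
docker build -f Dockerfile.min -t openedai-speech:min .
```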
README.md (45 lines changed)
@@ -5,8 +5,7 @@ An OpenAI API compatible text to speech server.
 
 * Compatible with the OpenAI audio/speech API
 * Serves the [/v1/audio/speech endpoint](https://platform.openai.com/docs/api-reference/audio/createSpeech)
-* Does not connect to the OpenAI API and does not require an OpenAI API Key
-* Not affiliated with OpenAI in any way
+* Not affiliated with OpenAI in any way, does not require an OpenAI API Key
 * A free, private, text-to-speech server with custom voice cloning
 
 Full Compatibility:
@@ -17,14 +16,18 @@ Full Compatibility:
 
 Details:
 * model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
-* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, uses almost 4GB GPU VRAM)
+* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
 * Can be run without TTS/xtts_v2, entirely on cpu
-* Custom cloned voices can be used for tts-1-hd, just save a WAV file in `/voices/`
-* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
-* Sometimes certain words or symbols will sound bad, you can fix them with regex via `pre_process_map.yaml`
+* Custom cloned voices can be used for tts-1-hd, just save a WAV file in the `/voices/` directory
+* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via the `voice_to_speaker.yaml` configuration file
+* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml`
 
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
 
+Version: 0.8.0, 2024-03-23
+
+* Pre-built docker images for :latest, :min and release versions
+
 Version: 0.7.3, 2024-03-20
 
 * Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
@@ -41,10 +44,17 @@ API Documentation
 Installation instructions
 -------------------------
 
+You can run the server via docker like so (**recommended**):
+```shell
+docker compose up
+```
+If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min), you can edit the `docker-compose.yml` to easily change this.
+
+Manual instructions:
 ```shell
 # Install the Python requirements
 pip install -r requirements.txt
-# install ffmpeg & curl
+# install ffmpeg and curl
 sudo apt install ffmpeg curl
 # Download the voice models:
 # for tts-1
@@ -57,8 +67,8 @@ Usage
 -----
 
 ```
-usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
-               [-H HOST]
+usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
+               [-H HOST]
 
 OpenedAI Speech API Server
 
@@ -118,21 +128,10 @@ with client.audio.speech.with_streaming_response.create(
     response.stream_to_file("speech.mp3")
 ```
 
-Docker support
---------------
-
-You can run the server via docker like so:
-```shell
-docker compose build
-docker compose up
-```
-
-If you want a minimal docker image with piper support only (900MB vs. 13GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
-
 Custom Voices Howto
 -------------------
 
-Custom voices should be mono 22050 hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds, but can be longer, but longer doesn't always produce better results.
+Custom voices should be mono 22050 Hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds long, but can be longer; however, longer samples do not always produce better results.
 
 You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
 
@@ -141,11 +140,11 @@ You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
 ffmpeg -i input.mp3 -ac 1 -ar 22050 -t 6 -y me.wav
 # use a simple noise filter to clean up audio, and select a start time for sampling.
 ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
-# A more complex noise reduction setup with volume adjustment
+# A more complex noise reduction setup, including volume adjustment
 ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
 ```
 
-Once you WAV file is prepared, save it in the voices/ folder and update the `voice_to_speaker.yaml` file with the new file name.
+Once your WAV file is prepared, save it in the `/voices/` directory and update the `voice_to_speaker.yaml` file with the new file name.
 
 For example:
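The README's own example entry falls outside this hunk. Purely as an illustration, here is a hypothetical `voice_to_speaker.yaml` entry; the layout follows how `map_voice_to_speaker()` in speech.py reads it (`voice_map[model][voice]['model']` and `voice_map[model][voice]['speaker']`), while the voice name `me` and the file path are invented:

```yaml
tts-1-hd:
  me:
    model: xtts_v2          # hypothetical: which xtts model/version to use
    speaker: voices/me.wav  # hypothetical: your prepared WAV sample
```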
docker-compose.yml
@@ -1,12 +1,12 @@
 version: "3.3"
 services:
   server:
-    build:
-      context: .
-      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
     stdin_open: true
     tty: true
+    #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
+    image: ghcr.io/matatonic/openedai-speech # :latest, :min, :<version>, :<version>-min
+    command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
+    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
     ports:
       - "8000:8000"
     # volumes:
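To run the piper-only build from the pre-built images instead, the comments above suggest pointing `image:` at the `:min` tag and swapping in the commented-out command; a sketch, not part of this commit:

```yaml
version: "3.3"
services:
  server:
    image: ghcr.io/matatonic/openedai-speech:min
    command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"]
    ports:
      - "8000:8000"
```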
openedai.py (new file, 66 lines)
@@ -0,0 +1,66 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse

class OpenAIStub(FastAPI):
    def __init__(self) -> None:
        super().__init__()
        self.models = {}

        self.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"]
        )

        @self.get('/v1/billing/usage')
        @self.get('/v1/dashboard/billing/usage')
        async def handle_billing_usage():
            return { 'total_usage': 0 }

        @self.get("/", response_class=PlainTextResponse)
        @self.head("/", response_class=PlainTextResponse)
        @self.options("/", response_class=PlainTextResponse)
        async def root():
            return PlainTextResponse(content="", status_code=200 if self.models else 503)

        @self.get("/health")
        async def health():
            return {"status": "ok" if self.models else "unk" }

        @self.get("/v1/models")
        async def get_model_list():
            return self.model_list()

        @self.get("/v1/models/{model}")
        async def get_model_info(model: str):
            return self.model_info(model)

    def register_model(self, name: str, model: str = None) -> None:
        self.models[name] = model if model else name

    def deregister_model(self, name: str) -> None:
        if name in self.models:
            del self.models[name]

    def model_info(self, model: str) -> dict:
        result = {
            "id": model,
            "object": "model",
            "created": 0,
            "owned_by": "user"
        }
        return result

    def model_list(self) -> dict:
        if not self.models:
            return {}

        result = {
            "object": "list",
            "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ]
        }

        return result
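A quick way to sanity-check the stub once the server is running; the /v1/models output shown is approximately what `model_list()` returns after speech.py registers its two models:

```shell
curl http://localhost:8000/health
# {"status": "ok"}
curl http://localhost:8000/v1/models
# {"object": "list", "data": [{"id": "tts-1", "object": "model", "created": 0, "owned_by": "user"}, ...]}
```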
speech.py
@@ -12,9 +12,11 @@ from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
 from pydantic import BaseModel
 
+import openedai
+
 xtts = None
 args = None
-app = FastAPI()
+app = openedai.OpenAIStub()
 
 class xtts_wrapper():
     def __init__(self, model_name, device):
@@ -50,20 +52,6 @@ def map_voice_to_speaker(voice: str, model: str):
         voice_map = yaml.safe_load(file)
     return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
 
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"]
-)
-
-@app.get("/", response_class=PlainTextResponse)
-@app.head("/", response_class=PlainTextResponse)
-@app.options("/", response_class=PlainTextResponse)
-async def root():
-    return PlainTextResponse(content="")
-
 class GenerateSpeechRequest(BaseModel):
     model: str = "tts-1" # or "tts-1-hd"
     input: str
@@ -162,7 +150,6 @@ async def generate_speech(request: GenerateSpeechRequest):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        prog='main.py',
         description='OpenedAI Speech API Server',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
@@ -180,4 +167,7 @@ if __name__ == "__main__":
     if args.preload:
         xtts = xtts_wrapper(args.preload, device=args.xtts_device)
 
+    app.register_model('tts-1')
+    app.register_model('tts-1-hd')
+
     uvicorn.run(app, host=args.host, port=args.port)
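With the stub wired in, the server should accept a standard OpenAI-style speech request end to end; a minimal sketch with curl (the voice name `alloy` is an assumption based on OpenAI's defaults, not confirmed by this diff):

```shell
curl http://localhost:8000/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "input": "Hello world.", "voice": "alloy"}' \
  -o speech.mp3
```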