mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00

commit 7a6abf1538 ("0.2.0 rc1"), parent 0ca7da80c4
Dockerfile (10 lines changed)
@@ -1,20 +1,18 @@
 FROM nvidia/cuda:11.8.0-base-ubuntu22.04

+ENV COQUI_TOS_AGREED=1

 #python3.11
 RUN apt-get update && \
     apt-get install --no-install-recommends -y python3-pip wget ffmpeg && \
     apt-get clean && rm -rf /var/lib/apt/lists/*

-#RUN git clone https://github.com/matatonic/openedai-api-audio-speech /app
+#RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh /app/
 COPY ./voices/alloy.wav /app/voices/alloy.wav
 WORKDIR /app
-RUN pip install -r requirements.txt

+RUN pip install -r requirements.txt
+RUN ./download_voices_tts-1.sh
+RUN ./download_voices_tts-1-hd.sh

-CMD python3 main.py
-ENV COQUI_TOS_AGREED=1
+CMD python3 main.py --host 0.0.0.0 --port 8000 --preload_xtts
Dockerfile.min

@@ -5,12 +5,13 @@ RUN apt-get update && \
     apt-get install --no-install-recommends -y wget ffmpeg python3-pip python3-yaml python3-fastapi python3-uvicorn && \
     apt-get clean && rm -rf /var/lib/apt/lists/*

-RUN pip install piper-tts
-
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-COPY *.py *.yaml *.sh /app/
+COPY *.py *.yaml *.txt *.md *.sh /app/
 WORKDIR /app
+RUN pip install piper-tts

+RUN download_voices_tts-1.sh

-CMD python3 main.py
+CMD python3 main.py --host 0.0.0.0 --port 8000 --xtts_device none
README.md (87 lines changed)
@@ -1,39 +1,74 @@
-openedai API for audio/speech
+OpenedAI API for audio/speech
 -----------------------------

 This is an API clone of the OpenAI API for text to speech audio generation.

-This is v0.1, so please excuse the rough docs and configuration.
+* Compatible with the OpenAI audio/speech API
+* Does not connect to OpenAI
+* Does not require a (real) OpenAI API Key.
+* Not affiliated with OpenAI in any way.

-It currently supports 'tts-1' via piper tts (fast, ~1 sec latency), and 'tts-1-hd' via xtts_v2 (slow, also uses a couple gigs of gpu vram).
+API Support:
+* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
+* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 (fast, uses almost 4GB GPU VRAM)
+* Can be run without TTS/xtts_v2, entirely on cpu

-Installation instructions:
---------------------------
+Compatibility:
+* `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
+* `tts-1-hd`: `alloy` (incomplete, they're all alloy)
+* Custom cloned voices can be used for tts-1-hd, just save a WAV file in `/voices/`
+* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
+* response_format: mp3, opus, aac, or flac
+* Sometimes certain words or symbols will sound bad, you can fix them with regex via `pre_process_map.yaml`

-```pip install -r requirements.txt```
+If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.

-You need to install [ffmpeg](https://ffmpeg.org/download.html)
+Version: 0.2.0

-To download voices in advance:
+Last update: 2023-11-27
+
+API Documentation
+-----------------
+
+* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
+* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+
+Installation instructions
+-------------------------

-for the tts-1 model:
 ```shell
-piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
-piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
+# Install the Python requirements
+pip install -r requirements.txt
+# install ffmpeg
+sudo apt install ffmpeg
+# Download the voice models:
+# for tts-1
+bash download_voices_tts-1.sh
+# and for tts-1-hd
+bash download_voices_tts-1-hd.sh
 ```

-for tts-1-hd:
-```shell
-COQUI_TOS_AGREED=1
-tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "." --language_idx en > /dev/null
-```
+Usage
+-----
+
+```
+usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload_xtts] [-P PORT] [-H HOST]
+
+OpenedAI Speech API Server
+
+options:
+  -h, --help            show this help message and exit
+  --piper_cuda          Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
+  --xtts_device XTTS_DEVICE
+                        Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
+  --preload_xtts        Preload the xtts model. By default it's loaded on first use. (default: False)
+  -P PORT, --port PORT  Server tcp port (default: 8000)
+  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: localhost)
+```

 Run the server, it listens on ```port 8000``` by default:

 ```python main.py```

-API Usage
----------
+Sample API Usage
+----------------

 You can use it like this:
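(Editorial aside, not part of the commit: the sample request itself falls outside this hunk. A minimal sketch against the `/v1/audio/speech` endpoint defined in main.py below, assuming the server is running on its default host and port and the `requests` package is installed:)

```python
# Illustrative only: POST to the OpenAI-compatible speech endpoint
# and save the result as an mp3 file.
import requests

response = requests.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "tts-1",          # or "tts-1-hd" for xtts_v2
        "voice": "alloy",
        "input": "Hello world!",
        "response_format": "mp3",  # mp3, opus, aac, or flac
        "speed": 1.0,              # 0.25 - 4.0
    },
)
with open("speech.mp3", "wb") as f:
    f.write(response.content)
```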
@@ -89,12 +124,4 @@ docker compose build
 docker compose up
 ```

-By default it will build a minimal docker image with piper and tts-1 support only. You can edit docker-compose.yml to change this.
-
-Voice sounds bad on some words or symbols? Check out ```pre_process_map.yaml``` and add a regular expression to replace it with something that sounds right.
-
-Want to change the voices or add your own? Check out ```voice_to_speaker.yaml```. I tried to map the voices to something similar to the OpenAI voices, but some are better than others.
-
-If you find a better voice match, please let me know so I can update the defaults.
-
-Voice models for tts-1-hd/xtts_v2 are incomplete, you can add your own WAV file samples to make more voices, see alloy.wav for a sample.
+If you want a minimal docker image with piper only, see: Dockerfile.min. You can edit the `docker-compose.yml` to change this.
docker-compose.yml

@@ -3,20 +3,21 @@ services:
   server:
     build:
       context: .
-      #dockerfile: Dockerfile.min
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
+      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required
     stdin_open: true
     tty: true
-    ports:
-      - "8000:8000"
-    #volumes:
-    #  - .:/app/
+    # Below can be removed if not using GPU
     runtime: nvidia
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              #device_ids: ['0', '1'] # Select a gpu, or
               count: all
+              #device_ids: ['0', '1']
               capabilities: [gpu]
+    ports:
+      - "8000:8000"
+    volumes:
+      - .:/app/
download_voices_tts-1-hd.sh

@@ -1,5 +1,4 @@
 #!/bin/sh
-COQUI_TOS_AGREED=1
-tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "Done" --language_idx "en" --speaker_wav voices/alloy.wav --pipe_out | \
-  ffmpeg -f s16le -ar 22050 -ac 1 -i - > /dev/null
-rm -f tts_output.wav
+model="tts_models/multilingual/multi-dataset/xtts_v2"
+python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
main.py (133 lines changed)
@@ -1,84 +1,40 @@
 #!/usr/bin/env python3
-import subprocess
-import yaml
-import re
-import io
-import os
-from pathlib import Path
+import argparse
+import os
+import re
+import subprocess
+import tempfile
+import yaml
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
-import numpy as np
-import torch
-#import TTS
 from TTS.api import TTS
-from TTS.config import load_config
-from TTS.utils.manage import ModelManager
-from TTS.utils.synthesizer import Synthesizer
-from TTS.utils.audio.numpy_transforms import save_wav

-piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough
-xtts_device = 'cuda'
-
+xtts = None
+args = None
 app = FastAPI()

-class FakeBufferedIO(io.BytesIO):
-    def __init__(self):
-        self.buffer = self
-
 class xtts_wrapper():
     def __init__(self, model_name):

-        self.xtts = TTS(model_name=model_name, progress_bar=False, gpu=True).to(xtts_device)
-        """
-        vocoder_path, vocoder_config_path = None, None
-        tts_loc = Path(TTS.__file__).parent / '.models.json'
-        manager = ModelManager(tts_loc)
-        model_path, config_path, model_item = manager.download_model(model_name)
-        if not config_path:
-            config_path = os.path.join(model_path, "config.json")
-        #print(model_path, config_path, model_item)
-        #vocoder_path, vocoder_config_path, _ = manager.download_model(model_item["default_vocoder"])
-
-        self.xtts_synthesizer = Synthesizer(
-            tts_checkpoint=model_path,
-            tts_config_path=config_path,
-            #tts_speakers_file=None,
-            #tts_languages_file=None,
-            #vocoder_checkpoint=vocoder_path,
-            #vocoder_config=vocoder_config_path,
-            #encoder_checkpoint="",
-            #encoder_config="",
-            use_cuda=xtts_cuda,
-        )
-
-        self.use_multi_speaker = hasattr(self.xtts_synthesizer.tts_model, "num_speakers") and (
-            self.xtts_synthesizer.tts_model.num_speakers > 1 or self.xtts_synthesizer.tts_speakers_file is not None
-        )
-        self.speaker_manager = getattr(self.xtts_synthesizer.tts_model, "speaker_manager", None)
-
-        self.use_multi_language = hasattr(self.xtts_synthesizer.tts_model, "num_languages") and (
-            self.xtts_synthesizer.tts_model.num_languages > 1 or self.xtts_synthesizer.tts_languages_file is not None
-        )
-        self.language_manager = getattr(self.xtts_synthesizer.tts_model, "language_manager", None)
-        """
+        self.xtts = TTS(model_name=model_name, progress_bar=False).to(args.xtts_device)

     def tts(self, text, speaker_wav, speed):
-        io_ret = FakeBufferedIO()
+        tf, file_path = tempfile.mkstemp(suffix='.wav')

         file_path = self.xtts.tts_to_file(
             text,
             language='en',
             speaker_wav=speaker_wav,
             speed=speed,
-            pipe_out=io_ret,
+            file_path=file_path,
         )

-        #self.xtts.synthesizer.save_wav(wav, path='tts_output.wav', pipe_out=io_ret)
-        return io_ret
-
-xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+        os.unlink(file_path)
+        return tf

+# Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
     with open('pre_process_map.yaml', 'r') as file:
         pre_process_map = yaml.safe_load(file)
@@ -86,6 +42,7 @@ def preprocess(raw_input):
         raw_input = re.sub(a, b, raw_input)
     return raw_input

+# Read voice map on demand so it can be changed without restarting the server
 def map_voice_to_speaker(voice: str, model: str):
     with open('voice_to_speaker.yaml', 'r') as file:
         voice_map = yaml.safe_load(file)
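(Editorial aside, not part of the commit: both YAML files are re-read on every request, so edits take effect without restarting the server. A sketch of the shapes involved; the pre_process_map entry below is invented for illustration, while the voice_to_speaker shape follows the yaml hunk at the end of this diff.)

```python
# Illustrative only: the two on-demand YAML configs in miniature.
import re
import yaml

# pre_process_map.yaml parses to a list of [pattern, replacement] pairs,
# matching the `for a, b in pre_process_map` loop above. Example entry invented:
pre_process_map = [
    [r"([0-9]+)-([0-9]+)", r"\1 to \2"],  # e.g. speak "1-4" as "1 to 4"
]
text = "chapters 1-4"
for a, b in pre_process_map:
    text = re.sub(a, b, text)
print(text)  # "chapters 1 to 4"

# voice_to_speaker.yaml nests model -> voice -> {model, speaker}:
with open("voice_to_speaker.yaml") as f:
    voice_map = yaml.safe_load(f)
entry = voice_map["tts-1-hd"]["alloy"]
print(entry["model"], entry["speaker"])  # xtts_v2 model name, voices/alloy.wav
```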
@@ -98,7 +55,7 @@ class GenerateSpeechRequest(BaseModel):
     response_format: str = "mp3" # mp3, opus, aac, flac
     speed: float = 1.0 # 0.25 - 4.0

-@app.post("/v1/audio/speech") #, response_model=StreamingResponse)
+@app.post("/v1/audio/speech")
 async def generate_speech(request: GenerateSpeechRequest):
     input_text = preprocess(request.input)
     model = request.model
@@ -120,21 +77,21 @@ async def generate_speech(request: GenerateSpeechRequest):
     ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]

     if response_format == "mp3":
-        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"]) # 32k or 64k?
+        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
     elif response_format == "opus":
         ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
     elif response_format == "aac":
         ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
     elif response_format == "flac":
         ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
-    #"-hwaccel:auto"

     tts_io_out = None

-    if model == 'tts-1':
-        piper_model, speaker = map_voice_to_speaker(voice, model)
+    # Use piper for tts-1, and if xtts_device == none use for all models.
+    if model == 'tts-1' or args.xtts_device == 'none':
+        piper_model, speaker = map_voice_to_speaker(voice, 'tts-1')
         tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
-        if piper_cuda:
+        if args.piper_cuda:
             tts_args.extend(["--cuda"])
         if speaker:
             tts_args.extend(["--speaker", str(speaker)])
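(Editorial aside, not part of the commit: this hunk and the next wire piper's raw s16le, 22050 Hz mono output into ffmpeg's stdin. A condensed, standalone sketch of that pipe; the model name is borrowed from the README examples rather than from voice_to_speaker.yaml.)

```python
# Illustrative only: the piper -> ffmpeg pipeline in miniature.
import subprocess

tts_args = ["piper", "--model", "en_US-libritts_r-medium",
            "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050",
               "-ac", "1", "-i", "-", "-f", "mp3", "-c:a", "libmp3lame",
               "-ab", "64k", "-"]

# piper reads text on stdin and writes raw PCM on stdout.
tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
tts_proc.stdin.write("Hello world!".encode())
tts_proc.stdin.close()

# ffmpeg consumes piper's stdout and emits encoded mp3 on its own stdout.
ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
with open("speech.mp3", "wb") as f:
    f.write(ffmpeg_proc.stdout.read())
```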
@@ -146,41 +103,41 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.close()
         tts_io_out = tts_proc.stdout

+    # Use xtts_v2 for tts-1-hd
     elif model == 'tts-1-hd':
-        tts_model, speaker = model, speaker = map_voice_to_speaker(voice, model)
+        if not xtts:
+            xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+
+        tts_model, speaker = model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')

-        #tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
-        #if speaker:
-        #    tts_args.extend(["--speaker_wav", str(speaker)])
         if speed > 2.0: # tts has a max speed of 2.0
             ffmpeg_args.extend(["-af", "atempo=2.0"])
             speed = min(speed / 2.0, 2.0)
-        #if speed != 1.0:
-        #    tts_args.extend(["--speed", str(speed)])
+        if speed == 1.0:
+            speed = None

         tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)

-#        if torch.is_tensor(wav):
-#            wav = wav.cpu().numpy()
-#        if isinstance(wav, list):
-#            wav = np.array(wav)
-
-        #tts_io_out = io.BytesIO()
-        #save_wav(wav, tts_io_out)
-
-        #tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-
-    # Pipe the output from piper to the input of ffmpeg
+    # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
     ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)

     #print(" ".join(tts_args))
     #print(" ".join(ffmpeg_args))

     return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)


 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
+    parser = argparse.ArgumentParser(
+        prog='main.py',
+        description='OpenedAI Speech API Server',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
+    parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
+    parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
+    parser.add_argument('-P', '--port', action='store', default=8000, help="Server tcp port")
+    parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
+
+    args = parser.parse_args()
+
+    if args.preload_xtts:
+        xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+
+    uvicorn.run(app, host=args.host, port=args.port) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
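(Editorial aside, not part of the commit: in the tts-1-hd branch above, xtts caps speed at 2.0, so faster requests are split between the model and an ffmpeg atempo=2.0 filter. The arithmetic for a hypothetical 3.0x request:)

```python
# Illustrative only: for requested speed 3.0, ffmpeg doubles the tempo
# and xtts provides the remaining 1.5x, so 2.0 * 1.5 == 3.0.
requested = 3.0
if requested > 2.0:
    ffmpeg_atempo = 2.0                     # appended as ["-af", "atempo=2.0"]
    xtts_speed = min(requested / 2.0, 2.0)  # 1.5
else:
    ffmpeg_atempo = 1.0
    xtts_speed = requested
assert ffmpeg_atempo * xtts_speed == requested
```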
requirements.txt

@@ -2,4 +2,4 @@ fastapi
 uvicorn
 piper-tts
 TTS
-#onnxruntime-gpu #not needed
+onnxruntime-gpu
voice_to_speaker.yaml

@@ -26,16 +26,16 @@ tts-1-hd:
     speaker: voices/alloy.wav
   echo:
     model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/echo.wav # TODO
+    speaker: voices/alloy.wav # voices/echo.wav # TODO
   fable:
     model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/fable.wav # TODO
+    speaker: voices/alloy.wav # voices/fable.wav # TODO
   onyx:
     model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/onyx.wav # TODO
+    speaker: voices/alloy.wav # voices/onyx.wav # TODO
   nova:
     model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/nova.wav # TODO
+    speaker: voices/alloy.wav # voices/nova.wav # TODO
   shimmer:
     model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/shimmer.wav # TODO
+    speaker: voices/alloy.wav # voices/shimmer.wav # TODO