matatonic 2023-11-26 21:41:59 -05:00
parent 93eb45f6e7
commit e4d001da93
8 changed files with 326 additions and 0 deletions

Dockerfile.min Normal file

@@ -0,0 +1,17 @@
FROM ubuntu:22.04

# tts-1 only
RUN apt-get update && \
    apt-get install --no-install-recommends -y wget ffmpeg python3-pip python3-yaml python3-fastapi python3-uvicorn && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml /app/
WORKDIR /app
RUN pip install piper-tts
# pre-download the default voice models so the container can run without network access
RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null

CMD python3 main.py

README.md Normal file

@@ -0,0 +1,98 @@
openedai API for audio/speech
-----------------------------

This is an API clone of the OpenAI API for text-to-speech audio generation.

This is v0.1, so please excuse the rough docs and configuration.

It currently supports 'tts-1' via piper tts (fast, ~1 second latency) and 'tts-1-hd' via xtts_v2 (slow, and it also uses a couple of gigabytes of GPU VRAM).

Installation instructions:
--------------------------

```pip install -r requirements.txt```
To download voices in advance:

for the tts-1 model:
```shell
piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
```

for tts-1-hd:
```shell
export COQUI_TOS_AGREED=1
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "." --language_idx en > /dev/null
```
Run the server; it listens on ```port 8000``` by default:

```python main.py```
API Usage
---------
You can use it like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "model": "tts-1",
    "input": "The quick brown fox jumped over the lazy dog.",
    "voice": "alloy",
    "response_format": "mp3",
    "speed": 1.0
  }' > speech.mp3
```
Or just like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```
Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):
```python
import openai

client = openai.OpenAI(
    # This part is not needed if you set these environment variables before importing openai
    # export OPENAI_API_KEY=sk-11111111111
    # export OPENAI_BASE_URL=http://localhost:8000/v1
    api_key="sk-111111111",
    base_url="http://localhost:8000/v1",
)

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Today is a wonderful day to build something people love!",
)

response.stream_to_file("speech.mp3")
```
Docker support
--------------
You can run the server via docker like so:
```shell
docker compose build
docker compose up
```
By default it builds a minimal docker image with piper and tts-1 support only. You can edit docker-compose.yml to change this; see the sketch below.
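For example, to build with tts-1-hd/xtts_v2 support instead, you would swap the `dockerfile` line in docker-compose.yml (a minimal sketch of the edit, assuming a full `Dockerfile` with the xtts_v2 dependencies exists alongside `Dockerfile.min`):

```yaml
services:
  server:
    build:
      context: .
      #dockerfile: Dockerfile.min
      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
```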
Voice sounds bad on some words or symbols? Check out ```pre_process_map.yaml``` and add a regular expression to replace it with something that sounds right.
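Each entry in the file is a two-element list, a regex pattern and its replacement, applied in order. A sketch of a hypothetical entry (the pattern and replacement are placeholders to adapt):

```yaml
# speak 'GPU' as individual letters
- - ' GPU '
  - ' G.P.U. '
```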
Want to change the voices or add your own? Check out ```voice_to_speaker.yaml```. I tried to map the voices to something similar to the OpenAI voices, but some are better than others.
If you find a better voice match, please let me know so I can update the defaults.
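For example, a hypothetical new tts-1 voice might look like this (the voice name is yours to pick; the model can be any piper voice model):

```yaml
tts-1:
  mycustomvoice:
    model: voices/en_US-ryan-high.onnx # any piper voice model in the voices/ dir
    speaker: # blank for single-speaker models, or a numeric speaker id
```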
Voice models for tts-1-hd/xtts2 are incomplete; you can add your own WAV file samples to create more voices, see alloy.wav for a sample.
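A hypothetical cloned voice for tts-1-hd would follow the same pattern, pointing speaker at your own WAV sample (the voice and file names here are placeholders):

```yaml
tts-1-hd:
  myclonedvoice:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/myclonedvoice.wav # a short, clean speech sample to clone
```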

docker-compose.yml Normal file

@@ -0,0 +1,20 @@
version: "3.3"
services:
  server:
    build:
      context: .
      dockerfile: Dockerfile.min
      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
    stdin_open: true
    tty: true
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              #device_ids: ['0', '1']
              capabilities: [gpu]
    ports:
      - "8000:8000"

main.py Executable file

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
import subprocess
import yaml
import re
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel

piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough

app = FastAPI()

def preprocess(raw_input):
    # apply the regex pairs from pre_process_map.yaml, in order
    with open('pre_process_map.yaml', 'r') as file:
        pre_process_map = yaml.safe_load(file)
    for a, b in pre_process_map:
        raw_input = re.sub(a, b, raw_input)
    return raw_input

def map_voice_to_speaker(voice: str, model: str):
    with open('voice_to_speaker.yaml', 'r') as file:
        voice_map = yaml.safe_load(file)
    return voice_map[model][voice]['model'], voice_map[model][voice]['speaker']

class GenerateSpeechRequest(BaseModel):
    model: str = "tts-1" # or "tts-1-hd"
    input: str
    voice: str = "alloy" # alloy, echo, fable, onyx, nova, and shimmer
    response_format: str = "mp3" # mp3, opus, aac, flac
    speed: float = 1.0 # 0.25 - 4.0

@app.post("/v1/audio/speech") #, response_model=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
    input_text = preprocess(request.input)
    model = request.model
    voice = request.voice
    response_format = request.response_format
    speed = request.speed

    # Set the Content-Type header based on the requested format
    if response_format == "mp3":
        media_type = "audio/mpeg"
    elif response_format == "opus":
        media_type = "audio/ogg;codecs=opus"
    elif response_format == "aac":
        media_type = "audio/aac"
    elif response_format == "flac":
        media_type = "audio/x-flac"

    # Convert the output to the desired format using ffmpeg
    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]
    if response_format == "mp3":
        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"]) # 32k or 64k?
    elif response_format == "opus":
        ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
    elif response_format == "aac":
        ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
    elif response_format == "flac":
        ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
    #"-hwaccel:auto"

    tts_args = []
    tts_proc = None
    if model == 'tts-1':
        piper_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
        if piper_cuda:
            tts_args.extend(["--cuda"])
        if speaker:
            tts_args.extend(["--speaker", str(speaker)])
        if speed != 1.0:
            tts_args.extend(["--length-scale", f"{1.0/speed}"]) # piper speaks faster with a smaller length scale
        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))

    elif model == 'tts-1-hd':
        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
        tts_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out"]
        if speaker:
            tts_args.extend(["--speaker_wav", str(speaker)])
        if speed > 2.0: # tts has a max speed of 2.0, handle the rest with ffmpeg's atempo filter
            ffmpeg_args.extend(["-af", "atempo=2.0"])
            speed = min(speed / 2.0, 2.0)
        if speed != 1.0:
            tts_args.extend(["--speed", str(speed)])
        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    # Pipe the output from the tts engine to the input of ffmpeg
    ffmpeg_args.extend(["-"])
    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
    tts_proc.stdin.close()
    #print(" ".join(tts_args))
    #print(" ".join(ffmpeg_args))

    return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")

pre_process_map.yaml Normal file

@@ -0,0 +1,39 @@
# regex pairs to clean the text before speaking
- - ([^.])\.$
  - \1
- - '&amp;'
  - '&'
- - '&lt;'
  - <
- - '&gt;'
  - '>'
- - '&quot;'
  - '"'
- - '&#x27;'
  - ''''
- - '&copy;'
  - '©'
- - '&reg;'
  - '®'
- - '&nbsp;'
  - ' '
- - '"'
  - ''
- - ' biases '
  - ' bias''s '
- - ex\.
  - for example
- - e\.g\.
  - for example
- - ' ESG '
  - ' E.S.G. '
- - ' FY '
  - ' F.Y. '
- - ([0-9]+)-([0-9]+)
  - \1 to \2
- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
  - ''
- - '\*\*\*'
  - '*'
- - '\*\*'
  - '*'

requirements.txt Normal file

@@ -0,0 +1,5 @@
fastapi
uvicorn
piper-tts
TTS
#onnxruntime-gpu #not needed

voice_to_speaker.yaml Normal file

@@ -0,0 +1,41 @@
tts-1:
  some_other_voice_name_you_want:
    model: voices/choose your own model.onnx
    speaker: set your own speaker
  alloy:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 118 # 118, 64, 79, 80, 101, 130
  echo:
    model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
    speaker: 52 # 52, 102, 134
  fable:
    model: voices/en_GB-northern_english_male-medium.onnx
    speaker: # default speaker
  onyx:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
  nova:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 57 # 57, 61, 107, 150, 162
  shimmer:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
tts-1-hd:
  alloy:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/alloy.wav
  echo:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/echo.wav # TODO
  fable:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/fable.wav # TODO
  onyx:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/onyx.wav # TODO
  nova:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/nova.wav # TODO
  shimmer:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/shimmer.wav # TODO

voices/alloy.wav Normal file
Binary file not shown.