mirror of https://github.com/matatonic/openedai-speech

commit e4d001da93 (parent 93eb45f6e7): initial

17 Dockerfile.min Normal file
@@ -0,0 +1,17 @@
FROM ubuntu:22.04

# tts-1 only
RUN apt-get update && \
    apt-get install --no-install-recommends -y wget ffmpeg python3-pip python3-yaml python3-fastapi python3-uvicorn && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml /app/
WORKDIR /app
RUN pip install piper-tts

RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null

CMD python3 main.py
98 README.md Normal file
@@ -0,0 +1,98 @@
openedai API for audio/speech
-----------------------------

This is an API clone of the OpenAI API for text-to-speech audio generation.

This is v0.1, so please excuse the rough docs and configuration.

It currently supports 'tts-1' via piper tts (fast, ~1 sec latency) and 'tts-1-hd' via xtts_v2 (slow, and it uses a couple of gigabytes of GPU VRAM).

Installation instructions:
--------------------------

```pip install -r requirements.txt```

To download voices in advance:

for the tts-1 model:
```shell
piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
```

for tts-1-hd:
```shell
export COQUI_TOS_AGREED=1
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "." --language_idx en > /dev/null
```

Run the server; it listens on ```port 8000``` by default:

```python main.py```

API Usage
---------

You can use it like this:

```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "model": "tts-1",
    "input": "The quick brown fox jumped over the lazy dog.",
    "voice": "alloy",
    "response_format": "mp3",
    "speed": 1.0
  }' > speech.mp3
```

Or just like this:

```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```

Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):

```python
import openai

client = openai.OpenAI(
    # This part is not needed if you set these environment variables before importing openai:
    # export OPENAI_API_KEY=sk-11111111111
    # export OPENAI_BASE_URL=http://localhost:8000/v1
    api_key = "sk-111111111",
    base_url = "http://localhost:8000/v1",
)

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Today is a wonderful day to build something people love!"
)

response.stream_to_file("speech.mp3")
```

Docker support
--------------

You can run the server via docker like so:

```shell
docker compose build
docker compose up
```

By default it builds a minimal docker image with piper and tts-1 support only. You can edit docker-compose.yml to change this, for example as sketched below.

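Following the comment already present in docker-compose.yml, switching to the full image (tts-1-hd support via xtts_v2) should only require changing the dockerfile line; a minimal sketch of the relevant part of the compose file:

```yaml
services:
  server:
    build:
      context: .
      #dockerfile: Dockerfile.min    # default: minimal image, tts-1 only
      dockerfile: Dockerfile         # full image, adds tts-1-hd via xtts_v2
```
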
Voice sounds bad on some words or symbols? Check out ```pre_process_map.yaml``` and add a regular expression to replace it with something that sounds right, for example:

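A hypothetical pair in the same format as the existing entries, spelling out "GPU" so it is read letter by letter (the abbreviation here is just an example):

```yaml
- - ' GPU '
  - ' G.P.U. '
```
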
Want to change the voices or add your own? Check out ```voice_to_speaker.yaml```; see the sketch below. I tried to map the voices to something similar to the OpenAI voices, but some are better than others.

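A minimal sketch of a custom tts-1 entry, mirroring the some_other_voice_name_you_want placeholder in the file (the voice name and speaker id here are hypothetical):

```yaml
tts-1:
  my_voice:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 50   # hypothetical speaker id; leave empty for the default speaker
```
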
If you find a better voice match, please let me know so I can update the defaults.

Voice models for tts-1-hd/xtts2 are incomplete; you can add your own WAV file samples to make more voices (see voices/alloy.wav for a sample, and the sketch below).

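A sketch of a tts-1-hd entry using a new sample, assuming you saved one as voices/my_voice.wav (a hypothetical file; the model id matches the existing tts-1-hd entries):

```yaml
tts-1-hd:
  my_voice:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/my_voice.wav   # hypothetical WAV sample
```
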
20 docker-compose.yml Normal file
@@ -0,0 +1,20 @@
version: "3.3"
services:
  server:
    build:
      context: .
      dockerfile: Dockerfile.min
      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
    stdin_open: true
    tty: true
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              #device_ids: ['0', '1']
              capabilities: [gpu]
    ports:
      - "8000:8000"
106 main.py Executable file
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
import subprocess
import yaml
import re
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel

piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough

app = FastAPI()

def preprocess(raw_input):
    # apply the regex substitution pairs from pre_process_map.yaml, in order
    with open('pre_process_map.yaml', 'r') as file:
        pre_process_map = yaml.safe_load(file)
    for a, b in pre_process_map:
        raw_input = re.sub(a, b, raw_input)
    return raw_input

def map_voice_to_speaker(voice: str, model: str):
    with open('voice_to_speaker.yaml', 'r') as file:
        voice_map = yaml.safe_load(file)
    return voice_map[model][voice]['model'], voice_map[model][voice]['speaker']

class GenerateSpeechRequest(BaseModel):
    model: str = "tts-1" # or "tts-1-hd"
    input: str
    voice: str = "alloy" # alloy, echo, fable, onyx, nova, and shimmer
    response_format: str = "mp3" # mp3, opus, aac, flac
    speed: float = 1.0 # 0.25 - 4.0

@app.post("/v1/audio/speech") #, response_model=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
    input_text = preprocess(request.input)
    model = request.model
    voice = request.voice
    response_format = request.response_format
    speed = request.speed

    # Set the Content-Type header based on the requested format
    if response_format == "mp3":
        media_type = "audio/mpeg"
    elif response_format == "opus":
        media_type = "audio/ogg;codecs=opus"
    elif response_format == "aac":
        media_type = "audio/aac"
    elif response_format == "flac":
        media_type = "audio/x-flac"
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported response_format: {response_format}")

    # Convert the output to the desired format using ffmpeg
    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]

    if response_format == "mp3":
        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"]) # 32k or 64k?
    elif response_format == "opus":
        ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
    elif response_format == "aac":
        ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
    elif response_format == "flac":
        ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
    #"-hwaccel:auto"

    tts_args = []
    tts_proc = None

    if model == 'tts-1':
        piper_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
        if piper_cuda:
            tts_args.extend(["--cuda"])
        if speaker:
            tts_args.extend(["--speaker", str(speaker)])
        if speed != 1.0:
            tts_args.extend(["--length-scale", f"{1.0/speed}"]) # length-scale stretches the audio, so invert the speed

        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        tts_proc.stdin.write(input_text.encode('utf-8'))
    elif model == 'tts-1-hd':
        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
        tts_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["tts", "--text", input_text, "--use_cuda", "true", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out"]
        if speaker:
            tts_args.extend(["--speaker_wav", str(speaker)])
        if speed > 2.0: # tts has a max speed of 2.0, do the rest in ffmpeg
            ffmpeg_args.extend(["-af", "atempo=2.0"])
            speed = min(speed / 2.0, 2.0)
        if speed != 1.0:
            tts_args.extend(["--speed", str(speed)])

        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

    # Pipe the output from the tts engine to the input of ffmpeg
    ffmpeg_args.extend(["-"])
    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
    tts_proc.stdin.close()

    #print(" ".join(tts_args))
    #print(" ".join(ffmpeg_args))

    return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
39 pre_process_map.yaml Normal file
@@ -0,0 +1,39 @@
# regex pairs to clean the text before speaking
- - ([^.])\.$
  - \1
- - '&amp;'
  - '&'
- - '&lt;'
  - <
- - '&gt;'
  - '>'
- - '&quot;'
  - '"'
- - '&apos;'
  - ''''
- - '&copy;'
  - '©'
- - '&reg;'
  - '®'
- - '&nbsp;'
  - ' '
- - '"'
  - ''
- - ' biases '
  - ' bias''s '
- - ex\.
  - for example
- - e\.g\.
  - for example
- - ' ESG '
  - ' E.S.G. '
- - ' FY '
  - ' F.Y. '
- - ([0-9]+)-([0-9]+)
  - \1 to \2
- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
  - ''
- - '\*\*\*'
  - '*'
- - '\*\*'
  - '*'
5 requirements.txt Normal file
@@ -0,0 +1,5 @@
fastapi
uvicorn
piper-tts
TTS
#onnxruntime-gpu #not needed
41 voice_to_speaker.yaml Normal file
@@ -0,0 +1,41 @@
tts-1:
  some_other_voice_name_you_want:
    model: voices/choose your own model.onnx
    speaker: set your own speaker
  alloy:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 118 # 118, 64, 79, 80, 101, 130
  echo:
    model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
    speaker: 52 # 52, 102, 134
  fable:
    model: voices/en_GB-northern_english_male-medium.onnx
    speaker: # default speaker
  onyx:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
  nova:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 57 # 57, 61, 107, 150, 162
  shimmer:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
tts-1-hd:
  alloy:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/alloy.wav
  echo:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/echo.wav # TODO
  fable:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/fable.wav # TODO
  onyx:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/onyx.wav # TODO
  nova:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/nova.wav # TODO
  shimmer:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/shimmer.wav # TODO
BIN voices/alloy.wav Normal file
Binary file not shown.