mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00

commit 4d76aca1af (parent bd3c7a601a)
0.9.0
Dockerfile

@@ -1,10 +1,11 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 
 ENV COQUI_TOS_AGREED=1
 ENV PRELOAD_MODEL=xtts
+# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
+    apt-get install --no-install-recommends -y curl git ffmpeg
 
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
@@ -12,7 +13,7 @@ RUN mkdir -p /app/voices
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 COPY *.txt /app/
-RUN pip install -r requirements.txt
+RUN pip install --no-cache -r requirements.txt
 COPY *.sh /app/
 RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh
Dockerfile.min

@@ -1,9 +1,9 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn
+    apt-get install --no-install-recommends -y ffmpeg curl
 
-RUN pip install piper-tts
+RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
 
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
README.md (18 changes)
@@ -24,6 +24,16 @@ Details:
 
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
 
+Version: 0.9.0, 2024-04-23
+
+* Fix bug with yaml and loading UTF-8
+* New sample text-to-speech application `say.py`
+* Smaller docker base image
+* Add beta [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) support (you can describe very basic features of the speaker voice). See https://www.text-description-to-speech.com/ for some examples of how to describe voices. Voices can be defined in `voice_to_speaker.yaml`.
+* 2 example [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) voices are included in the `voice_to_speaker.yaml` file.
+* parler-tts is experimental software and is kind of slow. The exact voice will be slightly different each generation but should be similar to the basic description.
+
+
 Version: 0.8.0, 2024-03-23
 
 * Cleanup, docs update.
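As a rough sketch of how the new parler-tts voices are used from the API (this assumes the server is running locally on port 8000 and that the `parler` voice from this release's `voice_to_speaker.yaml` is defined under `tts-1-hd`):

```python
import openai

# Point the openai-python client at the local openedai-speech server;
# the API key is not checked, so any placeholder works.
client = openai.OpenAI(api_key="sk-ip", base_url="http://localhost:8000/v1")

# "parler" is resolved via voice_to_speaker.yaml to the
# parler-tts/parler_tts_mini_v0.1 model plus a speaker description;
# the generated voice varies slightly on every request.
with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",
    voice="parler",
    response_format="mp3",
    input="The quick brown fox jumped over the lazy dog.",
) as response:
    response.stream_to_file("parler.mp3")
```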
@@ -128,6 +138,14 @@ with client.audio.speech.with_streaming_response.create(
     response.stream_to_file("speech.mp3")
 ```
 
+Also see the `say.py` sample application for an example of how to use the openai-python API.
+
+```
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
+```
+
+
 
 Custom Voices Howto
 -------------------
docker-compose.yml

@@ -2,8 +2,9 @@ services:
   server:
     build:
       context: .
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
-      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
+      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
+    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
     command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
     #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
     ports:
requirements.txt

@@ -1,5 +1,9 @@
 fastapi
 uvicorn
+# piper-tts
 piper-tts
 onnxruntime-gpu
+# xtts
 TTS
+# parler-tts
+git+https://github.com/huggingface/parler-tts.git
say.py (new executable file, 80 lines)
@@ -0,0 +1,80 @@
#!/usr/bin/env python

import sys
import os
import tempfile
import argparse

try:
    import dotenv
    dotenv.load_dotenv(override=True)
except ImportError:
    pass

try:
    from playsound import playsound
except ImportError:
    playsound = None

import openai


def parse_args(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"])
    parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"])
    parser.add_argument("-s", "--speed", type=float, default=1.0)
    parser.add_argument("-i", "--input", type=str)

    if playsound is None:
        parser.add_argument("-o", "--output", type=str) # required
        parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
    else:
        parser.add_argument("-o", "--output", type=str, default=None) # not required
        parser.add_argument("-p", "--playsound", action="store_true")

    args = parser.parse_args(argv)

    return args


if __name__ == "__main__":
    args = parse_args(sys.argv[1:])

    if args.playsound and playsound is None:
        print("playsound module not found, audio will not be played, use -o <filename> to save output to a file. pip install playsound")
        sys.exit(1)

    if not args.playsound and not args.output:
        print("Must select one of playsound (-p) or output file name (-o)")
        sys.exit(1)

    client = openai.OpenAI(
        # This part is not needed if you set these environment variables before import openai
        # export OPENAI_API_KEY=sk-11111111111
        # export OPENAI_BASE_URL=http://localhost:8000/v1
        api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
        base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
    )

    # When playing without an explicit output file, stream into an unlinked temp file.
    if args.playsound and args.output is None:
        tf, args.output = tempfile.mkstemp(suffix='.wav')
    else:
        tf = None

    with client.audio.speech.with_streaming_response.create(
        model=args.model,
        voice=args.voice,
        speed=args.speed,
        response_format=args.format,
        input=args.input,
    ) as response:
        response.stream_to_file(args.output)

    if args.playsound:
        playsound(args.output)

    if tf:
        os.unlink(args.output)
speech.py (90 changes)
@@ -1,17 +1,26 @@
 #!/usr/bin/env python3
-from pathlib import Path
 import argparse
 import os
 import re
 import subprocess
 import tempfile
 import yaml
-from fastapi import FastAPI
-from fastapi.responses import StreamingResponse, PlainTextResponse
-from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
 
+# for parler
+try:
+    from parler_tts import ParlerTTSForConditionalGeneration
+    from transformers import AutoTokenizer, logging
+    import torch
+    import soundfile as sf
+    logging.set_verbosity_error()
+    has_parler_tts = True
+except ImportError:
+    print("No parler support found")
+    has_parler_tts = False
+
 import openedai
 
 xtts = None
@@ -20,7 +29,6 @@ app = openedai.OpenAIStub()
 
 class xtts_wrapper():
     def __init__(self, model_name, device):
-        global args
         self.model_name = model_name
         self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
 
@@ -38,9 +46,28 @@ class xtts_wrapper():
         os.unlink(file_path)
         return tf
 
+class parler_tts():
+    def __init__(self, model_name, device):
+        self.model_name = model_name
+        self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    def tts(self, text, description):
+        input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
+        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
+
+        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+
+        tf, file_path = tempfile.mkstemp(suffix='.wav')
+        sf.write(file_path, audio_arr, self.model.config.sampling_rate)
+        os.unlink(file_path)
+        return tf
+
+
 # Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
-    with open('pre_process_map.yaml', 'r') as file:
+    with open('pre_process_map.yaml', 'r', encoding='utf8') as file:
         pre_process_map = yaml.safe_load(file)
     for a, b in pre_process_map:
         raw_input = re.sub(a, b, raw_input)
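Worth noting for readers of both `say.py` and the new `parler_tts` class above: `tempfile.mkstemp` returns an open file descriptor along with the path, the audio is written through the path, the path is unlinked immediately, and the descriptor is returned, so the data stays readable until the descriptor is closed (POSIX behavior). A minimal sketch of the pattern, separate from the code above:

```python
import os
import tempfile

# mkstemp() returns an open descriptor plus the path of a fresh file.
fd, path = tempfile.mkstemp(suffix=".wav")

# Write through the path, the way sf.write() does in parler_tts.tts().
with open(path, "wb") as f:
    f.write(b"RIFF demo bytes")  # placeholder, stands in for real audio

# Unlinking removes the name, but the inode survives while fd is open,
# so the descriptor can still be handed off and read from.
os.unlink(path)

with os.fdopen(fd, "rb") as anon:
    print(anon.read(4))  # b'RIFF'
```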
@@ -48,7 +75,7 @@ def preprocess(raw_input):
 
 # Read voice map on demand so it can be changed without restarting the server
 def map_voice_to_speaker(voice: str, model: str):
-    with open('voice_to_speaker.yaml', 'r') as file:
+    with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
         voice_map = yaml.safe_load(file)
         return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
 
@@ -120,26 +147,38 @@ async def generate_speech(request: GenerateSpeechRequest):
     elif model == 'tts-1-hd':
         tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
 
-        if not xtts or xtts.model_name != tts_model:
-            if xtts:
-                import torch, gc
-                del xtts
-                gc.collect()
-                torch.cuda.empty_cache()
+        if xtts is not None and xtts.model_name != tts_model:
+            import torch, gc
+            del xtts
+            gc.collect()
+            torch.cuda.empty_cache()
 
-            xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+        if 'parler-tts' in tts_model and has_parler_tts:
+            if not xtts:
+                xtts = parler_tts(tts_model, device=args.xtts_device)
 
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
 
-        # tts speed doesn't seem to work well
-        if speed < 0.5:
-            speed = speed / 0.5
-            ffmpeg_args.extend(["-af", "atempo=0.5"])
-        if speed > 1.0:
-            ffmpeg_args.extend(["-af", f"atempo={speed}"])
-            speed = 1.0
+            if speed != 1:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
 
-        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+            tts_io_out = xtts.tts(text=input_text, description=speaker)
 
+        else:
+            if not xtts:
+                xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+
+            # tts speed doesn't seem to work well
+            if speed < 0.5:
+                speed = speed / 0.5
+                ffmpeg_args.extend(["-af", "atempo=0.5"])
+            if speed > 1.0:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
+                speed = 1.0
+
+            tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+
 
     # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
@@ -165,7 +204,10 @@ if __name__ == "__main__":
     from TTS.api import TTS
 
     if args.preload:
-        xtts = xtts_wrapper(args.preload, device=args.xtts_device)
+        if 'parler-tts' in args.preload:
+            xtts = parler_tts(args.preload, device=args.xtts_device)
+        else:
+            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
 
     app.register_model('tts-1')
     app.register_model('tts-1-hd')
voice_to_speaker.yaml

@@ -1,4 +1,4 @@
-tts-1:
+tts-1:
   some_other_voice_name_you_want:
     model: voices/choose your own model.onnx
     speaker: set your own speaker
@@ -48,3 +48,9 @@ tts-1-hd:
   me:
     model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
+  parler:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
+  parler2:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.
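For reference, resolving one of the new entries works the same way `map_voice_to_speaker()` in speech.py does; a small sketch, assuming the file above is saved as `voice_to_speaker.yaml` in the working directory:

```python
import yaml

# The server re-reads this file on every request, so edits to voices
# take effect without a restart.
with open("voice_to_speaker.yaml", "r", encoding="utf8") as f:
    voice_map = yaml.safe_load(f)

model = voice_map["tts-1-hd"]["parler"]["model"]
speaker = voice_map["tts-1-hd"]["parler"]["speaker"]
print(model)         # parler-tts/parler_tts_mini_v0.1
print(speaker[:40])  # start of the speaker description
```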