matatonic 2024-04-23 22:07:23 -04:00
parent bd3c7a601a
commit 4d76aca1af
8 changed files with 185 additions and 33 deletions

Dockerfile

@@ -1,10 +1,11 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 ENV COQUI_TOS_AGREED=1
 ENV PRELOAD_MODEL=xtts
+# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
+    apt-get install --no-install-recommends -y curl git ffmpeg
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
@@ -12,7 +13,7 @@ RUN mkdir -p /app/voices
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 COPY *.txt /app/
-RUN pip install -r requirements.txt
+RUN pip install --no-cache -r requirements.txt
 COPY *.sh /app/
 RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh

Dockerfile.min

@@ -1,9 +1,9 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn
+    apt-get install --no-install-recommends -y ffmpeg curl
-RUN pip install piper-tts
+RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices

README.md

@@ -24,6 +24,16 @@ Details:
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
 
+Version: 0.9.0, 2024-04-23
+* Fix bug with yaml and loading UTF-8
+* New sample text-to-speech application `say.py`
+* Smaller docker base image
+* Add beta [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) support (you can describe basic features of the speaker's voice); see https://www.text-description-to-speech.com/ for some examples of how to describe voices. Voices can be defined in `voice_to_speaker.yaml`.
+* 2 example [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) voices are included in the `voice_to_speaker.yaml` file.
+* parler-tts is experimental software and is fairly slow. The exact voice will be slightly different for each generation, but should be similar to the basic description.
+
 Version: 0.8.0, 2024-03-23
 * Cleanup, docs update.
@@ -128,6 +138,14 @@ with client.audio.speech.with_streaming_response.create(
     response.stream_to_file("speech.mp3")
 ```
 
+Also see the `say.py` sample application for an example of how to use the openai-python API.
+
+```
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file
+```
+
 Custom Voices Howto
 -------------------
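
Editor's note: for reference, a minimal sketch (not part of the commit) of requesting one of the new parler-tts voices with the openai-python client, assuming the server is running locally and the `parler` voice defined in `voice_to_speaker.yaml` below:

```python
import openai

client = openai.OpenAI(api_key="sk-ip", base_url="http://localhost:8000/v1")

# "parler" is resolved server-side via voice_to_speaker.yaml; the mapped
# free-text description controls the generated voice.
with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",
    voice="parler",
    input="The quick brown fox jumped over the lazy dog.",
) as response:
    response.stream_to_file("speech.mp3")
```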

docker-compose.yml

@@ -2,8 +2,9 @@ services:
   server:
     build:
       context: .
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
-      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
+      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
+    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
     command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
     #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
     ports:

requirements.txt

@@ -1,5 +1,9 @@
 fastapi
 uvicorn
 # piper-tts
 piper-tts
+onnxruntime-gpu
 # xtts
 TTS
+# parler-tts
+git+https://github.com/huggingface/parler-tts.git

say.py (new executable file, 80 lines)

@@ -0,0 +1,80 @@
#!/usr/bin/env python
import sys
import os
import tempfile
import argparse

try:
    import dotenv
    dotenv.load_dotenv(override=True)
except ImportError:
    pass

try:
    from playsound import playsound
except ImportError:
    playsound = None

import openai

def parse_args(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"])
    parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"])
    parser.add_argument("-s", "--speed", type=float, default=1.0)
    parser.add_argument("-i", "--input", type=str)
    if playsound is None:
        parser.add_argument("-o", "--output", type=str)  # required
        parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
    else:
        parser.add_argument("-o", "--output", type=str, default=None)  # not required
        parser.add_argument("-p", "--playsound", action="store_true")

    args = parser.parse_args(argv)
    return args

if __name__ == "__main__":
    args = parse_args(sys.argv[1:])

    if args.playsound and playsound is None:
        print("playsound module not found, audio will not be played, use -o <filename> to save output to a file. pip install playsound")
        sys.exit(1)

    if not args.playsound and not args.output:
        print("Must select one of playsound (-p) or output file name (-o)")
        sys.exit(1)

    client = openai.OpenAI(
        # This part is not needed if you set these environment variables before importing openai
        # export OPENAI_API_KEY=sk-11111111111
        # export OPENAI_BASE_URL=http://localhost:8000/v1
        api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
        base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
    )

    if args.playsound and args.output is None:
        tf, args.output = tempfile.mkstemp(suffix='.wav')
    else:
        tf = None

    with client.audio.speech.with_streaming_response.create(
        model=args.model,
        voice=args.voice,
        speed=args.speed,
        response_format=args.format,
        input=args.input,
    ) as response:
        response.stream_to_file(args.output)

    if args.playsound:
        playsound(args.output)

    if tf:
        os.unlink(args.output)
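
Editor's note: when `-p` is given without `-o`, say.py streams into a temp file and removes it after playback. A minimal sketch of that pattern (the placeholder bytes stand in for the streamed audio; not part of the commit):

```python
import os
import tempfile

fd, path = tempfile.mkstemp(suffix='.wav')  # fd: open descriptor, path: file name
try:
    with os.fdopen(fd, 'wb') as f:
        f.write(b'...')  # placeholder: say.py streams the TTS response here instead
    # playsound(path) would play the finished file here
finally:
    os.unlink(path)  # remove the temp file afterwards
```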

speech.py

@@ -1,17 +1,26 @@
 #!/usr/bin/env python3
 from pathlib import Path
 import argparse
 import os
 import re
 import subprocess
 import tempfile
 import yaml
 from fastapi import FastAPI
-from fastapi.responses import StreamingResponse, PlainTextResponse
-from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
 
+# for parler
+try:
+    from parler_tts import ParlerTTSForConditionalGeneration
+    from transformers import AutoTokenizer, logging
+    import torch
+    import soundfile as sf
+    logging.set_verbosity_error()
+    has_parler_tts = True
+except ImportError:
+    print("No parler support found")
+    has_parler_tts = False
+
 import openedai
 
 xtts = None
@@ -20,7 +29,6 @@ app = openedai.OpenAIStub()
 class xtts_wrapper():
     def __init__(self, model_name, device):
-        global args
         self.model_name = model_name
 
         self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
@@ -38,9 +46,28 @@ class xtts_wrapper():
         os.unlink(file_path)
 
         return tf
 
+class parler_tts():
+    def __init__(self, model_name, device):
+        self.model_name = model_name
+        self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    def tts(self, text, description):
+        input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
+        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
+
+        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+
+        tf, file_path = tempfile.mkstemp(suffix='.wav')
+        sf.write(file_path, audio_arr, self.model.config.sampling_rate)
+        os.unlink(file_path)
+
+        return tf
+
 # Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
-    with open('pre_process_map.yaml', 'r') as file:
+    with open('pre_process_map.yaml', 'r', encoding='utf8') as file:
         pre_process_map = yaml.safe_load(file)
         for a, b in pre_process_map:
             raw_input = re.sub(a, b, raw_input)
@@ -48,7 +75,7 @@ def preprocess(raw_input):
 
 # Read voice map on demand so it can be changed without restarting the server
 def map_voice_to_speaker(voice: str, model: str):
-    with open('voice_to_speaker.yaml', 'r') as file:
+    with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
         voice_map = yaml.safe_load(file)
 
         return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
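
Editor's note: a minimal illustration (not part of the commit) of why the added `encoding='utf8'` matters. Without an explicit encoding, `open()` falls back to the platform locale's default (e.g. cp1252 on Windows), so YAML files containing non-ASCII voice names or descriptions can raise `UnicodeDecodeError` or load garbled:

```python
import yaml

# Explicit encoding makes UTF-8 content load identically on every platform.
with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
    voice_map = yaml.safe_load(file)
```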
@@ -120,26 +147,38 @@ async def generate_speech(request: GenerateSpeechRequest):
     elif model == 'tts-1-hd':
         tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
-        if not xtts or xtts.model_name != tts_model:
-            if xtts:
-                import torch, gc
-                del xtts
-                gc.collect()
-                torch.cuda.empty_cache()
+        if xtts is not None and xtts.model_name != tts_model:
+            import torch, gc
+            del xtts
+            gc.collect()
+            torch.cuda.empty_cache()
-            xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+        if 'parler-tts' in tts_model and has_parler_tts:
+            if not xtts:
+                xtts = parler_tts(tts_model, device=args.xtts_device)
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
             # tts speed doesn't seem to work well
             if speed < 0.5:
                 speed = speed / 0.5
                 ffmpeg_args.extend(["-af", "atempo=0.5"])
             if speed > 1.0:
                 ffmpeg_args.extend(["-af", f"atempo={speed}"])
                 speed = 1.0
+            if speed != 1:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
-        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+            tts_io_out = xtts.tts(text=input_text, description=speaker)
+        else:
+            if not xtts:
+                xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+            # tts speed doesn't seem to work well
+            if speed < 0.5:
+                speed = speed / 0.5
+                ffmpeg_args.extend(["-af", "atempo=0.5"])
+            if speed > 1.0:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
+                speed = 1.0
+            tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
 
     # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
@@ -165,7 +204,10 @@ if __name__ == "__main__":
     from TTS.api import TTS
 
     if args.preload:
-        xtts = xtts_wrapper(args.preload, device=args.xtts_device)
+        if 'parler-tts' in args.preload:
+            xtts = parler_tts(args.preload, device=args.xtts_device)
+        else:
+            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
 
     app.register_model('tts-1')
     app.register_model('tts-1-hd')
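
Editor's note: the parler-tts generation flow added above, as a self-contained sketch mirroring the `parler_tts` class (model name and description style come from this commit; the device pick is an assumption; not part of the commit):

```python
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"  # assumption: use GPU when available
model_name = "parler-tts/parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The description conditions the voice; the text is the prompt to speak.
description = "A female speaker with a slightly low-pitched voice. She speaks very fast."
text = "The quick brown fox jumped over the lazy dog."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
sf.write("parler.wav", generation.cpu().numpy().squeeze(), model.config.sampling_rate)
```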

voice_to_speaker.yaml

@@ -1,4 +1,4 @@
-tts-1:
+tts-1:
   some_other_voice_name_you_want:
     model: voices/choose your own model.onnx
     speaker: set your own speaker
@@ -48,3 +48,9 @@ tts-1-hd:
   me:
     model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
+  parler:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
+  parler2:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.
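
Editor's note: putting the pieces together, a request for voice "parler" on model "tts-1-hd" resolves through `map_voice_to_speaker()` above roughly like this (a sketch, not part of the commit):

```python
import yaml

with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
    voice_map = yaml.safe_load(file)

entry = voice_map['tts-1-hd']['parler']
print(entry['model'])    # parler-tts/parler_tts_mini_v0.1
print(entry['speaker'])  # the free-text description passed to parler_tts.tts() as 'description'
```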