commit 0ca7da80c4 (parent 2fdac61ccb)
Author: matatonic, 2023-11-27 16:57:53 -05:00
7 changed files with 128 additions and 17 deletions

.gitignore (vendored, 2 changes)

@@ -1,3 +1,5 @@
+voices/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

Dockerfile (new file, 20 changes)

@@ -0,0 +1,20 @@
+FROM nvidia/cuda:11.8.0-base-ubuntu22.04
+ENV COQUI_TOS_AGREED=1
+#python3.11
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y python3-pip wget ffmpeg && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+#RUN git clone https://github.com/matatonic/openedai-api-audio-speech /app
+RUN mkdir -p /app/voices
+COPY *.py *.yaml *.txt *.md *.sh /app/
+COPY ./voices/alloy.wav /app/voices/alloy.wav
+WORKDIR /app
+RUN pip install -r requirements.txt
+RUN ./download_voices_tts-1.sh
+RUN ./download_voices_tts-1-hd.sh
+CMD python3 main.py
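
With this Dockerfile both download scripts run at build time, so the piper voices and the xtts_v2 model are baked into the image. A minimal sketch of building and running it (the image tag is illustrative, and a working NVIDIA container toolkit is assumed):

    # build the full image, then serve on port 8000 with GPU access
    docker build -t openedai-speech .
    docker run --rm --gpus all -p 8000:8000 openedai-speech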

Dockerfile.min

@@ -7,11 +7,10 @@ RUN apt-get update && \
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-COPY *.py *.yaml /app/
+COPY *.py *.yaml *.sh /app/
 WORKDIR /app
 RUN pip install piper-tts
-RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
-RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
+RUN ./download_voices_tts-1.sh
 CMD python3 main.py

docker-compose.yml

@@ -3,8 +3,8 @@ services:
   server:
     build:
       context: .
-      dockerfile: Dockerfile.min
-      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
+      #dockerfile: Dockerfile.min
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
     stdin_open: true
     tty: true
     runtime: nvidia
@@ -18,3 +18,5 @@ services:
           capabilities: [gpu]
     ports:
       - "8000:8000"
+    volumes:
+      - .:/app/
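
Pointing the build at the full Dockerfile enables tts-1-hd, and the new bind mount of the working tree into /app lets code changes take effect without rebuilding the image. Typical usage would simply be:

    docker compose up --build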

download_voices_tts-1-hd.sh (new executable file, 5 changes)

@@ -0,0 +1,5 @@
+#!/bin/sh
+export COQUI_TOS_AGREED=1
+tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "Done" --language_idx "en" --speaker_wav voices/alloy.wav --pipe_out | \
+    ffmpeg -f s16le -ar 22050 -ac 1 -i - > /dev/null
+rm -f tts_output.wav
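
The one-word synthesis here is throwaway; the point is its side effect of making Coqui TTS download the xtts_v2 weights (with the license TOS pre-agreed) while the image builds, with ffmpeg merely draining the piped audio. The same script can pre-fetch the model outside Docker:

    # manual pre-fetch from the repo root
    COQUI_TOS_AGREED=1 ./download_voices_tts-1-hd.sh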

download_voices_tts-1.sh (new executable file, 3 changes)

@@ -0,0 +1,3 @@
+#!/bin/sh
+piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
+piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null

main.py (104 changes)

@@ -2,15 +2,83 @@
 import subprocess
 import yaml
 import re
+import io
+import os
+from pathlib import Path
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
+import numpy as np
+import torch
+#import TTS
+from TTS.api import TTS
+from TTS.config import load_config
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+from TTS.utils.audio.numpy_transforms import save_wav
 
 piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough
+xtts_device = 'cuda'
 
 app = FastAPI()
 
+class FakeBufferedIO(io.BytesIO):
+    def __init__(self):
+        self.buffer = self
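+# tts_to_file(pipe_out=...) writes the finished wav to pipe_out.buffer,
+# since pipe_out is normally sys.stdout; aliasing .buffer back to this
+# BytesIO captures the audio bytes in memory instead of a terminal.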
+
+class xtts_wrapper():
+    def __init__(self, model_name):
+        self.xtts = TTS(model_name=model_name, progress_bar=False, gpu=True).to(xtts_device)
+        """
+        vocoder_path, vocoder_config_path = None, None
+        tts_loc = Path(TTS.__file__).parent / '.models.json'
+        manager = ModelManager(tts_loc)
+        model_path, config_path, model_item = manager.download_model(model_name)
+        if not config_path:
+            config_path = os.path.join(model_path, "config.json")
+        #print(model_path, config_path, model_item)
+        #vocoder_path, vocoder_config_path, _ = manager.download_model(model_item["default_vocoder"])
+        self.xtts_synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            #tts_speakers_file=None,
+            #tts_languages_file=None,
+            #vocoder_checkpoint=vocoder_path,
+            #vocoder_config=vocoder_config_path,
+            #encoder_checkpoint="",
+            #encoder_config="",
+            use_cuda=xtts_cuda,
+        )
+        self.use_multi_speaker = hasattr(self.xtts_synthesizer.tts_model, "num_speakers") and (
+            self.xtts_synthesizer.tts_model.num_speakers > 1 or self.xtts_synthesizer.tts_speakers_file is not None
+        )
+        self.speaker_manager = getattr(self.xtts_synthesizer.tts_model, "speaker_manager", None)
+        self.use_multi_language = hasattr(self.xtts_synthesizer.tts_model, "num_languages") and (
+            self.xtts_synthesizer.tts_model.num_languages > 1 or self.xtts_synthesizer.tts_languages_file is not None
+        )
+        self.language_manager = getattr(self.xtts_synthesizer.tts_model, "language_manager", None)
+        """
+
+    def tts(self, text, speaker_wav, speed):
+        io_ret = FakeBufferedIO()
+        file_path = self.xtts.tts_to_file(
+            text,
+            language='en',
+            speaker_wav=speaker_wav,
+            speed=speed,
+            pipe_out=io_ret,
+        )
+        #self.xtts.synthesizer.save_wav(wav, path='tts_output.wav', pipe_out=io_ret)
+        return io_ret
+
+xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+
 def preprocess(raw_input):
     with open('pre_process_map.yaml', 'r') as file:
         pre_process_map = yaml.safe_load(file)
@@ -61,8 +129,7 @@ async def generate_speech(request: GenerateSpeechRequest):
         ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
     #"-hwaccel:auto"
     tts_args = []
-    tts_proc = None
+    tts_io_out = None
 
     if model == 'tts-1':
         piper_model, speaker = map_voice_to_speaker(voice, model)
@@ -76,25 +143,38 @@
         tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
+        tts_io_out = tts_proc.stdout
 
     elif model == 'tts-1-hd':
-        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
-        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
         tts_model, speaker = map_voice_to_speaker(voice, model)
-        tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
-        if speaker:
-            tts_args.extend(["--speaker_wav", str(speaker)])
+        #tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
+        #if speaker:
+        #    tts_args.extend(["--speaker_wav", str(speaker)])
 
         if speed > 2.0: # tts has a max speed of 2.0
             ffmpeg_args.extend(["-af", "atempo=2.0"])
             speed = min(speed / 2.0, 2.0)
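+            # worked example: a requested speed of 3.0 becomes atempo=2.0
+            # in ffmpeg times 1.5 in the model; above 4.0 it caps at 2.0 * 2.0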
-        if speed != 1.0:
-            tts_args.extend(["--speed", str(speed)])
+        #if speed != 1.0:
+        #    tts_args.extend(["--speed", str(speed)])
+        if speed == 1.0:
+            speed = None
-        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+        # if torch.is_tensor(wav):
+        #     wav = wav.cpu().numpy()
+        # if isinstance(wav, list):
+        #     wav = np.array(wav)
+        #tts_io_out = io.BytesIO()
+        #save_wav(wav, tts_io_out)
+        #tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 
     # Pipe the output from piper to the input of ffmpeg
     ffmpeg_args.extend(["-"])
-    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
-    tts_proc.stdin.close()
+    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
 
     #print(" ".join(tts_args))
     #print(" ".join(ffmpeg_args))