mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00

xtts wip

This commit is contained in:
parent 2fdac61ccb
commit 0ca7da80c4

.gitignore (vendored), 2 changes
@@ -1,3 +1,5 @@
voices/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

Dockerfile (new file), 20 lines
@@ -0,0 +1,20 @@
FROM nvidia/cuda:11.8.0-base-ubuntu22.04

ENV COQUI_TOS_AGREED=1

#python3.11
RUN apt-get update && \
    apt-get install --no-install-recommends -y python3-pip wget ffmpeg && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

#RUN git clone https://github.com/matatonic/openedai-api-audio-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh /app/
COPY ./voices/alloy.wav /app/voices/alloy.wav
WORKDIR /app
RUN pip install -r requirements.txt

RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh

CMD python3 main.py

Dockerfile.min
@@ -7,11 +7,10 @@ RUN apt-get update && \

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml /app/
COPY *.py *.yaml *.sh /app/
WORKDIR /app
RUN pip install piper-tts

RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
RUN ./download_voices_tts-1.sh

CMD python3 main.py

docker-compose.yml
@@ -3,8 +3,8 @@ services:
  server:
    build:
      context: .
      dockerfile: Dockerfile.min
      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
      #dockerfile: Dockerfile.min
      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
    stdin_open: true
    tty: true
    runtime: nvidia
@@ -18,3 +18,5 @@ services:
              capabilities: [gpu]
    ports:
      - "8000:8000"
    volumes:
      - .:/app/
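
With the compose default switched to the xtts-capable Dockerfile, the server on port 8000 can be exercised end to end. A minimal client sketch, assuming the service exposes an OpenAI-style /v1/audio/speech route with the usual request fields (the route decorator and request schema are not shown in this diff) and that the "alloy" voice maps to voices/alloy.wav:

# Hypothetical client call; route and field names are assumed from the OpenAI
# audio.speech API that this project mirrors, not confirmed by this diff.
import requests

resp = requests.post(
    "http://localhost:8000/v1/audio/speech",   # port 8000 is published by docker-compose
    json={
        "model": "tts-1-hd",                   # xtts_v2 path added in this commit
        "voice": "alloy",                      # assumed to map to voices/alloy.wav
        "input": "Hello from xtts.",
        "response_format": "flac",
    },
    timeout=120,
)
resp.raise_for_status()
with open("speech.flac", "wb") as f:
    f.write(resp.content)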

download_voices_tts-1-hd.sh (new executable file), 5 lines
@@ -0,0 +1,5 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "Done" --language_idx "en" --speaker_wav voices/alloy.wav --pipe_out | \
    ffmpeg -f s16le -ar 22050 -ac 1 -i - > /dev/null
rm -f tts_output.wav
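
The script above only exists to make Coqui download and cache xtts_v2 at image build time. A rough Python equivalent through the same TTS.api interface that main.py imports (a sketch only; it accepts the Coqui license via the documented environment variable and writes a throwaway file instead of piping):

import os
os.environ["COQUI_TOS_AGREED"] = "1"   # accept the Coqui model license non-interactively

from TTS.api import TTS

# The model is downloaded on first construction; later runs hit the local cache.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
tts.tts_to_file(
    text="Done",
    language="en",
    speaker_wav="voices/alloy.wav",
    file_path="/tmp/warmup.wav",       # throwaway output, like the discarded tts_output.wav
)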

download_voices_tts-1.sh (new executable file), 3 lines
@@ -0,0 +1,3 @@
#!/bin/sh
piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
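
These piper calls only pre-download the voice models; at request time main.py (below) pipes piper's stdout into ffmpeg. A minimal sketch of that pipe, assuming piper's --output-raw flag for raw 16-bit mono PCM on stdout and the 22.05 kHz rate of these medium voices; the real argument list lives in a part of main.py not shown in this diff:

# Sketch of the piper -> ffmpeg pipe used by the tts-1 path (flags are assumptions).
import subprocess

text = "Hello from piper."
piper_args = [
    "piper", "--data-dir", "voices", "--download-dir", "voices",
    "--model", "en_US-libritts_r-medium",
    "--output-raw",                      # raw s16le PCM on stdout (assumed flag)
]
ffmpeg_args = [
    "ffmpeg", "-loglevel", "error",
    "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-",   # medium voices are 22.05 kHz
    "-f", "mp3", "-",                    # encode to mp3 on stdout
]

piper = subprocess.Popen(piper_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
ffmpeg = subprocess.Popen(ffmpeg_args, stdin=piper.stdout, stdout=subprocess.PIPE)
piper.stdin.write(text.encode("utf-8"))
piper.stdin.close()

with open("hello.mp3", "wb") as f:
    f.write(ffmpeg.stdout.read())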

main.py, 104 changes
@@ -2,15 +2,83 @@
import subprocess
import yaml
import re
import io
import os
from pathlib import Path
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel
import numpy as np
import torch
#import TTS
from TTS.api import TTS
from TTS.config import load_config
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
from TTS.utils.audio.numpy_transforms import save_wav

piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough
xtts_device = 'cuda'

app = FastAPI()

class FakeBufferedIO(io.BytesIO):
    # in-memory stand-in for sys.stdout: Coqui's pipe_out writes to .buffer,
    # so aliasing .buffer to the BytesIO itself captures the WAV bytes in memory
    def __init__(self):
        self.buffer = self

class xtts_wrapper():
    def __init__(self, model_name):

        self.xtts = TTS(model_name=model_name, progress_bar=False, gpu=True).to(xtts_device)
        """
        vocoder_path, vocoder_config_path = None, None
        tts_loc = Path(TTS.__file__).parent / '.models.json'
        manager = ModelManager(tts_loc)
        model_path, config_path, model_item = manager.download_model(model_name)
        if not config_path:
            config_path = os.path.join(model_path, "config.json")
        #print(model_path, config_path, model_item)
        #vocoder_path, vocoder_config_path, _ = manager.download_model(model_item["default_vocoder"])

        self.xtts_synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            #tts_speakers_file=None,
            #tts_languages_file=None,
            #vocoder_checkpoint=vocoder_path,
            #vocoder_config=vocoder_config_path,
            #encoder_checkpoint="",
            #encoder_config="",
            use_cuda=xtts_cuda,
        )

        self.use_multi_speaker = hasattr(self.xtts_synthesizer.tts_model, "num_speakers") and (
            self.xtts_synthesizer.tts_model.num_speakers > 1 or self.xtts_synthesizer.tts_speakers_file is not None
        )
        self.speaker_manager = getattr(self.xtts_synthesizer.tts_model, "speaker_manager", None)

        self.use_multi_language = hasattr(self.xtts_synthesizer.tts_model, "num_languages") and (
            self.xtts_synthesizer.tts_model.num_languages > 1 or self.xtts_synthesizer.tts_languages_file is not None
        )
        self.language_manager = getattr(self.xtts_synthesizer.tts_model, "language_manager", None)
        """

    def tts(self, text, speaker_wav, speed):
        # render speech and collect the WAV bytes in memory via pipe_out
        io_ret = FakeBufferedIO()
        file_path = self.xtts.tts_to_file(
            text,
            language='en',
            speaker_wav=speaker_wav,
            speed=speed,
            pipe_out=io_ret,
        )

        #self.xtts.synthesizer.save_wav(wav, path='tts_output.wav', pipe_out=io_ret)
        return io_ret

xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")

def preprocess(raw_input):
    with open('pre_process_map.yaml', 'r') as file:
        pre_process_map = yaml.safe_load(file)
@@ -61,8 +129,7 @@ async def generate_speech(request: GenerateSpeechRequest):
        ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
        #"-hwaccel:auto"

    tts_args = []
    tts_proc = None
    tts_io_out = None

    if model == 'tts-1':
        piper_model, speaker = map_voice_to_speaker(voice, model)
@@ -76,25 +143,38 @@

        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
        tts_proc.stdin.close()
        tts_io_out = tts_proc.stdout

    elif model == 'tts-1-hd':
        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
        tts_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
        if speaker:
            tts_args.extend(["--speaker_wav", str(speaker)])

        #tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
        #if speaker:
        #    tts_args.extend(["--speaker_wav", str(speaker)])
        if speed > 2.0: # tts has a max speed of 2.0
            ffmpeg_args.extend(["-af", "atempo=2.0"])
            speed = min(speed / 2.0, 2.0)
        if speed != 1.0:
            tts_args.extend(["--speed", str(speed)])
        #if speed != 1.0:
        #    tts_args.extend(["--speed", str(speed)])
        if speed == 1.0:
            speed = None

        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)

        # if torch.is_tensor(wav):
        #     wav = wav.cpu().numpy()
        # if isinstance(wav, list):
        #     wav = np.array(wav)

        #tts_io_out = io.BytesIO()
        #save_wav(wav, tts_io_out)

        #tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    # Pipe the output from piper to the input of ffmpeg
    ffmpeg_args.extend(["-"])
    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
    tts_proc.stdin.close()
    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)

    #print(" ".join(tts_args))
    #print(" ".join(ffmpeg_args))
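
One loose end in the wip hunk above: xtts.tts() returns an in-memory BytesIO, but subprocess.Popen(stdin=...) needs a real file descriptor, so the buffer cannot be handed to ffmpeg directly. A sketch of one way to close that gap (hypothetical helper, not the author's code): write the captured WAV bytes into an ffmpeg pipe from a feeder thread and stream the encoder output back to the client.

# Hypothetical glue for the tts-1-hd path: feed WAV bytes from the FakeBufferedIO
# into ffmpeg via a real pipe and stream the encoded audio out of the endpoint.
import subprocess
import threading
from fastapi.responses import StreamingResponse

def stream_through_ffmpeg(wav_bytes: bytes, ffmpeg_args: list[str], media_type: str = "audio/flac"):
    proc = subprocess.Popen(ffmpeg_args + ["-"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def feed():
        try:
            proc.stdin.write(wav_bytes)
        finally:
            proc.stdin.close()          # let ffmpeg see EOF and flush its output

    threading.Thread(target=feed, daemon=True).start()

    def chunks():
        while True:
            chunk = proc.stdout.read(4096)
            if not chunk:
                break
            yield chunk

    return StreamingResponse(chunks(), media_type=media_type)

# usage inside generate_speech(), roughly:
#   wav_buf = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
#   return stream_through_ffmpeg(wav_buf.getvalue(), ffmpeg_args)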