commit 0ca7da80c4 (parent 2fdac61ccb)
Author: matatonic, 2023-11-27 16:57:53 -05:00
7 changed files with 128 additions and 17 deletions

.gitignore (vendored, 2 changes)

@@ -1,3 +1,5 @@
+voices/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

Dockerfile (new file, 20 changes)

@@ -0,0 +1,20 @@
+FROM nvidia/cuda:11.8.0-base-ubuntu22.04
+ENV COQUI_TOS_AGREED=1
+#python3.11
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y python3-pip wget ffmpeg && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+#RUN git clone https://github.com/matatonic/openedai-api-audio-speech /app
+RUN mkdir -p /app/voices
+COPY *.py *.yaml *.txt *.md *.sh /app/
+COPY ./voices/alloy.wav /app/voices/alloy.wav
+WORKDIR /app
+RUN pip install -r requirements.txt
+RUN ./download_voices_tts-1.sh
+RUN ./download_voices_tts-1-hd.sh
+CMD python3 main.py
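
With this Dockerfile both download scripts run at build time, so the piper voices and the xtts_v2 model are baked into the image. A minimal sketch of building and running it (the image tag is illustrative, and a working NVIDIA container toolkit is assumed):

    # build the full image, then serve on port 8000 with GPU access
    docker build -t openedai-speech .
    docker run --rm --gpus all -p 8000:8000 openedai-speech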

Dockerfile.min

@@ -7,11 +7,10 @@ RUN apt-get update && \
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-COPY *.py *.yaml /app/
+COPY *.py *.yaml *.sh /app/
 WORKDIR /app
 RUN pip install piper-tts
-RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
-RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
+RUN ./download_voices_tts-1.sh
 CMD python3 main.py

docker-compose.yml

@@ -3,8 +3,8 @@ services:
   server:
     build:
       context: .
-      dockerfile: Dockerfile.min
-      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
+      #dockerfile: Dockerfile.min
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
     stdin_open: true
     tty: true
     runtime: nvidia
@@ -18,3 +18,5 @@ services:
           capabilities: [gpu]
     ports:
       - "8000:8000"
+    volumes:
+      - .:/app/
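
Pointing the build at the full Dockerfile enables tts-1-hd, and the new bind mount of the working tree into /app lets code changes take effect without rebuilding the image. Typical usage would simply be:

    docker compose up --build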

download_voices_tts-1-hd.sh (new executable file, 5 changes)

@@ -0,0 +1,5 @@
+#!/bin/sh
+export COQUI_TOS_AGREED=1
+tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "Done" --language_idx "en" --speaker_wav voices/alloy.wav --pipe_out | \
+    ffmpeg -f s16le -ar 22050 -ac 1 -i - > /dev/null
+rm -f tts_output.wav
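
The one-word synthesis here is throwaway; the point is its side effect of making Coqui TTS download the xtts_v2 weights (with the license TOS pre-agreed) while the image builds, with ffmpeg merely draining the piped audio. The same script can pre-fetch the model outside Docker:

    # manual pre-fetch from the repo root
    COQUI_TOS_AGREED=1 ./download_voices_tts-1-hd.sh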

download_voices_tts-1.sh (new executable file, 3 changes)

@@ -0,0 +1,3 @@
+#!/bin/sh
+piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
+piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null

main.py (104 changes)

@@ -2,15 +2,83 @@
 import subprocess
 import yaml
 import re
+import io
+import os
+from pathlib import Path
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
+import numpy as np
+import torch
+#import TTS
+from TTS.api import TTS
+from TTS.config import load_config
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+from TTS.utils.audio.numpy_transforms import save_wav
 
 piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough
+xtts_device = 'cuda'
 
 app = FastAPI()
 
+class FakeBufferedIO(io.BytesIO):
+    def __init__(self):
+        self.buffer = self
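+# tts_to_file(pipe_out=...) writes the finished wav to pipe_out.buffer,
+# since pipe_out is normally sys.stdout; aliasing .buffer back to this
+# BytesIO captures the audio bytes in memory instead of a terminal.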
+
+class xtts_wrapper():
+    def __init__(self, model_name):
+        self.xtts = TTS(model_name=model_name, progress_bar=False, gpu=True).to(xtts_device)
+        """
+        vocoder_path, vocoder_config_path = None, None
+        tts_loc = Path(TTS.__file__).parent / '.models.json'
+        manager = ModelManager(tts_loc)
+        model_path, config_path, model_item = manager.download_model(model_name)
+        if not config_path:
+            config_path = os.path.join(model_path, "config.json")
+        #print(model_path, config_path, model_item)
+        #vocoder_path, vocoder_config_path, _ = manager.download_model(model_item["default_vocoder"])
+        self.xtts_synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            #tts_speakers_file=None,
+            #tts_languages_file=None,
+            #vocoder_checkpoint=vocoder_path,
+            #vocoder_config=vocoder_config_path,
+            #encoder_checkpoint="",
+            #encoder_config="",
+            use_cuda=xtts_cuda,
+        )
+        self.use_multi_speaker = hasattr(self.xtts_synthesizer.tts_model, "num_speakers") and (
+            self.xtts_synthesizer.tts_model.num_speakers > 1 or self.xtts_synthesizer.tts_speakers_file is not None
+        )
+        self.speaker_manager = getattr(self.xtts_synthesizer.tts_model, "speaker_manager", None)
+        self.use_multi_language = hasattr(self.xtts_synthesizer.tts_model, "num_languages") and (
+            self.xtts_synthesizer.tts_model.num_languages > 1 or self.xtts_synthesizer.tts_languages_file is not None
+        )
+        self.language_manager = getattr(self.xtts_synthesizer.tts_model, "language_manager", None)
+        """
+
+    def tts(self, text, speaker_wav, speed):
+        io_ret = FakeBufferedIO()
+        file_path = self.xtts.tts_to_file(
+            text,
+            language='en',
+            speaker_wav=speaker_wav,
+            speed=speed,
+            pipe_out=io_ret,
+        )
+        #self.xtts.synthesizer.save_wav(wav, path='tts_output.wav', pipe_out=io_ret)
+        return io_ret
+
+xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+
 def preprocess(raw_input):
     with open('pre_process_map.yaml', 'r') as file:
         pre_process_map = yaml.safe_load(file)
@@ -61,8 +129,7 @@ async def generate_speech(request: GenerateSpeechRequest):
         ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
     #"-hwaccel:auto"
     tts_args = []
-    tts_proc = None
+    tts_io_out = None
 
     if model == 'tts-1':
         piper_model, speaker = map_voice_to_speaker(voice, model)
@@ -76,25 +143,38 @@
         tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
+        tts_io_out = tts_proc.stdout
 
     elif model == 'tts-1-hd':
-        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
-        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
         tts_model, speaker = map_voice_to_speaker(voice, model)
-        tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
-        if speaker:
-            tts_args.extend(["--speaker_wav", str(speaker)])
+        #tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out" ]
+        #if speaker:
+        #    tts_args.extend(["--speaker_wav", str(speaker)])
 
         if speed > 2.0: # tts has a max speed of 2.0
             ffmpeg_args.extend(["-af", "atempo=2.0"])
             speed = min(speed / 2.0, 2.0)
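+            # worked example: a requested speed of 3.0 becomes atempo=2.0
+            # in ffmpeg times 1.5 in the model; above 4.0 it caps at 2.0 * 2.0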
-        if speed != 1.0:
-            tts_args.extend(["--speed", str(speed)])
+        #if speed != 1.0:
+        #    tts_args.extend(["--speed", str(speed)])
+        if speed == 1.0:
+            speed = None
-        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+        # if torch.is_tensor(wav):
+        #     wav = wav.cpu().numpy()
+        # if isinstance(wav, list):
+        #     wav = np.array(wav)
+        #tts_io_out = io.BytesIO()
+        #save_wav(wav, tts_io_out)
+        #tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 
     # Pipe the output from piper to the input of ffmpeg
     ffmpeg_args.extend(["-"])
-    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
-    tts_proc.stdin.close()
+    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
 
     #print(" ".join(tts_args))
     #print(" ".join(ffmpeg_args))