reorder docker, allow different xtts model versions

matatonic 2024-03-20 12:33:32 -04:00
parent 3fe07873f7
commit b4756dc1db
5 changed files with 26 additions and 23 deletions


@@ -6,13 +6,15 @@ RUN apt-get update && \
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
-COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 # default clone of the default voice is really bad, use a better default
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
+COPY requirements.txt /app/
 RUN pip install -r requirements.txt
+COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh
 ENV COQUI_TOS_AGREED=1
-CMD python main.py --host 0.0.0.0 --port 8000 --preload_xtts
+CMD python main.py --host 0.0.0.0 --port 8000 --preload xtts
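The reorder appears to follow the usual Docker layer-caching pattern: requirements.txt is copied and pip install run before the rest of the sources are copied, so editing the application files no longer invalidates the dependency layer.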


@@ -9,7 +9,6 @@ RUN pip install piper-tts
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
-COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 RUN ./download_voices_tts-1.sh


@@ -1,5 +1,5 @@
 #!/bin/sh
 export COQUI_TOS_AGREED=1
-model="tts_models/multilingual/multi-dataset/xtts_v2"
+model="xtts" # others are possible, ex. xtts_v2.0.2
 python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
 ./download_samples.sh
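Pinning a specific xtts release only means changing the model variable above; a minimal Python equivalent of the script's one-liner, using the version from the comment as an example:

    from TTS.utils.manage import ModelManager
    # "xtts" (as in the script) appears to select the default xtts release;
    # an explicit version string such as "xtts_v2.0.2" pins it instead
    ModelManager().download_model("xtts_v2.0.2")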

main.py

@@ -17,9 +17,10 @@ args = None
 app = FastAPI()
 class xtts_wrapper():
-    def __init__(self, model_name):
+    def __init__(self, model_name, device):
         global args
-        self.xtts = TTS(model_name=model_name, progress_bar=False).to(args.xtts_device)
+        self.model_name = model_name
+        self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
     def tts(self, text, speaker_wav, speed):
         tf, file_path = tempfile.mkstemp(suffix='.wav')
@@ -119,13 +120,14 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.close()
         tts_io_out = tts_proc.stdout
-    # Use xtts_v2 for tts-1-hd
+    # Use xtts for tts-1-hd
     elif model == 'tts-1-hd':
-        if not xtts:
-            xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
-        model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
+        tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
+        if not xtts or xtts.model_name != tts_model:
+            xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+            # XXX probably should GC/torch cleanup here
         # tts speed doesn't seem to work well
         if speed < 0.5:
             speed = speed / 0.5
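The new XXX comment leaves that cleanup as a TODO. A rough sketch of what it could look like when switching model versions, not code from this commit (load_xtts is a hypothetical helper; it assumes xtts is a module-level variable initialised to None and that torch is importable):

    import gc
    import torch

    def load_xtts(tts_model, device):
        # reuse the already-loaded model when the requested version matches
        global xtts
        if xtts is not None and xtts.model_name == tts_model:
            return xtts
        # drop the last reference to the old wrapper and release GPU memory
        xtts = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        xtts = xtts_wrapper(tts_model, device=device)  # xtts_wrapper as defined above
        return xtts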
@@ -151,7 +153,7 @@ if __name__ == "__main__":
     parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
     parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
-    parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
+    parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts'). By default it's loaded on first use.")
     parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
     parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
@@ -160,7 +162,7 @@ if __name__ == "__main__":
     if args.xtts_device != "none":
         from TTS.api import TTS
-        if args.preload_xtts:
-            xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
+        if args.preload:
+            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
-    uvicorn.run(app, host=args.host, port=args.port) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
+    uvicorn.run(app, host=args.host, port=args.port)
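With this change --preload takes a model name instead of acting as a boolean flag; the Dockerfile above now starts the server with --preload xtts, and since the value is passed straight to xtts_wrapper, a pinned version such as xtts_v2.0.2 should work there as well.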


@@ -25,26 +25,26 @@ tts-1:
     speaker: 163
 tts-1-hd:
   alloy:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/alloy-alt.wav
   alloy-orig:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/alloy.wav # it's REALLY BAD
   echo:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/echo.wav
   fable:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/fable.wav
   onyx:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/onyx.wav
   nova:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/nova.wav
   shimmer:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts
     speaker: voices/shimmer.wav
   me:
-    model: tts_models/multilingual/multi-dataset/xtts_v2
+    model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
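map_voice_to_speaker itself is not part of this diff, so the following is only a sketch of how a lookup over this mapping could work; the file name and return shape are assumptions based on how the result is unpacked in main.py above:

    import yaml

    def map_voice_to_speaker(voice, model):
        # e.g. ('xtts_v2.0.2', 'voices/me.wav') for voice 'me' under 'tts-1-hd'
        with open('voice_to_speaker.yaml') as f:  # file name assumed
            config = yaml.safe_load(f)
        entry = config[model][voice]
        return entry['model'], entry['speaker']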