mirror of
https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
reorder docker, allow different xtts model versions
This commit is contained in:
parent
3fe07873f7
commit
b4756dc1db
@ -6,13 +6,15 @@ RUN apt-get update && \
|
||||
|
||||
#RUN git clone https://github.com/matatonic/openedai-speech /app
|
||||
RUN mkdir -p /app/voices
|
||||
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
|
||||
# the stock clone of the default voice is really bad, so use a better default
|
||||
COPY voices/alloy-alt.wav /app/voices/
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt /app/
|
||||
RUN pip install -r requirements.txt
|
||||
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
|
||||
|
||||
RUN ./download_voices_tts-1.sh
|
||||
RUN ./download_voices_tts-1-hd.sh
|
||||
|
||||
ENV COQUI_TOS_AGREED=1
|
||||
CMD python main.py --host 0.0.0.0 --port 8000 --preload_xtts
|
||||
CMD python main.py --host 0.0.0.0 --port 8000 --preload xtts
|
||||
|
||||
@ -9,7 +9,6 @@ RUN pip install piper-tts
|
||||
#RUN git clone https://github.com/matatonic/openedai-speech /app
|
||||
RUN mkdir -p /app/voices
|
||||
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
|
||||
COPY voices/alloy-alt.wav /app/voices/
|
||||
WORKDIR /app
|
||||
|
||||
RUN ./download_voices_tts-1.sh
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
export COQUI_TOS_AGREED=1
|
||||
model="tts_models/multilingual/multi-dataset/xtts_v2"
|
||||
model="xtts" # other model versions are possible, e.g. xtts_v2.0.2
|
||||
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
|
||||
./download_samples.sh
|
||||
22
main.py
22
main.py
@ -17,9 +17,10 @@ args = None
|
||||
app = FastAPI()
|
||||
|
||||
class xtts_wrapper():
|
||||
def __init__(self, model_name):
|
||||
def __init__(self, model_name, device):
|
||||
global args
|
||||
self.xtts = TTS(model_name=model_name, progress_bar=False).to(args.xtts_device)
|
||||
self.model_name = model_name
|
||||
self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
|
||||
|
||||
def tts(self, text, speaker_wav, speed):
|
||||
tf, file_path = tempfile.mkstemp(suffix='.wav')
|
||||
@ -119,13 +120,14 @@ async def generate_speech(request: GenerateSpeechRequest):
|
||||
tts_proc.stdin.close()
|
||||
tts_io_out = tts_proc.stdout
|
||||
|
||||
# Use xtts_v2 for tts-1-hd
|
||||
# Use xtts for tts-1-hd
|
||||
elif model == 'tts-1-hd':
|
||||
if not xtts:
|
||||
xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
|
||||
|
||||
tts_model, speaker = model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
|
||||
|
||||
if not xtts or xtts.model_name != tts_model:
|
||||
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
|
||||
# XXX: should probably do GC/torch cleanup here
|
||||
|
||||
# tts speed doesn't seem to work well
|
||||
if speed < 0.5:
|
||||
speed = speed / 0.5
|
||||
@ -151,7 +153,7 @@ if __name__ == "__main__":
|
||||
|
||||
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
|
||||
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
|
||||
parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
|
||||
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts'). By default it's loaded on first use.")
|
||||
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
|
||||
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")
|
||||
|
||||
@ -160,7 +162,7 @@ if __name__ == "__main__":
|
||||
if args.xtts_device != "none":
|
||||
from TTS.api import TTS
|
||||
|
||||
if args.preload_xtts:
|
||||
xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
|
||||
if args.preload:
|
||||
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
|
||||
|
||||
uvicorn.run(app, host=args.host, port=args.port) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")
|
||||
uvicorn.run(app, host=args.host, port=args.port)
|
||||
|
||||
@ -25,26 +25,26 @@ tts-1:
|
||||
speaker: 163
|
||||
tts-1-hd:
|
||||
alloy:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/alloy-alt.wav
|
||||
alloy-orig:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/alloy.wav # it's REALLY BAD
|
||||
echo:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/echo.wav
|
||||
fable:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/fable.wav
|
||||
onyx:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/onyx.wav
|
||||
nova:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/nova.wav
|
||||
shimmer:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts
|
||||
speaker: voices/shimmer.wav
|
||||
me:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
model: xtts_v2.0.2 # you can specify a different xtts version
|
||||
speaker: voices/me.wav # this could be you
|
||||
|
||||
Loading…
Reference in New Issue
Block a user