mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
0.14.0 +streaming, +pcm, +wav, +temp, top_p, etc.
This commit is contained in:
parent
65c03e3448
commit
ae6a384e75
README.md (30 lines changed)
@@ -10,7 +10,7 @@ An OpenAI API compatible text to speech server.
 Full Compatibility:
 * `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
 * `tts-1-hd`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable, uses OpenAI samples by default)
-* response_format: `mp3`, `opus`, `aac`, or `flac`
+* response_format: `mp3`, `opus`, `aac`, `flac`, `wav` and `pcm`
 * speed 0.25-4.0 (and more)
 
 Details:
@@ -20,6 +20,8 @@ Details:
 * Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
 * 🌐 [Multilingual](#multilingual) support with XTTS voices
 * [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
+* Configurable [generation parameters](#generation-parameters)
+* Streamed output while generating
 * Occasionally, certain words or symbols may sound incorrect, you can fix them with regex via `pre_process_map.yaml`
 
 
@@ -27,6 +29,14 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
 
 ## Recent Changes
 
+Version 0.14.0, 2024-06-26
+
+* Added `response_format`: `wav` and `pcm` support
+* Output streaming (while generating) for `tts-1` and `tts-1-hd`
+* Enhanced [generation parameters](#generation-parameters) for xtts models (temperature, top_p, etc.)
+* Idle unload timer (optional) - doesn't work perfectly yet
+* Improved error handling
+
 Version 0.13.0, 2024-06-25
 
 * Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
@@ -313,3 +323,21 @@ tts-1-hd:
     model_path: voices/halo
 ```
 3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
+
+## Generation Parameters
+
+The generation of XTTSv2 voices can be fine tuned with the following options (defaults included below):
+
+```yaml
+tts-1-hd:
+  alloy:
+    model: xtts
+    speaker: voices/alloy.wav
+    enable_text_splitting: True
+    length_penalty: 1.0
+    repetition_penalty: 10
+    speed: 1.0
+    temperature: 0.75
+    top_k: 50
+    top_p: 0.85
+```
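For context (not part of the diff): a minimal client sketch exercising the new streamed `pcm` output. It assumes the server listens on `localhost:8000` (the default port in speech.py) and serves the OpenAI-style `/v1/audio/speech` route this project emulates; adjust the URL, voice, and parameters to your setup.

```python
# Minimal sketch, not from this commit: stream raw PCM as it is generated.
import requests

resp = requests.post(
    "http://localhost:8000/v1/audio/speech",   # assumed endpoint/port
    json={
        "model": "tts-1-hd",            # xtts path, 24 kHz output
        "voice": "alloy",
        "input": "Streaming means playback can start before synthesis finishes.",
        "response_format": "pcm",       # new in 0.14.0, along with "wav"
        "speed": 1.0,
    },
    stream=True,                        # read the body as it arrives
)
resp.raise_for_status()

with open("out.pcm", "wb") as f:        # s16le mono; convert or play with ffmpeg/ffplay
    for chunk in resp.iter_content(chunk_size=4096):
        f.write(chunk)
```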
@@ -2,10 +2,7 @@
 set COQUI_TOS_AGREED=1
 set TTS_HOME=voices
 
-set MODELS=%*
-if "%MODELS%" == "" set MODELS=xtts
-
-for %%i in (%MODELS%) do (
+for %%i in (%*) do (
     python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('%%i')"
 )
 call download_samples.bat
@@ -2,8 +2,7 @@
 export COQUI_TOS_AGREED=1
 export TTS_HOME=voices
 
-MODELS=${*:-xtts}
-for model in $MODELS; do
+for model in $*; do
     python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
 done
 ./download_samples.sh
@@ -4,7 +4,9 @@ loguru
 # piper-tts
 piper-tts==1.2.0
 # xtts
-TTS
+TTS==0.22.0
+# https://github.com/huggingface/transformers/issues/31040
+transformers<4.41.0
 # XXX, 3.8+ has some issue for now
 spacy==3.7.4
 
@@ -4,7 +4,9 @@ loguru
 # piper-tts
 piper-tts==1.2.0
 # xtts
-TTS
+TTS==0.22.0
+# https://github.com/huggingface/transformers/issues/31040
+transformers<4.41.0
 # XXX, 3.8+ has some issue for now
 spacy==3.7.4
 
@@ -2,5 +2,5 @@ TTS_HOME=voices
 HF_HOME=voices
 #PRELOAD_MODEL=xtts
 #PRELOAD_MODEL=xtts_v2.0.2
-#EXTRA_ARGS=--log-level DEBUG
+#EXTRA_ARGS=--log-level DEBUG --unload-timer 300
 #USE_ROCM=1
speech.py (219 lines changed)
@@ -1,51 +1,105 @@
 #!/usr/bin/env python3
 import argparse
 import os
-import sys
+import gc
 import re
 import subprocess
-import tempfile
+import sys
+import threading
+import time
 import yaml
-from fastapi.responses import StreamingResponse
-import uvicorn
-from pydantic import BaseModel
-from loguru import logger
+import contextlib
 
+from fastapi.responses import StreamingResponse
+from loguru import logger
+from pydantic import BaseModel
+import uvicorn
 from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError
 
 
+@contextlib.asynccontextmanager
+async def lifespan(app):
+    yield
+    gc.collect()
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+    except:
+        pass
+
+app = OpenAIStub(lifespan=lifespan)
 xtts = None
 args = None
-app = OpenAIStub()
+
+def unload_model():
+    import torch, gc
+    global xtts
+    if xtts:
+        logger.info("Unloading model")
+        xtts.xtts.to('cpu') # this was required to free up GPU memory...
+        del xtts
+        xtts = None
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
 
 class xtts_wrapper():
-    def __init__(self, model_name, device, model_path=None):
+    check_interval: int = 1
+
+    def __init__(self, model_name, device, model_path=None, unload_timer=None):
         self.model_name = model_name
+        self.unload_timer = unload_timer
+        self.last_used = time.time()
+        self.timer = None
+        self.lock = threading.Lock()
 
         logger.info(f"Loading model {self.model_name} to {device}")
 
-        if model_path: # custom model # and config_path
-            config_path=os.path.join(model_path, 'config.json')
-            self.xtts = TTS(model_path=model_path, config_path=config_path).to(device)
-        else:
-            self.xtts = TTS(model_name=model_name).to(device)
-
-    def tts(self, text, speaker_wav, speed, language):
-        tf, file_path = tempfile.mkstemp(suffix='.wav', prefix='openedai-speech-')
+        if model_path is None:
+            model_path = ModelManager().download_model(model_name)[0]
+
+        config_path = os.path.join(model_path, 'config.json')
+        config = XttsConfig()
+        config.load_json(config_path)
+        self.xtts = Xtts.init_from_config(config)
+        self.xtts.load_checkpoint(config, checkpoint_dir=model_path, use_deepspeed=False) # XXX there are no prebuilt deepspeed wheels??
+        self.xtts = self.xtts.to(device=device)
+        self.xtts.eval()
 
+        if self.unload_timer:
+            logger.info(f"Setting unload timer to {self.unload_timer} seconds")
+            self.not_idle()
+            self.check_idle()
+
+    def not_idle(self):
+        with self.lock:
+            self.last_used = time.time()
+
+    def check_idle(self):
+        with self.lock:
+            if time.time() - self.last_used >= self.unload_timer:
+                print("Unloading TTS model due to inactivity")
+                unload_model()
+            else:
+                # Reschedule the check
+                self.timer = threading.Timer(self.check_interval, self.check_idle)
+                self.timer.daemon = True
+                self.timer.start()
+
+    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
+        self.not_idle()
         try:
-            # TODO: support speaker= as voice id instead of just wav
-            file_path = self.xtts.tts_to_file(
-                text=text,
-                language=language,
-                speaker_wav=speaker_wav,
-                speed=speed,
-                file_path=file_path,
-            )
+            with torch.no_grad():
+                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav
+
+                for wav in self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs):
+                    yield wav.cpu().numpy().tobytes() # assumes wav data is f32le
+                    self.not_idle()
 
         finally:
-            os.unlink(file_path)
-
-        return tf
+            self.not_idle()
 
 def default_exists(filename: str):
     if not os.path.exists(filename):
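Aside (not part of the diff): the new idle-unload logic boils down to a self-rescheduling `threading.Timer`. A stripped-down sketch of the same pattern, with hypothetical names, for readers skimming the hunk above:

```python
# Sketch (hypothetical names) of the idle-unload pattern used by xtts_wrapper:
# a daemon threading.Timer re-arms itself every check_interval seconds and calls
# an unload callback once nothing has touched the model for `timeout` seconds.
import threading
import time

class IdleWatchdog:
    def __init__(self, timeout, on_idle, check_interval=1):
        self.timeout = timeout
        self.on_idle = on_idle
        self.check_interval = check_interval
        self.lock = threading.Lock()
        self.touch()
        self._check()

    def touch(self):  # call on every request, like not_idle()
        with self.lock:
            self.last_used = time.time()

    def _check(self):
        with self.lock:
            idle = time.time() - self.last_used >= self.timeout
        if idle:
            self.on_idle()  # e.g. unload the model and free GPU memory
        else:
            timer = threading.Timer(self.check_interval, self._check)
            timer.daemon = True
            timer.start()
```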
@@ -92,10 +146,10 @@ class GenerateSpeechRequest(BaseModel):
 
 def build_ffmpeg_args(response_format, input_format, sample_rate):
     # Convert the output to the desired format using ffmpeg
-    if input_format == 'raw':
-        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", sample_rate, "-ac", "1", "-i", "-"]
-    else:
+    if input_format == 'WAV':
         ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "WAV", "-i", "-"]
+    else:
+        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", input_format, "-ar", sample_rate, "-ac", "1", "-i", "-"]
 
     if response_format == "mp3":
         ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
@@ -105,6 +159,10 @@ def build_ffmpeg_args(response_format, input_format, sample_rate):
         ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
     elif response_format == "flac":
         ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
+    elif response_format == "wav":
+        ffmpeg_args.extend(["-f", "wav", "-c:a", "pcm_s16le"])
+    elif response_format == "pcm": # even though pcm is technically 'raw', we still use ffmpeg to adjust the speed
+        ffmpeg_args.extend(["-f", "s16le", "-c:a", "pcm_s16le"])
 
     return ffmpeg_args
 
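To make the new branches concrete (illustration only, not in the commit): for the xtts path, which feeds ffmpeg 24 kHz float32 samples (`input_format="f32le"`, see the later hunk), a `pcm` request assembles roughly this argv:

```python
# Illustration only: the argv build_ffmpeg_args() produces for the xtts path
# (raw f32le at 24 kHz in) when the client asks for response_format="pcm".
expected_args = [
    "ffmpeg", "-loglevel", "error",
    "-f", "f32le", "-ar", "24000", "-ac", "1", "-i", "-",  # raw float32 from inference_stream on stdin
    "-f", "s16le", "-c:a", "pcm_s16le",                    # pcm output; atempo filters may be appended for speed
    "-",                                                    # encoded audio to stdout
]
```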
@@ -121,18 +179,27 @@ async def generate_speech(request: GenerateSpeechRequest):
 
     model = request.model
     voice = request.voice
-    response_format = request.response_format
+    response_format = request.response_format.lower()
     speed = request.speed
 
     # Set the Content-Type header based on the requested format
     if response_format == "mp3":
         media_type = "audio/mpeg"
     elif response_format == "opus":
-        media_type = "audio/ogg;codecs=opus"
+        media_type = "audio/ogg;codec=opus" # codecs?
     elif response_format == "aac":
         media_type = "audio/aac"
     elif response_format == "flac":
         media_type = "audio/x-flac"
+    elif response_format == "wav":
+        media_type = "audio/wav"
+    elif response_format == "pcm":
+        if model == 'tts-1': # piper
+            media_type = "audio/pcm;rate=22050"
+        elif model == 'tts-1-hd':
+            media_type = "audio/pcm;rate=24000"
+    else:
+        BadRequestError(f"Invalid response_format: '{response_format}'", param='response_format')
 
     ffmpeg_args = None
     tts_io_out = None
@@ -158,51 +225,77 @@
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
         tts_io_out = tts_proc.stdout
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="raw", sample_rate="22050")
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate="22050")
+
+        # Pipe the output from piper/xtts to the input of ffmpeg
+        ffmpeg_args.extend(["-"])
+        ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
+
+        return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
 
     # Use xtts for tts-1-hd
     elif model == 'tts-1-hd':
         voice_map = map_voice_to_speaker(voice, 'tts-1-hd')
         try:
-            tts_model = voice_map['model']
-            speaker = voice_map['speaker']
+            tts_model = voice_map.pop('model')
+            speaker = voice_map.pop('speaker')
 
         except KeyError as e:
             raise ServiceUnavailableError(f"Configuration error: tts-1-hd voice '{voice}' is missing setting. KeyError: {e}")
 
-        language = voice_map.get('language', 'en')
-        tts_model_path = voice_map.get('model_path', None)
+        if xtts and xtts.model_name != tts_model:
+            unload_model()
 
-        if xtts is not None and xtts.model_name != tts_model:
-            import torch, gc
-            del xtts
-            xtts = None
-            gc.collect()
-            torch.cuda.empty_cache()
+        tts_model_path = voice_map.pop('model_path', None) # XXX changing this on the fly is ignored if you keep the same name
 
-        else:
-            if xtts is None:
-                xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path)
+        if xtts is None:
+            xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path, unload_timer=args.unload_timer)
 
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="f32le", sample_rate="24000")
 
         # tts speed doesn't seem to work well
+        speed = voice_map.pop('speed', speed)
         if speed < 0.5:
             speed = speed / 0.5
             ffmpeg_args.extend(["-af", "atempo=0.5"])
         if speed > 1.0:
             ffmpeg_args.extend(["-af", f"atempo={speed}"])
             speed = 1.0
 
-        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed, language=language)
+        language = voice_map.pop('language', 'en')
+
+        comment = voice_map.pop('comment', None) # ignored.
+
+        hf_generate_kwargs = dict(
+            speed=speed,
+            **voice_map,
+        )
+
+        hf_generate_kwargs['enable_text_splitting'] = hf_generate_kwargs.get('enable_text_splitting', True) # change the default to true
+
+        # Pipe the output from piper/xtts to the input of ffmpeg
+        ffmpeg_args.extend(["-"])
+        ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+
+        def generator():
+            try:
+                for chunk in xtts.tts(text=input_text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
+                    ffmpeg_proc.stdin.write(chunk)
+
+            except Exception as e:
+                logger.error(f"Exception: {repr(e)}")
+                raise e
+
+            finally:
+                ffmpeg_proc.stdin.close()
+
+        worker = threading.Thread(target=generator)
+        worker.daemon = True
+        worker.start()
+
+        return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
     else:
         raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')
 
-    # Pipe the output from piper/xtts to the input of ffmpeg
-    ffmpeg_args.extend(["-"])
-    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
-
-    return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
-
 # We return 'mps' but currently XTTS will not work with mps devices as the cuda support is incomplete
 def auto_torch_device():
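A distilled view (not from the commit) of the producer/consumer layout introduced above: a daemon thread feeds synthesized chunks into ffmpeg's stdin while the HTTP response streams ffmpeg's stdout, so encoding and delivery begin before synthesis finishes.

```python
# Sketch of the streaming layout used in the tts-1-hd branch above (names are illustrative).
import subprocess
import threading

def stream_through_ffmpeg(chunks, ffmpeg_args):
    proc = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def feed():
        try:
            for chunk in chunks:       # e.g. the xtts_wrapper.tts(...) generator
                proc.stdin.write(chunk)
        finally:
            proc.stdin.close()         # EOF lets ffmpeg flush its encoder and exit

    threading.Thread(target=feed, daemon=True).start()
    return proc.stdout                 # hand this to StreamingResponse(content=...)
```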
@@ -220,6 +313,7 @@ if __name__ == "__main__":
 
     parser.add_argument('--xtts_device', action='store', default=auto_torch_device(), help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
     parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
+    parser.add_argument('--unload-timer', action='store', default=None, type=int, help="Idle unload timer for the XTTS model in seconds")
     parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
    parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
     parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")
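With the new flag in place, an invocation along the lines of `python speech.py --preload xtts --unload-timer 300` (mirroring the commented `EXTRA_ARGS` example in the env sample above) would free the XTTS model after five minutes without requests.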
@@ -233,10 +327,13 @@ if __name__ == "__main__":
     logger.add(sink=sys.stderr, level=args.log_level)
 
     if args.xtts_device != "none":
-        from TTS.api import TTS
+        import torch
+        from TTS.tts.configs.xtts_config import XttsConfig
+        from TTS.tts.models.xtts import Xtts
+        from TTS.utils.manage import ModelManager
 
     if args.preload:
-        xtts = xtts_wrapper(args.preload, device=args.xtts_device)
+        xtts = xtts_wrapper(args.preload, device=args.xtts_device, unload_timer=args.unload_timer)
 
     app.register_model('tts-1')
     app.register_model('tts-1-hd')
@@ -48,3 +48,11 @@ tts-1-hd:
   me:
     model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
+    enable_text_splitting: True
+    length_penalty: 1.0
+    repetition_penalty: 10
+    speed: 1.0
+    temperature: 0.75
+    top_k: 50
+    top_p: 0.85
+    comment: You can add a comment here also, which will be persistent and otherwise ignored.