0.18.0 - Allow multiple samples in xtts. Closes: #38

matatonic 2024-08-15 17:19:34 -04:00
parent b5d0dafd7d
commit e815ef2860
2 changed files with 41 additions and 3 deletions

View File

@ -30,6 +30,10 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
## Recent Changes
Version 0.18.0, 2024-08-15
* Allow folders of wav samples in xtts. Samples will be combined, allowing for mixed voices and collections of small samples. Still limited to 30 seconds total. Thanks @nathanhere.
Version 0.17.3, 2024-08-15
* fix fr_FR-tom-medium and other 44khz piper voices (detect non-default sample rates)
@ -308,6 +312,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
* low noise (no hiss or hum)
* No partial words, breathing, laughing, music or background sounds
* An even speaking pace with a variety of words is best, like in interviews or audiobooks.
* Audio longer than 30 seconds will be silently truncated.
You can use FFmpeg to prepare your audio files; here are some examples:
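For instance, a command along these lines (file and folder names are only an illustration) converts a recording to a mono 22.05 kHz WAV and keeps just the first 30 seconds:

```bash
# mono (-ac 1), 22.05 kHz sample rate (-ar 22050), first 30 seconds only (-t 30)
ffmpeg -i interview.mp3 -ac 1 -ar 22050 -t 30 voices/me.wav
```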
@ -332,6 +337,20 @@ tts-1-hd:
    speaker: voices/me.wav # this could be you
```
You can also use a subfolder containing multiple audio samples, either to combine several small samples or to mix different voices together.
For example:
```yaml
...
tts-1-hd:
  mixed:
    model: xtts
    speaker: voices/mixed
```
Where the `voices/mixed/` folder contains multiple wav files. The total audio length is still limited to 30 seconds.
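For instance, the folder might be laid out like this (file names are just an illustration); every wav file inside is used together as the speaker sample:

```
voices/mixed/
├── narrator_a.wav
├── narrator_b.wav
└── short_clip.wav
```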
## Multilingual
Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with Piper, simply download a language-specific voice.
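As a rough sketch, a multilingual request against a local server could look like the following, using the standard OpenAI Python client. The base URL, port, API key and voice name are assumptions, and it presumes the server resolves the target language for XTTS from the request text:

```python
from openai import OpenAI

# Point the regular OpenAI client at a local openedai-speech instance
# (URL, port and API key are assumptions; adjust to your deployment).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-not-needed")

# French text spoken by a cloned XTTS voice ("mixed" is the hypothetical
# voice from the yaml example above).
with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",
    voice="mixed",
    input="Bonjour tout le monde, ceci est un test.",
) as response:
    response.stream_to_file("bonjour.mp3")
```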

View File

@ -85,7 +85,7 @@ class xtts_wrapper():
        self.timer.daemon = True
        self.timer.start()

    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
    def tts(self, text, language, audio_path, **hf_generate_kwargs):
        with torch.no_grad():
            self.last_used = time.time()
            tokens = 0
@ -93,7 +93,7 @@ class xtts_wrapper():
            with self.lock:
                logger.debug(f"generating [{language}]: {[text]}")
                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=audio_path) # not worth caching calls, it's < 0.001s after model is loaded
                pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                self.last_used = time.time()
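With this change the wrapper takes a list of sample paths and hands it straight to `get_conditioning_latents`. A minimal sketch of what a call site might look like (the paths, text and `xtts` instance are hypothetical):

```python
# one sample, wrapped in a one-element list
for chunk in xtts.tts(text="Hello there.", language="en",
                      audio_path=["voices/me.wav"]):
    ...  # consume PCM chunks

# several short samples combined into a single cloned voice
for chunk in xtts.tts(text="Hello there.", language="en",
                      audio_path=["voices/mixed/narrator_a.wav",
                                  "voices/mixed/narrator_b.wav"]):
    ...
```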
@ -319,6 +319,21 @@ async def generate_speech(request: GenerateSpeechRequest):
    in_q = queue.Queue() # speech pcm
    ex_q = queue.Queue() # exceptions

    def get_speaker_samples(samples: str) -> list[str]:
        if os.path.isfile(samples):
            audio_path = [samples]
        elif os.path.isdir(samples):
            audio_path = [os.path.join(samples, sample) for sample in os.listdir(samples) if os.path.isfile(os.path.join(samples, sample))]
            if len(audio_path) < 1:
                logger.error(f"No files found: {samples}")
                raise ServiceUnavailableError(f"Invalid path: {samples}")
        else:
            logger.error(f"Invalid path: {samples}")
            raise ServiceUnavailableError(f"Invalid path: {samples}")

        return audio_path
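    # For illustration only (hypothetical file layout), get_speaker_samples() maps:
    #   "voices/me.wav" -> ["voices/me.wav"]
    #   "voices/mixed"  -> ["voices/mixed/a.wav", "voices/mixed/b.wav", ...]  (listing order not guaranteed)
    #   anything else   -> raises ServiceUnavailableError
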
    def exception_check(exq: queue.Queue):
        try:
            e = exq.get_nowait()
@ -329,9 +344,13 @@ async def generate_speech(request: GenerateSpeechRequest):
    def generator():
        # text -> in_q
        audio_path = get_speaker_samples(speaker)
        logger.debug(f"{voice} wav samples: {audio_path}")

        try:
            for text in all_text:
                for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
                for chunk in xtts.tts(text=text, language=language, audio_path=audio_path, **hf_generate_kwargs):
                    exception_check(ex_q)
                    in_q.put(chunk)