mirror of https://github.com/matatonic/openedai-speech, synced 2025-06-26 18:16:32 +00:00
0.18.0 - Allow multiple samples in xtts. Closes: #38

parent b5d0dafd7d
commit e815ef2860

README.md (19 lines changed)
@@ -30,6 +30,10 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
 
 ## Recent Changes
 
+Version 0.18.0, 2024-08-15
+
+* Allow folders of wav samples in xtts. Samples will be combined, allowing for mixed voices and collections of small samples. Still limited to 30 seconds total. Thanks @nathanhere.
+
 Version 0.17.3, 2024-08-15
 
 * fix fr_FR-tom-medium and other 44khz piper voices (detect non-default sample rates)
@@ -308,6 +312,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
 * low noise (no hiss or hum)
 * No partial words, breathing, laughing, music or background sounds
 * An even speaking pace with a variety of words is best, like in interviews or audiobooks.
+* Audio longer than 30 seconds will be silently truncated.
 
 You can use FFmpeg to prepare your audio files; here are some examples:
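(The README's actual FFmpeg commands fall outside this hunk's context, so they are not shown here. As an illustration only, a helper along these lines would satisfy the requirements above: mono output, a modest sample rate, and a hard 30 second cap. The helper name and paths are hypothetical; the ffmpeg flags are standard CLI options.)

```python
# Hypothetical prep helper, not from this commit: convert any input clip to a
# mono wav, trimmed to the 30 second limit, using the standard ffmpeg CLI.
import subprocess

def prepare_sample(src: str, dst: str, max_seconds: int = 30) -> None:
    subprocess.run([
        "ffmpeg", "-y",
        "-i", src,               # input clip (any format ffmpeg can decode)
        "-ac", "1",              # downmix to mono
        "-ar", "22050",          # resample; 22.05 kHz is plenty for cloning
        "-t", str(max_seconds),  # hard-trim to the truncation limit
        dst,                     # e.g. "voices/mixed/sample1.wav"
    ], check=True)
```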
@@ -332,6 +337,20 @@ tts-1-hd:
   speaker: voices/me.wav # this could be you
 ```
 
+You can also use a subfolder with multiple audio samples, to combine small samples or to mix different samples together.
+
+For example:
+
+```yaml
+...
+tts-1-hd:
+  mixed:
+    model: xtts
+    speaker: voices/mixed
+```
+
+Where the `voices/mixed/` folder contains multiple wav files. The total audio length is still limited to 30 seconds.
 
 ## Multilingual
 
 Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper, simply download a language-specific voice.
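Once a `mixed` voice is defined this way, it should be usable through the server's OpenAI-compatible endpoint like any other voice. A minimal sketch, assuming a local server; the `base_url`, port, and api_key value are assumptions, not taken from this commit:

```python
# Minimal sketch: request speech from the folder-backed `mixed` voice via the
# OpenAI python client, assuming openedai-speech is serving locally.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-not-needed")

with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",                  # maps to xtts in voice_to_speaker.yaml
    voice="mixed",                     # the voice backed by voices/mixed/
    input="Hello from a mixed voice.",
) as response:
    response.stream_to_file("mixed.mp3")
```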
speech.py (25 lines changed)
@@ -85,7 +85,7 @@ class xtts_wrapper():
         self.timer.daemon = True
         self.timer.start()
 
-    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
+    def tts(self, text, language, audio_path, **hf_generate_kwargs):
         with torch.no_grad():
             self.last_used = time.time()
             tokens = 0
@@ -93,7 +93,7 @@ class xtts_wrapper():
             with self.lock:
                 logger.debug(f"generating [{language}]: {[text]}")
 
-                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
+                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=audio_path) # not worth caching calls, it's < 0.001s after model is loaded
                 pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                 self.last_used = time.time()
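The wrapper's `tts` method now takes a list of reference wavs instead of a single `speaker_wav` and hands it straight to Coqui's `get_conditioning_latents`, which, as the new code relies on, accepts a list as well as a single path. Roughly (a sketch with hypothetical paths, not code from this diff):

```python
# Sketch only: the list form of audio_path that the wrapper now passes
# through, conditioning the cloned voice on several reference clips at once.
gpt_cond_latent, speaker_embedding = xtts.get_conditioning_latents(
    audio_path=["voices/mixed/a.wav", "voices/mixed/b.wav"],
)
```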
@@ -319,6 +319,21 @@ async def generate_speech(request: GenerateSpeechRequest):
     in_q = queue.Queue() # speech pcm
     ex_q = queue.Queue() # exceptions
 
+    def get_speaker_samples(samples: str) -> list[str]:
+        if os.path.isfile(samples):
+            audio_path = [samples]
+        elif os.path.isdir(samples):
+            audio_path = [os.path.join(samples, sample) for sample in os.listdir(samples) if os.path.isfile(os.path.join(samples, sample))]
+
+            if len(audio_path) < 1:
+                logger.error(f"No files found: {samples}")
+                raise ServiceUnavailableError(f"Invalid path: {samples}")
+        else:
+            logger.error(f"Invalid path: {samples}")
+            raise ServiceUnavailableError(f"Invalid path: {samples}")
+
+        return audio_path
+
     def exception_check(exq: queue.Queue):
         try:
             e = exq.get_nowait()
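For clarity, the new helper resolves the configured `speaker` value like this (illustrative paths, not from the repo):

```python
# Illustrative only: how get_speaker_samples maps a speaker setting to wavs.
get_speaker_samples("voices/me.wav")  # a file -> ["voices/me.wav"]
get_speaker_samples("voices/mixed")   # a dir  -> a list of every file inside
get_speaker_samples("voices/nope")    # missing or empty -> ServiceUnavailableError
```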
@@ -329,9 +344,13 @@ async def generate_speech(request: GenerateSpeechRequest):
 
     def generator():
         # text -> in_q
 
+        audio_path = get_speaker_samples(speaker)
+        logger.debug(f"{voice} wav samples: {audio_path}")
+
         try:
             for text in all_text:
-                for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
+                for chunk in xtts.tts(text=text, language=language, audio_path=audio_path, **hf_generate_kwargs):
                     exception_check(ex_q)
                     in_q.put(chunk)