mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
0.18.0 - Allow multiple samples in xtts. Closes: #38
This commit is contained in:
parent b5d0dafd7d
commit e815ef2860

README.md (19 lines changed)
```diff
@@ -30,6 +30,10 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
 
 ## Recent Changes
 
+Version 0.18.0, 2024-08-15
+
+* Allow folders of wav samples in xtts. Samples will be combined, allowing for mixed voices and collections of small samples. Still limited to 30 seconds total. Thanks @nathanhere.
+
 Version 0.17.3, 2024-08-15
 
 * fix fr_FR-tom-medium and other 44khz piper voices (detect non-default sample rates)
```
```diff
@@ -308,6 +312,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
 * low noise (no hiss or hum)
 * No partial words, breathing, laughing, music or backgrounds sounds
 * An even speaking pace with a variety of words is best, like in interviews or audiobooks.
+* Audio longer than 30 seconds will be silently truncated.
 
 You can use FFmpeg to prepare your audio files, here are some examples:
 
```
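Not part of the commit: a minimal sketch for checking a cloning sample against the 30-second limit added above, using Python's standard `wave` module (uncompressed PCM wav only; the file path is illustrative).

```python
import wave

# Inspect a candidate cloning sample; "voices/me.wav" is a hypothetical path.
with wave.open("voices/me.wav", "rb") as w:
    seconds = w.getnframes() / w.getframerate()
    print(f"{w.getnchannels()} channel(s), {w.getframerate()} Hz, {seconds:.1f}s")
    if seconds > 30.0:
        print("warning: audio past 30 seconds will be silently truncated")
```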
````diff
@@ -332,6 +337,20 @@ tts-1-hd:
     speaker: voices/me.wav # this could be you
 ```
 
+You can also use a sub folder for multiple audio samples to combine small samples or to mix different samples together.
+
+For example:
+
+```yaml
+...
+tts-1-hd:
+  mixed:
+    model: xtts
+    speaker: voices/mixed
+```
+
+Where the `voices/mixed/` folder contains multiple wav files. The total audio length is still limited to 30 seconds.
+
 ## Multilingual
 
 Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper simply download a language specific voice.
````
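Not from the commit: since the combined samples are still capped at 30 seconds, a small standard-library sketch that totals the wav durations in a speaker folder. `voices/mixed` is the folder from the example above; PCM wav files are assumed.

```python
import os
import wave

def folder_seconds(folder: str) -> float:
    # Sum the durations of all files directly inside the folder,
    # mirroring the flat scan the server performs.
    total = 0.0
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            with wave.open(path, "rb") as w:
                total += w.getnframes() / w.getframerate()
    return total

print(f"total: {folder_seconds('voices/mixed'):.1f}s of a 30s budget")
```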
speech.py (25 lines changed)
```diff
@@ -85,7 +85,7 @@ class xtts_wrapper():
         self.timer.daemon = True
         self.timer.start()
 
-    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
+    def tts(self, text, language, audio_path, **hf_generate_kwargs):
         with torch.no_grad():
             self.last_used = time.time()
             tokens = 0
```
```diff
@@ -93,7 +93,7 @@ class xtts_wrapper():
             with self.lock:
                 logger.debug(f"generating [{language}]: {[text]}")
 
-                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
+                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=audio_path) # not worth caching calls, it's < 0.001s after model is loaded
                 pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
                 self.last_used = time.time()
 
```
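Taken together, the two hunks above change the wrapper's contract: `tts` now receives a ready-made list of reference wavs and passes it straight through to Coqui's `get_conditioning_latents`, which accepts a list of paths. A hypothetical call (the file names are made up, not from the diff):

```python
# Illustration only, not part of the commit.
wav_list = ["voices/mixed/a.wav", "voices/mixed/b.wav"]  # hypothetical samples
for chunk in xtts.tts(text="Hello there.", language="en", audio_path=wav_list):
    ...  # PCM chunks stream out, as generate_speech() consumes them below
```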
```diff
@@ -319,6 +319,21 @@ async def generate_speech(request: GenerateSpeechRequest):
     in_q = queue.Queue() # speech pcm
     ex_q = queue.Queue() # exceptions
 
+    def get_speaker_samples(samples: str) -> list[str]:
+        if os.path.isfile(samples):
+            audio_path = [samples]
+        elif os.path.isdir(samples):
+            audio_path = [os.path.join(samples, sample) for sample in os.listdir(samples) if os.path.isfile(os.path.join(samples, sample))]
+
+            if len(audio_path) < 1:
+                logger.error(f"No files found: {samples}")
+                raise ServiceUnavailableError(f"Invalid path: {samples}")
+        else:
+            logger.error(f"Invalid path: {samples}")
+            raise ServiceUnavailableError(f"Invalid path: {samples}")
+
+        return audio_path
+
     def exception_check(exq: queue.Queue):
         try:
             e = exq.get_nowait()
```
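The new helper normalizes the configured `speaker` value into a list of file paths. Hypothetical calls, for illustration (the paths are made up):

```python
get_speaker_samples("voices/me.wav")  # a file -> ["voices/me.wav"]
get_speaker_samples("voices/mixed")   # a folder -> every file directly inside it
get_speaker_samples("voices/nope")    # anything else raises ServiceUnavailableError
```

Note the folder scan is flat and does not filter by extension, so any stray non-wav file in the folder will also be passed along to XTTS.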
```diff
@@ -329,9 +344,13 @@ async def generate_speech(request: GenerateSpeechRequest):
 
     def generator():
         # text -> in_q
+
+        audio_path = get_speaker_samples(speaker)
+        logger.debug(f"{voice} wav samples: {audio_path}")
+
         try:
             for text in all_text:
-                for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
+                for chunk in xtts.tts(text=text, language=language, audio_path=audio_path, **hf_generate_kwargs):
                     exception_check(ex_q)
                     in_q.put(chunk)
 
```
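One consequence of resolving the samples inside `generator()` (an observation, not something the commit states): the lookup runs per request, so wav files added to or removed from a sample folder are picked up without restarting the server.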