mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00

commit 5c887af40e
Merge branch 'main' of https://github.com/matatonic/openedai-speech

README.md | 21
@@ -18,7 +18,7 @@ Details:
 * You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) via the `voice_to_speaker.yaml` configuration file
 * Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
 * Custom cloned voices can be used for tts-1-hd; see: [Custom Voices Howto](#custom-voices-howto)
-* 🌐 [Multilingual](#multilingual) support with XTTS voices
+* 🌐 [Multilingual](#multilingual) support with XTTS voices, the language is automatically detected if not set
 * [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
 * Configurable [generation parameters](#generation-parameters)
 * Streamed output while generating
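
For context, a piper voice mapping in `voice_to_speaker.yaml` looks roughly like the sketch below; the model path and speaker id are illustrative and not part of this commit:

```yaml
tts-1:
  alloy:
    model: voices/en_US-libritts_r-medium.onnx  # a downloaded piper voice model
    speaker: 79  # speaker id within the model, if it is multi-speaker
```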
@@ -30,6 +30,19 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
 
 ## Recent Changes
 
+Version 0.17.2, 2024-07-01
+
+* fix -min image (re: langdetect)
+
+Version 0.17.1, 2024-07-01
+
+* fix ROCm (add langdetect to requirements-rocm.txt)
+* Fix zh-cn for xtts
+
+Version 0.17.0, 2024-07-01
+
+* Automatic language detection, thanks [@RodolfoCastanheira](https://github.com/RodolfoCastanheira)
+
 Version 0.16.0, 2024-06-29
 
 * Multi-client safe version. Audio generation is synchronized in a single process. The estimated 'realtime' factor of XTTS on a GPU is roughly 1/3; this means that multiple simultaneous streams, or `speed` over 2, may experience audio underrun (delays or pauses in playback). Multiple clients are now possible and safe, but in practice 2 or 3 simultaneous streams is the maximum without underrun.
@@ -58,7 +71,7 @@ Version 0.14.0, 2024-06-26
 Version 0.13.0, 2024-06-25
 
 * Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
-* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi - MPS is not supported in XTTS/torch), thanks @JakeStevenson, @hchasens
+* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi - MPS is not supported in XTTS/torch), thanks [@JakeStevenson](https://github.com/JakeStevenson), [@hchasens](https://github.com/hchasens)
 * Initial attempt at AMD GPU (ROCm 5.7) support
 * Parler-tts support removed
 * Move the *.default.yaml to the root folder
@@ -88,7 +101,7 @@ Version 0.11.0, 2024-05-29
 
 Version: 0.10.1, 2024-05-05
 
-* Remove `runtime: nvidia` from docker-compose.yml, this assumes nvidia/cuda compatible runtime is available by default. thanks @jmtatsch
+* Remove `runtime: nvidia` from docker-compose.yml, this assumes nvidia/cuda compatible runtime is available by default. thanks [@jmtatsch](https://github.com/jmtatsch)
 
 Version: 0.10.0, 2024-04-27
 
@@ -318,7 +331,7 @@ tts-1-hd:
 
 Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper, simply download a language-specific voice.
 
-Coqui XTTSv2 has support for 16 languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Japanese (`ja`), Hungarian (`hu`) and Korean (`ko`).
+Coqui XTTSv2 has support for multiple languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Hungarian (`hu`), Korean (`ko`), Japanese (`ja`), and Hindi (`hi`). When not set, an attempt will be made to automatically detect the language, falling back to English (`en`).
 
 Unfortunately the OpenAI API does not support language, but you can create your own custom speaker voice and set the language for that.
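
Since the API request itself carries no language field, a custom voice entry in `voice_to_speaker.yaml` is where a language gets pinned. A hedged sketch (the voice name and sample file are hypothetical), mirroring the `me` example later in this diff:

```yaml
tts-1-hd:
  mon-ami:
    model: xtts
    speaker: voices/mon-ami.wav  # hypothetical cloned-voice sample
    language: fr  # pin French; use 'auto' (the new default) to detect per request
```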
@@ -11,8 +11,8 @@ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFo
 parser.add_argument('sample', action='store', help="Set the wav sample file")
 parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)")
-parser.add_argument('-l', '--language', action='store', default="en", help="Set the language for the voice",
-                    choices=['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'])
+parser.add_argument('-l', '--language', action='store', default="auto", help="Set the language for the voice",
+                    choices=['auto', 'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko', 'hi'])
 parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice")
 parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)")
 parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model")
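
With this change, a cloned voice registered through this helper (the filename is not preserved in this mirror, but it appears to be the repository's `add_voice.py` script) defaults to `language: auto`; a language can still be pinned at registration time, e.g. `python add_voice.py me.wav -n me -l fr`.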
@@ -3,6 +3,7 @@ uvicorn
 loguru
 piper-tts
 coqui-tts
+langdetect
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
@@ -3,6 +3,7 @@ uvicorn
 loguru
 piper-tts
 coqui-tts[languages]
+langdetect
 # Creating an environment where deepspeed works is complex, for now it will be disabled by default.
 #deepspeed
 
@@ -12,4 +13,4 @@ torchaudio; sys_platform != "darwin"
 torch; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
 torchaudio; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
 
-# ROCM (Linux only) - use requirements.amd.txt
+# ROCM (Linux only) - use requirements.amd.txt
speech.py | 24
@@ -17,7 +17,6 @@ from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError
 from pydantic import BaseModel
 import uvicorn
 
-
 @contextlib.asynccontextmanager
 async def lifespan(app):
     yield
@@ -270,7 +269,21 @@ async def generate_speech(request: GenerateSpeechRequest):
     # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
 
-    language = voice_map.pop('language', 'en')
+    language = voice_map.pop('language', 'auto')
+    if language == 'auto':
+        try:
+            language = detect(input_text)
+            if language not in [
+                'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
+                'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja', 'hi'
+            ]:
+                logger.debug(f"Detected language {language} not supported, defaulting to en")
+                language = 'en'
+            else:
+                logger.debug(f"Detected language: {language}")
+        except:
+            language = 'en'
+            logger.debug(f"Failed to detect language, defaulting to en")
 
     comment = voice_map.pop('comment', None) # ignored.
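
A minimal standalone sketch of the detect-and-fall-back pattern added above (this block is illustrative, not part of the commit; the language list mirrors the diff):

```python
from langdetect import detect  # provided by the langdetect package added to the requirements

XTTS_LANGUAGES = {'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
                  'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja', 'hi'}

def pick_language(text: str) -> str:
    try:
        lang = detect(text)  # e.g. 'en', 'fr', 'zh-cn'; raises on empty/undecidable input
    except Exception:
        return 'en'
    return lang if lang in XTTS_LANGUAGES else 'en'

print(pick_language("Guten Tag, wie geht es dir?"))  # typically 'de'
```

Note that langdetect is probabilistic; results can vary between runs unless `DetectorFactory.seed` is fixed.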
@@ -282,7 +295,11 @@ async def generate_speech(request: GenerateSpeechRequest):
     hf_generate_kwargs['enable_text_splitting'] = hf_generate_kwargs.get('enable_text_splitting', True) # change the default to true
 
     if hf_generate_kwargs['enable_text_splitting']:
-        all_text = split_sentence(input_text, language, xtts.xtts.tokenizer.char_limits[language])
+        if language == 'zh-cn':
+            split_lang = 'zh'
+        else:
+            split_lang = language
+        all_text = split_sentence(input_text, split_lang, xtts.xtts.tokenizer.char_limits[split_lang])
     else:
         all_text = [input_text]
@@ -387,6 +404,7 @@ if __name__ == "__main__":
     from TTS.tts.models.xtts import Xtts
     from TTS.utils.manage import ModelManager
     from TTS.tts.layers.xtts.tokenizer import split_sentence
+    from langdetect import detect
 
     if args.preload:
         xtts = xtts_wrapper(args.preload, device=args.xtts_device, unload_timer=args.unload_timer)
@@ -46,8 +46,9 @@ tts-1-hd:
     model: xtts
     speaker: voices/shimmer.wav
   me:
-    model: xtts_v2.0.2 # you can specify different xtts version
+    model: xtts_v2.0.2 # you can specify an older xtts version
     speaker: voices/me.wav # this could be you
+    language: auto
     enable_text_splitting: True
     length_penalty: 1.0
     repetition_penalty: 10
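
A hedged usage sketch (not part of the commit): requesting speech with the `me` voice through the server's OpenAI-compatible endpoint, assuming the project's default port 8000:

```python
from openai import OpenAI

# A dummy api_key; the local server is assumed not to validate it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-ignored")

with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",
    voice="me",  # the cloned voice defined in the config above
    input="Bonjour ! La langue est détectée automatiquement.",  # language: auto should pick French
) as response:
    response.stream_to_file("speech.mp3")
```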