mirror of
https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
fix pops in tts-1-hd
This commit is contained in:
parent
1a553f18d7
commit
48fa91ec93
@ -25,10 +25,10 @@ Details:
|
||||
|
||||
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
|
||||
|
||||
Version: 0.7.2, 2024-03-20
|
||||
Version: 0.7.3, 2024-03-20
|
||||
|
||||
* Allow different xtts versions per voice in `voice_to_speaker.yaml`, e.g. xtts_v2.0.2
|
||||
* Quality: Fix xtts sample rate (24000 vs. 22050 for piper)
|
||||
* Quality: Fix xtts sample rate (24000 vs. 22050 for piper) and pops
|
||||
* use CUDA 12.2-base in Dockerfile
|
||||
|
||||
API Documentation
|
||||
|
||||
12
main.py
12
main.py
@ -71,9 +71,12 @@ class GenerateSpeechRequest(BaseModel):
|
||||
response_format: str = "mp3" # mp3, opus, aac, flac
|
||||
speed: float = 1.0 # 0.25 - 4.0
|
||||
|
||||
def build_ffmpeg_args(sample_rate, response_format):
|
||||
def build_ffmpeg_args(response_format, input_format, sample_rate):
|
||||
# Convert the output to the desired format using ffmpeg
|
||||
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", sample_rate, "-ac", "1", "-i", "-"]
|
||||
if input_format == 'raw':
|
||||
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", sample_rate, "-ac", "1", "-i", "-"]
|
||||
else:
|
||||
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "WAV", "-i", "-"]
|
||||
|
||||
if response_format == "mp3":
|
||||
ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
|
||||
@ -123,7 +126,7 @@ async def generate_speech(request: GenerateSpeechRequest):
|
||||
tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
|
||||
tts_proc.stdin.close()
|
||||
tts_io_out = tts_proc.stdout
|
||||
ffmpeg_args = build_ffmpeg_args("22050", response_format)
|
||||
ffmpeg_args = build_ffmpeg_args(response_format, input_format="raw", sample_rate="22050")
|
||||
|
||||
# Use xtts for tts-1-hd
|
||||
elif model == 'tts-1-hd':
|
||||
@ -138,8 +141,7 @@ async def generate_speech(request: GenerateSpeechRequest):
|
||||
|
||||
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
|
||||
|
||||
# input sample rate is 22050, output is 24000...
|
||||
ffmpeg_args = build_ffmpeg_args("24000", response_format)
|
||||
ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
|
||||
|
||||
# tts speed doesn't seem to work well
|
||||
if speed < 0.5:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user