Fix xtts sample rate

This commit is contained in:
matatonic 2024-03-20 16:10:31 -04:00
parent 62c9d3caac
commit 3bc5fb5d21
4 changed files with 69 additions and 28 deletions

View File

@ -1,4 +1,7 @@
FROM nvidia/cuda:11.8.0-base-ubuntu22.04
FROM nvidia/cuda:12.2.0-base-ubuntu22.04
ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
@ -9,12 +12,11 @@ RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
COPY requirements.txt /app/
COPY *.txt /app/
RUN pip install -r requirements.txt
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY *.sh /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
ENV COQUI_TOS_AGREED=1
CMD python main.py --host 0.0.0.0 --port 8000 --preload xtts
CMD python main.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL

View File

@ -24,11 +24,11 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
Version: 0.7.0
Last update: 2024-03-20
Version: 0.7.2, 2024-03-20
* Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
* Quality: Fix xtts sample rate (24000 vs. 22050 for piper)
* use CUDA 12.2-base in Dockerfile
API Documentation
-----------------
@ -56,16 +56,20 @@ Usage
-----
```
usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload_xtts] [-P PORT] [-H HOST]
usage: main.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload_xtts Preload the xtts model. By default it's loaded on first use. (default: False)
Set the device for the xtts model. The special value of 'none' will use
piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
```
@ -124,3 +128,31 @@ docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13GB), see: Dockerfile.min. You can edit the `docker-compose.yml` to change this.
Custom Voices Howto
-------------------
Custom voices should be mono 22050 hz sample rate WAV files with low noise (no background music, etc.) and not contain any partial words. Sample voices for xtts should be at least 6 seconds; they can be longer, but longer doesn't always produce better results.
You can use FFmpeg to process your audio files and prepare them for xtts, here are some examples:
```shell
# convert a multi-channel audio file to mono, set sample rate to 22050 hz, trim to 6 seconds, and output as WAV file.
ffmpeg -i input.mp3 -ac 1 -ar 22050 -t 6 -y me.wav
# use a simple noise filter to clean up audio, and select a start time for sampling.
ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
# A more complex noise reduction setup with volume adjustment
ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
```
Once your WAV file is prepared, save it in the voices/ folder and update the `voice_to_speaker.yaml` file with the new file name.
For example:
```yaml
...
tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts versions
speaker: voices/me.wav # this could be you
```

View File

@ -1,5 +1,4 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
model="xtts" # others are possible, ex. xtts_v2.0.2
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
./download_samples.sh

34
main.py
View File

@ -71,6 +71,21 @@ class GenerateSpeechRequest(BaseModel):
response_format: str = "mp3" # mp3, opus, aac, flac
speed: float = 1.0 # 0.25 - 4.0
def build_ffmpeg_args(sample_rate, response_format):
    """Build the ffmpeg argv that converts raw s16le mono PCM on stdin
    into the requested audio format on stdout.

    :param sample_rate: input PCM sample rate in Hz (str or int), e.g.
        "22050" for piper output or "24000" for xtts output.
    :param response_format: one of "mp3", "opus", "aac", "flac"; any other
        value returns only the input-side args (no output format selected,
        matching the original fall-through behavior).
    :return: list of ffmpeg command-line argument strings.
    """
    # Every argv element must be a string for subprocess; str() also
    # lets callers pass the sample rate as an int.
    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le",
                   "-ar", str(sample_rate), "-ac", "1", "-i", "-"]
    # Dispatch table: output muxer/codec options per response format.
    format_args = {
        "mp3":  ["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"],
        "opus": ["-f", "ogg", "-c:a", "libopus"],
        "aac":  ["-f", "adts", "-c:a", "aac", "-ab", "64k"],
        "flac": ["-f", "flac", "-c:a", "flac"],
    }
    ffmpeg_args.extend(format_args.get(response_format, []))
    return ffmpeg_args
@app.post("/v1/audio/speech", response_class=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
global xtts, args
@ -90,18 +105,7 @@ async def generate_speech(request: GenerateSpeechRequest):
elif response_format == "flac":
media_type = "audio/x-flac"
# Convert the output to the desired format using ffmpeg
ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]
if response_format == "mp3":
ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
elif response_format == "opus":
ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
elif response_format == "aac":
ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
elif response_format == "flac":
ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
ffmpeg_args = None
tts_io_out = None
# Use piper for tts-1, and if xtts_device == none use for all models.
@ -119,6 +123,7 @@ async def generate_speech(request: GenerateSpeechRequest):
tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
tts_proc.stdin.close()
tts_io_out = tts_proc.stdout
ffmpeg_args = build_ffmpeg_args("22050", response_format)
# Use xtts for tts-1-hd
elif model == 'tts-1-hd':
@ -128,6 +133,9 @@ async def generate_speech(request: GenerateSpeechRequest):
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
# XXX probably should GC/torch cleanup here
# input sample rate is 22050, output is 24000...
ffmpeg_args = build_ffmpeg_args("24000", response_format)
# tts speed doesn't seem to work well
if speed < 0.5:
speed = speed / 0.5
@ -153,7 +161,7 @@ if __name__ == "__main__":
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts'). By default it's loaded on first use.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")