0.5.0 rc1

This commit is contained in:
matatonic 2023-11-27 22:43:41 -05:00
parent b44992c806
commit f3f679784f
7 changed files with 64 additions and 31 deletions

View File

@ -4,21 +4,21 @@ OpenedAI API for audio/speech
This is an API clone of the OpenAI API for text to speech audio generation.
* Compatible with the OpenAI audio/speech API
* Does not connect to OpenAI
* Does not require a (real) OpenAI API Key.
* Not affiliated with OpenAI in any way.
* Does not connect to the OpenAI API and does not require a (real) OpenAI API Key
* Not affiliated with OpenAI in any way
API Support:
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 (fast, uses almost 4GB GPU VRAM)
* Can be run without TTS/xtts_v2, entirely on cpu
Compatibility:
Full Compatibility:
* `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
* `tts-1-hd`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable, uses OpenAI samples by default)
* response_format: `mp3`, `opus`, `aac`, or `flac`
* speed 0.25-4.0 (and more)
Details:
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, uses almost 4GB GPU VRAM)
* Can be run without TTS/xtts_v2, entirely on cpu
* Custom cloned voices can be used for tts-1-hd; just save a WAV file in `/voices/`
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
* response_format: `mp3`, `opus`, `aac`, or `flac`
* Sometimes certain words or symbols will sound bad; you can fix them with regex via `pre_process_map.yaml`
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
@ -84,15 +84,11 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -
Or just like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
"input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```
Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):
```python

View File

@ -1,7 +1,4 @@
#!/bin/sh
for i in echo fable onyx nova shimmer; do
wget -q https://cdn.openai.com/API/docs/audio/$i.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
done
# In testing, alloy sounded REALLY BAD after cloning. Save it anyway, but use another as the default.
wget -q https://cdn.openai.com/API/docs/audio/alloy.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/alloy0.wav

View File

@ -1,3 +1,6 @@
#!/bin/sh
piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
done

10
main.py
View File

@ -112,9 +112,13 @@ async def generate_speech(request: GenerateSpeechRequest):
tts_model, speaker = model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
if speed > 2.0: # tts has a max speed of 2.0
ffmpeg_args.extend(["-af", "atempo=2.0"])
speed = min(speed / 2.0, 2.0)
# tts speed doesn't seem to work well
if speed < 0.5:
speed = speed / 0.5
ffmpeg_args.extend(["-af", f"atempo=0.5"])
if speed > 1.0:
ffmpeg_args.extend(["-af", f"atempo={speed}"])
speed = 1.0
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)

View File

@ -18,7 +18,7 @@ curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/jso
\"speed\": 1.0
}" | mpv --really-quiet -
wget -q https://cdn.openai.com/API/docs/audio/$voice.wav -O - | mpv --really-quiet -
curl -s https://cdn.openai.com/API/docs/audio/$voice.wav | mpv --really-quiet -
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
\"model\": \"tts-1\",
@ -36,3 +36,30 @@ curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/jso
done
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
\"model\": \"tts-1\",
\"input\": \"the slowest voice\",
\"voice\": \"onyx\",
\"speed\": 0.25
}" | mpv --really-quiet -
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
\"model\": \"tts-1-hd\",
\"input\": \"the slowest HD voice\",
\"voice\": \"onyx\",
\"speed\": 0.25
}" | mpv --really-quiet -
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
\"model\": \"tts-1\",
\"input\": \"And this is how fast it can go, the fastest voice\",
\"voice\": \"nova\",
\"speed\": 4.0
}" | mpv --really-quiet -
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
\"model\": \"tts-1-hd\",
\"input\": \"And this is how fast it can go, the fastest HD voice\",
\"voice\": \"nova\",
\"speed\": 4.0
}" | mpv --really-quiet -

View File

@ -4,26 +4,32 @@ tts-1:
speaker: set your own speaker
alloy:
model: voices/en_US-libritts_r-medium.onnx
speaker: 118 # 118, 64, 79, 80, 101, 130
speaker: 79 # 64, 79, 80, 101, 130
echo:
model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
speaker: 52 # 52, 102, 134
model: voices/en_US-libritts_r-medium.onnx
speaker: 134 # 52, 102, 134
echo-alt:
model: voices/en_US-ryan-high.onnx
speaker: # default speaker
fable:
model: voices/en_GB-northern_english_male-medium.onnx
speaker: # default speaker
onyx:
model: voices/en_US-libritts_r-medium.onnx
speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
speaker: 159 # 55, 90, 132, 136, 137, 159
nova:
model: voices/en_US-libritts_r-medium.onnx
speaker: 57 # 57, 61, 107, 150, 162
speaker: 107 # 57, 61, 107, 150, 162
shimmer:
model: voices/en_US-libritts_r-medium.onnx
speaker: 163
tts-1-hd:
alloy:
model: tts_models/multilingual/multi-dataset/xtts_v2
speaker: voices/alloy.wav
speaker: voices/alloy-alt.wav
alloy-orig:
model: tts_models/multilingual/multi-dataset/xtts_v2
speaker: voices/alloy.wav # it's REALLY BAD
echo:
model: tts_models/multilingual/multi-dataset/xtts_v2
speaker: voices/echo.wav