mirror of
https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
0.5.0 rc1
This commit is contained in:
parent
b44992c806
commit
f3f679784f
24
README.md
24
README.md
@ -4,21 +4,21 @@ OpenedAI API for audio/speech
|
||||
This is an API clone of the OpenAI API for text to speech audio generation.
|
||||
|
||||
* Compatible with the OpenAI audio/speech API
|
||||
* Does not connect to OpenAI
|
||||
* Does not require a (real) OpenAI API Key.
|
||||
* Not affiliated with OpenAI in any way.
|
||||
* Does not connect to the OpenAI API and does not require a (real) OpenAI API Key
|
||||
* Not affiliated with OpenAI in any way
|
||||
|
||||
API Support:
|
||||
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
|
||||
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 (fast, uses almost 4GB GPU VRAM)
|
||||
* Can be run without TTS/xtts_v2, entirely on cpu
|
||||
|
||||
Compatibility:
|
||||
Full Compatibility:
|
||||
* `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
|
||||
* `tts-1-hd`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable, uses OpenAI samples by default)
|
||||
* response_format: `mp3`, `opus`, `aac`, or `flac`
|
||||
* speed 0.25-4.0 (and more)
|
||||
|
||||
Details:
|
||||
* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
|
||||
* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, uses almost 4GB GPU VRAM)
|
||||
* Can be run without TTS/xtts_v2, entirely on cpu
|
||||
* Custom cloned voices can be used for tts-1-hd; just save a WAV file in `/voices/`
|
||||
* You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
|
||||
* response_format: `mp3`, `opus`, `aac`, or `flac`
|
||||
* Sometimes certain words or symbols will sound bad; you can fix them with regex via `pre_process_map.yaml`
|
||||
|
||||
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
|
||||
@ -84,15 +84,11 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -
|
||||
|
||||
Or just like this:
|
||||
|
||||
|
||||
```shell
|
||||
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
|
||||
"input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):
|
||||
|
||||
```python
|
||||
|
@ -1,7 +1,4 @@
|
||||
#!/bin/sh
|
||||
for i in echo fable onyx nova shimmer; do
|
||||
wget -q https://cdn.openai.com/API/docs/audio/$i.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
|
||||
for i in alloy echo fable onyx nova shimmer; do
|
||||
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
|
||||
done
|
||||
|
||||
# In testing, alloy sounded REALLY BAD after cloning. Save it anyway, but use another as the default.
|
||||
wget -q https://cdn.openai.com/API/docs/audio/alloy.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/alloy0.wav
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/bin/sh
|
||||
piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
|
||||
piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
|
||||
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high
|
||||
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
|
||||
for i in $models ; do
|
||||
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
|
||||
done
|
||||
|
10
main.py
10
main.py
@ -112,9 +112,13 @@ async def generate_speech(request: GenerateSpeechRequest):
|
||||
|
||||
tts_model, speaker = model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
|
||||
|
||||
if speed > 2.0: # tts has a max speed of 2.0
|
||||
ffmpeg_args.extend(["-af", "atempo=2.0"])
|
||||
speed = min(speed / 2.0, 2.0)
|
||||
# tts speed doesn't seem to work well
|
||||
if speed < 0.5:
|
||||
speed = speed / 0.5
|
||||
ffmpeg_args.extend(["-af", f"atempo=0.5"])
|
||||
if speed > 1.0:
|
||||
ffmpeg_args.extend(["-af", f"atempo={speed}"])
|
||||
speed = 1.0
|
||||
|
||||
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
|
||||
|
||||
|
@ -18,7 +18,7 @@ curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/jso
|
||||
\"speed\": 1.0
|
||||
}" | mpv --really-quiet -
|
||||
|
||||
wget -q https://cdn.openai.com/API/docs/audio/$voice.wav -O - | mpv --really-quiet -
|
||||
curl -s https://cdn.openai.com/API/docs/audio/$voice.wav | mpv --really-quiet -
|
||||
|
||||
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
|
||||
\"model\": \"tts-1\",
|
||||
@ -36,3 +36,30 @@ curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/jso
|
||||
|
||||
done
|
||||
|
||||
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
|
||||
\"model\": \"tts-1\",
|
||||
\"input\": \"the slowest voice\",
|
||||
\"voice\": \"onyx\",
|
||||
\"speed\": 0.25
|
||||
}" | mpv --really-quiet -
|
||||
|
||||
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
|
||||
\"model\": \"tts-1-hd\",
|
||||
\"input\": \"the slowest HD voice\",
|
||||
\"voice\": \"onyx\",
|
||||
\"speed\": 0.25
|
||||
}" | mpv --really-quiet -
|
||||
|
||||
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
|
||||
\"model\": \"tts-1\",
|
||||
\"input\": \"And this is how fast it can go, the fastest voice\",
|
||||
\"voice\": \"nova\",
|
||||
\"speed\": 4.0
|
||||
}" | mpv --really-quiet -
|
||||
|
||||
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d "{
|
||||
\"model\": \"tts-1-hd\",
|
||||
\"input\": \"And this is how fast it can go, the fastest HD voice\",
|
||||
\"voice\": \"nova\",
|
||||
\"speed\": 4.0
|
||||
}" | mpv --really-quiet -
|
||||
|
@ -4,26 +4,32 @@ tts-1:
|
||||
speaker: set your own speaker
|
||||
alloy:
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 118 # 118, 64, 79, 80, 101, 130
|
||||
speaker: 79 # 64, 79, 80, 101, 130
|
||||
echo:
|
||||
model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
|
||||
speaker: 52 # 52, 102, 134
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 134 # 52, 102, 134
|
||||
echo-alt:
|
||||
model: voices/en_US-ryan-high.onnx
|
||||
speaker: # default speaker
|
||||
fable:
|
||||
model: voices/en_GB-northern_english_male-medium.onnx
|
||||
speaker: # default speaker
|
||||
onyx:
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
|
||||
speaker: 159 # 55, 90, 132, 136, 137, 159
|
||||
nova:
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 57 # 57, 61, 107, 150, 162
|
||||
speaker: 107 # 57, 61, 107, 150, 162
|
||||
shimmer:
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 163
|
||||
tts-1-hd:
|
||||
alloy:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
speaker: voices/alloy.wav
|
||||
speaker: voices/alloy-alt.wav
|
||||
alloy-orig:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
speaker: voices/alloy.wav # it's REALLY BAD
|
||||
echo:
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
speaker: voices/echo.wav
|
||||
|
Loading…
Reference in New Issue
Block a user