0.5.0 rc1

2025-06-26 18:16:32 +00:00 · 2023-11-27 22:43:41 -05:00 · 2023-11-27 22:43:41 -05:00 · f3f679784f
commit f3f679784f
parent b44992c806
7 changed files with 64 additions and 31 deletions
--- a/README.md
+++ b/README.md
@@ -4,21 +4,21 @@ OpenedAI API for audio/speech
 This is an API clone of the OpenAI API for text to speech audio generation.

 * Compatible with the OpenAI audio/speech API
-* Does not connect to OpenAI
-* Does not require a (real) OpenAI API Key. 
-* Not affiliated with OpenAI in any way.
+* Does not connect to the OpenAI API and does not require a (real) OpenAI API Key
+* Not affiliated with OpenAI in any way

-API Support:
-* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
-* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 (fast, uses almost 4GB GPU VRAM)
-* Can be run without TTS/xtts_v2, entirely on cpu
-
-Compatibility:
+Full Compatibility:
 * `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
 * `tts-1-hd`:  `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable, uses OpenAI samples by default)
+* response_format: `mp3`, `opus`, `aac`, or `flac`
+* speed 0.25-4.0 (and more)
+
+Details:
+* model 'tts-1' via [piper tts](https://github.com/rhasspy/piper) (fast, can use cpu)
+* model 'tts-1-hd' via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, uses almost 4GB GPU VRAM)
+* Can be run without TTS/xtts_v2, entirely on cpu
 * Custom cloned voices can be used for tts-1-hd, just save a WAV file in `/voices/`
 * You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) and xtts_v2 speaker clones via `voice_to_speaker.yaml`
-* response_format: `mp3`, `opus`, `aac`, or `flac`
 * Sometimes certain words or symbols will sound bad, you can fix them with regex via `pre_process_map.yaml`

 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
@@ -84,15 +84,11 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -

 Or just like this:

-
 ```shell
 curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
 ```

-
-
-
 Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):

 ```python
--- a/download_samples.sh
+++ b/download_samples.sh
@@ -1,7 +1,4 @@
 #!/bin/sh
-for i in echo fable onyx nova shimmer; do
-	wget -q https://cdn.openai.com/API/docs/audio/$i.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
+for i in alloy echo fable onyx nova shimmer; do
+	curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
 done
-
-# in testing alloy sounded REALY BAD after cloning. Save it anyways, but use another as the default.
-wget -q https://cdn.openai.com/API/docs/audio/alloy.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/alloy0.wav
--- a/download_voices_tts-1.sh
+++ b/download_voices_tts-1.sh
@@ -1,3 +1,6 @@
 #!/bin/sh
-piper --update-voices --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
-piper --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
+models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high 
+piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
+for i in $models ; do
+	piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
+done
--- a/main.py
+++ b/main.py
@@ -112,9 +112,13 @@ async def generate_speech(request: GenerateSpeechRequest):

        tts_model, speaker = model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')

-        if speed > 2.0: # tts has a max speed of 2.0
-            ffmpeg_args.extend(["-af", "atempo=2.0"]) 
-            speed = min(speed / 2.0, 2.0)
+        # tts speed doesn't seem to work well
+        if speed < 0.5:
+            speed = speed / 0.5
+            ffmpeg_args.extend(["-af", f"atempo=0.5"]) 
+        if speed > 1.0:
+            ffmpeg_args.extend(["-af", f"atempo={speed}"]) 
+            speed = 1.0

        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
    
--- a/test_voices.sh
+++ b/test_voices.sh
@@ -18,7 +18,7 @@ curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/jso
    \"speed\": 1.0
  }" | mpv --really-quiet -

-wget -q https://cdn.openai.com/API/docs/audio/$voice.wav -O - | mpv --really-quiet -
+curl -s https://cdn.openai.com/API/docs/audio/$voice.wav | mpv --really-quiet -

 curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
    \"model\": \"tts-1\",
@@ -36,3 +36,30 @@ curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/jso

 done

+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1\",
+    \"input\": \"the slowest voice\",
+    \"voice\": \"onyx\",
+    \"speed\": 0.25
+  }" | mpv --really-quiet -
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1-hd\",
+    \"input\": \"the slowest HD voice\",
+    \"voice\": \"onyx\",
+    \"speed\": 0.25
+  }" | mpv --really-quiet -
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1\",
+    \"input\": \"And this is how fast it can go, the fastest voice\",
+    \"voice\": \"nova\",
+    \"speed\": 4.0
+  }" | mpv --really-quiet -
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1-hd\",
+    \"input\": \"And this is how fast it can go, the fastest HD voice\",
+    \"voice\": \"nova\",
+    \"speed\": 4.0
+  }" | mpv --really-quiet -
--- a/voice_to_speaker.yaml
+++ b/voice_to_speaker.yaml
@@ -4,26 +4,32 @@ tts-1:
    speaker: set your own speaker
  alloy: 
    model: voices/en_US-libritts_r-medium.onnx
-    speaker: 118 # 118, 64, 79, 80, 101, 130
+    speaker: 79 # 64, 79, 80, 101, 130
  echo:
-    model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
-    speaker: 52 # 52, 102, 134
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 134 # 52, 102, 134
+  echo-alt:
+    model: voices/en_US-ryan-high.onnx
+    speaker: # default speaker
  fable:
    model: voices/en_GB-northern_english_male-medium.onnx
    speaker: # default speaker
  onyx:
    model: voices/en_US-libritts_r-medium.onnx
-    speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
+    speaker: 159 # 55, 90, 132, 136, 137, 159
  nova:
    model: voices/en_US-libritts_r-medium.onnx
-    speaker: 57 # 57, 61, 107, 150, 162
+    speaker: 107 # 57, 61, 107, 150, 162
  shimmer:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
 tts-1-hd:
  alloy: 
    model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/alloy.wav
+    speaker: voices/alloy-alt.wav
+  alloy-orig: 
+    model: tts_models/multilingual/multi-dataset/xtts_v2
+    speaker: voices/alloy.wav # it's REALLY BAD
  echo:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/echo.wav
--- a/voices/alloy-alt.wav
+++ b/voices/alloy-alt.wav