0.2.0 rc5

2025-06-26 18:16:32 +00:00 · 2023-11-27 21:33:59 -05:00
parent eb1b2f12c7
commit b44992c806
9 changed files with 55 additions and 15 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,8 +9,8 @@ services:
    tty: true
    ports:
      - "8000:8000"
-    #volumes:
-    #  - .:/app/
+    volumes:
+      - .:/app/
    # Below can be removed if not using GPU
    runtime: nvidia
    deploy:
--- a/download_samples.sh
+++ b/download_samples.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+for i in echo fable onyx nova shimmer; do
+	wget -q https://cdn.openai.com/API/docs/audio/$i.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
+done
+
+# In testing, alloy sounded REALLY BAD after cloning. Save it anyway (as alloy0.wav), but use another sample as the default.
+wget -q https://cdn.openai.com/API/docs/audio/alloy.wav -O - | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/alloy0.wav
--- a/download_voices_tts-1-hd.sh
+++ b/download_voices_tts-1-hd.sh
@@ -2,4 +2,4 @@
 export COQUI_TOS_AGREED=1
 model="tts_models/multilingual/multi-dataset/xtts_v2"
 python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
-$(cd voices/ && ./download_samples.sh)
+./download_samples.sh
--- a/main.py
+++ b/main.py
@@ -18,6 +18,7 @@ app = FastAPI()

 class xtts_wrapper():
    def __init__(self, model_name):
+        global args
        self.xtts = TTS(model_name=model_name, progress_bar=False).to(args.xtts_device)

    def tts(self, text, speaker_wav, speed):
@@ -57,6 +58,7 @@ class GenerateSpeechRequest(BaseModel):

@app.post("/v1/audio/speech")
 async def generate_speech(request: GenerateSpeechRequest):
+    global xtts, args
    input_text = preprocess(request.input)
    model = request.model
    voice = request.voice
@@ -132,7 +134,7 @@ if __name__ == "__main__":
    parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough") 
    parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
    parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
-    parser.add_argument('-P', '--port', action='store', default=8000, help="Server tcp port")
+    parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
    parser.add_argument('-H', '--host', action='store', default='localhost', help="Host to listen on, Ex. 0.0.0.0")

    args = parser.parse_args()
--- a/test_voices.sh
+++ b/test_voices.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1\",
+    \"input\": \"I'm going to play you the original voice, followed by the piper voice and finally the X T T S version 2 voice\",
+    \"voice\": \"echo\",
+    \"speed\": 1.0
+  }" | mpv --really-quiet -
+
+for voice in  alloy echo fable onyx nova shimmer ; do
+
+echo $voice
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1\",
+    \"input\": \"original\",
+    \"voice\": \"echo\",
+    \"speed\": 1.0
+  }" | mpv --really-quiet -
+
+wget -q https://cdn.openai.com/API/docs/audio/$voice.wav -O - | mpv --really-quiet -
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1\",
+    \"input\": \"The quick brown fox jumped over the lazy dog. This voice is called $voice, how do you like this voice?\",
+    \"voice\": \"$voice\",
+    \"speed\": 1.0
+  }" | mpv --really-quiet -
+
+curl -s http://localhost:8000/v1/audio/speech  -H "Content-Type: application/json"   -d "{
+    \"model\": \"tts-1-hd\",
+    \"input\": \"The quick brown fox jumped over the lazy dog. This HD voice is called $voice, how do you like this voice?\",
+    \"voice\": \"$voice\",
+    \"speed\": 1.0
+  }" | mpv --really-quiet -
+
+done
+
--- a/voice_to_speaker.yaml
+++ b/voice_to_speaker.yaml
@@ -32,13 +32,13 @@ tts-1-hd:
    speaker: voices/fable.wav
  onyx:
    model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/voices/onyx.wav
+    speaker: voices/onyx.wav
  nova:
    model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/voices/nova.wav
+    speaker: voices/nova.wav
  shimmer:
    model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/voices/shimmer.wav
+    speaker: voices/shimmer.wav
  me:
    model: tts_models/multilingual/multi-dataset/xtts_v2
-    speaker: voices/voices/me.wav # this could be you
+    speaker: voices/me.wav # this could be you
--- a/voices/alloy.wav
+++ b/voices/alloy.wav
--- a/voices/download_samples.sh
+++ b/voices/download_samples.sh
@@ -1,7 +0,0 @@
-#!/bin/sh
-wget -q https://cdn.openai.com/API/docs/audio/alloy.wav
-wget -q https://cdn.openai.com/API/docs/audio/echo.wav
-wget -q https://cdn.openai.com/API/docs/audio/fable.wav
-wget -q https://cdn.openai.com/API/docs/audio/onyx.wav
-wget -q https://cdn.openai.com/API/docs/audio/nova.wav
-wget -q https://cdn.openai.com/API/docs/audio/shimmer.wav
--- a/voices/put_your_voices_here.txt
+++ b/voices/put_your_voices_here.txt