mirror of https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00

commit 4d76aca1af (parent bd3c7a601a)
0.9.0
Dockerfile

@@ -1,10 +1,11 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 
 ENV COQUI_TOS_AGREED=1
 ENV PRELOAD_MODEL=xtts
+# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip
+    apt-get install --no-install-recommends -y curl git ffmpeg
 
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
@@ -12,7 +13,7 @@ RUN mkdir -p /app/voices
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app
 COPY *.txt /app/
-RUN pip install -r requirements.txt
+RUN pip install --no-cache -r requirements.txt
 COPY *.sh /app/
 RUN ./download_voices_tts-1.sh
 RUN ./download_voices_tts-1-hd.sh
Dockerfile.min

@@ -1,9 +1,9 @@
-FROM ubuntu:22.04
+FROM python:3.11-slim
 
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn
+    apt-get install --no-install-recommends -y ffmpeg curl
 
-RUN pip install piper-tts
+RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
 
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
README.md (18 changes)
@@ -24,6 +24,16 @@ Details:
 
 If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
 
+Version: 0.9.0, 2024-04-23
+
+* Fix bug with yaml and loading UTF-8
+* New sample text-to-speech application `say.py`
+* Smaller docker base image
+* Add beta [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) support (you can describe very basic features of the speaker voice). See https://www.text-description-to-speech.com/ for some examples of how to describe voices. Voices can be defined in `voice_to_speaker.yaml`.
+* 2 example [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) voices are included in the `voice_to_speaker.yaml` file.
+* parler-tts is experimental software and is kind of slow. The exact voice will be slightly different each generation but should be similar to the basic description.
+
+
 Version: 0.8.0, 2024-03-23
 
 * Cleanup, docs update.
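As a rough sketch of how the new parler-tts voices are used from the API (this assumes the server is running locally on port 8000 and that the `parler` voice from this release's `voice_to_speaker.yaml` is defined under `tts-1-hd`):

```python
import openai

# Point the openai-python client at the local openedai-speech server;
# the API key is not checked, so any placeholder works.
client = openai.OpenAI(api_key="sk-ip", base_url="http://localhost:8000/v1")

# "parler" is resolved via voice_to_speaker.yaml to the
# parler-tts/parler_tts_mini_v0.1 model plus a speaker description;
# the generated voice varies slightly on every request.
with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",
    voice="parler",
    response_format="mp3",
    input="The quick brown fox jumped over the lazy dog.",
) as response:
    response.stream_to_file("parler.mp3")
```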
@@ -128,6 +138,14 @@ with client.audio.speech.with_streaming_response.create(
     response.stream_to_file("speech.mp3")
 ```
 
+Also see the `say.py` sample application for an example of how to use the openai-python API.
+
+```
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
+$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
+```
+
+
 
 Custom Voices Howto
 -------------------
docker-compose.yml

@@ -2,8 +2,9 @@ services:
   server:
     build:
       context: .
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
-      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
+      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
+    #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
     command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
     #command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
     ports:
requirements.txt

@@ -1,5 +1,9 @@
 fastapi
 uvicorn
+# piper-tts
 piper-tts
 onnxruntime-gpu
+# xtts
 TTS
+# parler-tts
+git+https://github.com/huggingface/parler-tts.git
say.py (new executable file, 80 lines)
@@ -0,0 +1,80 @@
#!/usr/bin/env python

import sys
import os
import tempfile
import argparse

try:
    import dotenv
    dotenv.load_dotenv(override=True)
except ImportError:
    pass

try:
    from playsound import playsound
except ImportError:
    playsound = None

import openai


def parse_args(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", type=str, default="tts-1")#, choices=["tts-1", "tts-1-hd"])
    parser.add_argument("-v", "--voice", type=str, default="alloy")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"])
    parser.add_argument("-s", "--speed", type=float, default=1.0)
    parser.add_argument("-i", "--input", type=str)

    if playsound is None:
        parser.add_argument("-o", "--output", type=str) # required
        parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
    else:
        parser.add_argument("-o", "--output", type=str, default=None) # not required
        parser.add_argument("-p", "--playsound", action="store_true")

    args = parser.parse_args(argv)

    return args


if __name__ == "__main__":
    args = parse_args(sys.argv[1:])

    if args.playsound and playsound is None:
        print("playsound module not found, audio will not be played, use -o <filename> to save output to a file. pip install playsound")
        sys.exit(1)

    if not args.playsound and not args.output:
        print("Must select one of playsound (-p) or output file name (-o)")
        sys.exit(1)

    client = openai.OpenAI(
        # This part is not needed if you set these environment variables before import openai
        # export OPENAI_API_KEY=sk-11111111111
        # export OPENAI_BASE_URL=http://localhost:8000/v1
        api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
        base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
    )

    # When playing without an explicit output file, stream into an unlinked temp file.
    if args.playsound and args.output is None:
        tf, args.output = tempfile.mkstemp(suffix='.wav')
    else:
        tf = None

    with client.audio.speech.with_streaming_response.create(
        model=args.model,
        voice=args.voice,
        speed=args.speed,
        response_format=args.format,
        input=args.input,
    ) as response:
        response.stream_to_file(args.output)

    if args.playsound:
        playsound(args.output)

    if tf:
        os.unlink(args.output)
speech.py (90 changes)
@@ -1,17 +1,26 @@
 #!/usr/bin/env python3
-from pathlib import Path
 import argparse
 import os
 import re
 import subprocess
 import tempfile
 import yaml
-from fastapi import FastAPI
-from fastapi.responses import StreamingResponse, PlainTextResponse
-from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
 
+# for parler
+try:
+    from parler_tts import ParlerTTSForConditionalGeneration
+    from transformers import AutoTokenizer, logging
+    import torch
+    import soundfile as sf
+    logging.set_verbosity_error()
+    has_parler_tts = True
+except ImportError:
+    print("No parler support found")
+    has_parler_tts = False
+
 import openedai
 
 xtts = None
@@ -20,7 +29,6 @@ app = openedai.OpenAIStub()
 
 class xtts_wrapper():
     def __init__(self, model_name, device):
-        global args
         self.model_name = model_name
         self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
 
@@ -38,9 +46,28 @@ class xtts_wrapper():
         os.unlink(file_path)
         return tf
 
+class parler_tts():
+    def __init__(self, model_name, device):
+        self.model_name = model_name
+        self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    def tts(self, text, description):
+        input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
+        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
+
+        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+
+        tf, file_path = tempfile.mkstemp(suffix='.wav')
+        sf.write(file_path, audio_arr, self.model.config.sampling_rate)
+        os.unlink(file_path)
+        return tf
+
+
 # Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
-    with open('pre_process_map.yaml', 'r') as file:
+    with open('pre_process_map.yaml', 'r', encoding='utf8') as file:
         pre_process_map = yaml.safe_load(file)
     for a, b in pre_process_map:
         raw_input = re.sub(a, b, raw_input)
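Worth noting for readers of both `say.py` and the new `parler_tts` class above: `tempfile.mkstemp` returns an open file descriptor along with the path, the audio is written through the path, the path is unlinked immediately, and the descriptor is returned, so the data stays readable until the descriptor is closed (POSIX behavior). A minimal sketch of the pattern, separate from the code above:

```python
import os
import tempfile

# mkstemp() returns an open descriptor plus the path of a fresh file.
fd, path = tempfile.mkstemp(suffix=".wav")

# Write through the path, the way sf.write() does in parler_tts.tts().
with open(path, "wb") as f:
    f.write(b"RIFF demo bytes")  # placeholder, stands in for real audio

# Unlinking removes the name, but the inode survives while fd is open,
# so the descriptor can still be handed off and read from.
os.unlink(path)

with os.fdopen(fd, "rb") as anon:
    print(anon.read(4))  # b'RIFF'
```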
@@ -48,7 +75,7 @@ def preprocess(raw_input):
 
 # Read voice map on demand so it can be changed without restarting the server
 def map_voice_to_speaker(voice: str, model: str):
-    with open('voice_to_speaker.yaml', 'r') as file:
+    with open('voice_to_speaker.yaml', 'r', encoding='utf8') as file:
         voice_map = yaml.safe_load(file)
         return voice_map[model][voice]['model'], voice_map[model][voice]['speaker'],
 
@@ -120,26 +147,38 @@ async def generate_speech(request: GenerateSpeechRequest):
     elif model == 'tts-1-hd':
         tts_model, speaker = map_voice_to_speaker(voice, 'tts-1-hd')
 
-        if not xtts or xtts.model_name != tts_model:
-            if xtts:
-                import torch, gc
-                del xtts
-                gc.collect()
-                torch.cuda.empty_cache()
+        if xtts is not None and xtts.model_name != tts_model:
+            import torch, gc
+            del xtts
+            gc.collect()
+            torch.cuda.empty_cache()
 
-            xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+        if 'parler-tts' in tts_model and has_parler_tts:
+            if not xtts:
+                xtts = parler_tts(tts_model, device=args.xtts_device)
 
-        ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
 
-        # tts speed doesn't seem to work well
-        if speed < 0.5:
-            speed = speed / 0.5
-            ffmpeg_args.extend(["-af", "atempo=0.5"])
-        if speed > 1.0:
-            ffmpeg_args.extend(["-af", f"atempo={speed}"])
-            speed = 1.0
+            if speed != 1:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
 
-        tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+            tts_io_out = xtts.tts(text=input_text, description=speaker)
 
+        else:
+            if not xtts:
+                xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+
+            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
+
+            # tts speed doesn't seem to work well
+            if speed < 0.5:
+                speed = speed / 0.5
+                ffmpeg_args.extend(["-af", "atempo=0.5"])
+            if speed > 1.0:
+                ffmpeg_args.extend(["-af", f"atempo={speed}"])
+                speed = 1.0
+
+            tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
+
 
     # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
@@ -165,7 +204,10 @@ if __name__ == "__main__":
     from TTS.api import TTS
 
     if args.preload:
-        xtts = xtts_wrapper(args.preload, device=args.xtts_device)
+        if 'parler-tts' in args.preload:
+            xtts = parler_tts(args.preload, device=args.xtts_device)
+        else:
+            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
 
     app.register_model('tts-1')
     app.register_model('tts-1-hd')
voice_to_speaker.yaml

@@ -1,4 +1,4 @@
-tts-1:
+tts-1:
   some_other_voice_name_you_want:
     model: voices/choose your own model.onnx
     speaker: set your own speaker
@@ -48,3 +48,9 @@ tts-1-hd:
   me:
     model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
+  parler:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
+  parler2:
+    model: parler-tts/parler_tts_mini_v0.1
+    speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.
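For reference, resolving one of the new entries works the same way `map_voice_to_speaker()` in speech.py does; a small sketch, assuming the file above is saved as `voice_to_speaker.yaml` in the working directory:

```python
import yaml

# The server re-reads this file on every request, so edits to voices
# take effect without a restart.
with open("voice_to_speaker.yaml", "r", encoding="utf8") as f:
    voice_map = yaml.safe_load(f)

model = voice_map["tts-1-hd"]["parler"]["model"]
speaker = voice_map["tts-1-hd"]["parler"]["speaker"]
print(model)         # parler-tts/parler_tts_mini_v0.1
print(speaker[:40])  # start of the speaker description
```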