matatonic 2023-11-26 21:41:59 -05:00
parent 93eb45f6e7
commit e4d001da93
8 changed files with 326 additions and 0 deletions

Dockerfile.min Normal file

@@ -0,0 +1,17 @@
FROM ubuntu:22.04

# tts-1 only
RUN apt-get update && \
    apt-get install --no-install-recommends -y wget ffmpeg python3-pip python3-yaml python3-fastapi python3-uvicorn && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml /app/
WORKDIR /app
RUN pip install piper-tts
# pre-download the default voice models so the container can run without network access
RUN piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
RUN piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null

CMD python3 main.py

README.md Normal file

@@ -0,0 +1,98 @@
openedai API for audio/speech
-----------------------------

This is an API clone of the OpenAI API for text-to-speech audio generation.

This is v0.1, so please excuse the rough docs and configuration.

It currently supports 'tts-1' via piper tts (fast, ~1 second latency) and 'tts-1-hd' via xtts_v2 (slow, and it also uses a couple of gigabytes of GPU VRAM).

Installation instructions:
--------------------------

```pip install -r requirements.txt```
To download voices in advance:

for the tts-1 model:
```shell
piper --update-voices --data-dir voices --download-dir voices --model en_US-libritts_r-medium < /dev/null > /dev/null
piper --data-dir voices --download-dir voices --model en_GB-northern_english_male-medium < /dev/null > /dev/null
```

for tts-1-hd:
```shell
export COQUI_TOS_AGREED=1
tts --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --text "." --language_idx en > /dev/null
```
Run the server; it listens on ```port 8000``` by default:

```python main.py```
API Usage
---------
You can use it like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "model": "tts-1",
    "input": "The quick brown fox jumped over the lazy dog.",
    "voice": "alloy",
    "response_format": "mp3",
    "speed": 1.0
  }' > speech.mp3
```
Or just like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
    "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```
Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):
```python
import openai

client = openai.OpenAI(
    # This part is not needed if you set these environment variables before importing openai
    # export OPENAI_API_KEY=sk-11111111111
    # export OPENAI_BASE_URL=http://localhost:8000/v1
    api_key="sk-111111111",
    base_url="http://localhost:8000/v1",
)

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Today is a wonderful day to build something people love!",
)

response.stream_to_file("speech.mp3")
```
Docker support
--------------
You can run the server via docker like so:
```shell
docker compose build
docker compose up
```
By default it builds a minimal docker image with piper and tts-1 support only. You can edit docker-compose.yml to change this; see the sketch below.
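For example, to build with tts-1-hd/xtts_v2 support instead, you would swap the `dockerfile` line in docker-compose.yml (a minimal sketch of the edit, assuming a full `Dockerfile` with the xtts_v2 dependencies exists alongside `Dockerfile.min`):

```yaml
services:
  server:
    build:
      context: .
      #dockerfile: Dockerfile.min
      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
```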
Voice sounds bad on some words or symbols? Check out ```pre_process_map.yaml``` and add a regular expression to replace it with something that sounds right.
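Each entry in the file is a two-element list, a regex pattern and its replacement, applied in order. A sketch of a hypothetical entry (the pattern and replacement are placeholders to adapt):

```yaml
# speak 'GPU' as individual letters
- - ' GPU '
  - ' G.P.U. '
```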
Want to change the voices or add your own? Check out ```voice_to_speaker.yaml```. I tried to map the voices to something similar to the OpenAI voices, but some are better than others.
If you find a better voice match, please let me know so I can update the defaults.
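For example, a hypothetical new tts-1 voice might look like this (the voice name is yours to pick; the model can be any piper voice model):

```yaml
tts-1:
  mycustomvoice:
    model: voices/en_US-ryan-high.onnx # any piper voice model in the voices/ dir
    speaker: # blank for single-speaker models, or a numeric speaker id
```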
Voice models for tts-1-hd/xtts2 are incomplete; you can add your own WAV file samples to create more voices, see alloy.wav for a sample.
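A hypothetical cloned voice for tts-1-hd would follow the same pattern, pointing speaker at your own WAV sample (the voice and file names here are placeholders):

```yaml
tts-1-hd:
  myclonedvoice:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/myclonedvoice.wav # a short, clean speech sample to clone
```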

docker-compose.yml Normal file

@@ -0,0 +1,20 @@
version: "3.3"
services:
  server:
    build:
      context: .
      dockerfile: Dockerfile.min
      #dockerfile: Dockerfile # for tts-1-hd support via xtts_v2
    stdin_open: true
    tty: true
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              #device_ids: ['0', '1']
              capabilities: [gpu]
    ports:
      - "8000:8000"

main.py Executable file

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
import subprocess
import yaml
import re
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel

piper_cuda = False # onnxruntime-gpu not working for me, but cpu is fast enough

app = FastAPI()

def preprocess(raw_input):
    # apply the regex pairs from pre_process_map.yaml, in order
    with open('pre_process_map.yaml', 'r') as file:
        pre_process_map = yaml.safe_load(file)
    for a, b in pre_process_map:
        raw_input = re.sub(a, b, raw_input)
    return raw_input

def map_voice_to_speaker(voice: str, model: str):
    with open('voice_to_speaker.yaml', 'r') as file:
        voice_map = yaml.safe_load(file)
    return voice_map[model][voice]['model'], voice_map[model][voice]['speaker']

class GenerateSpeechRequest(BaseModel):
    model: str = "tts-1" # or "tts-1-hd"
    input: str
    voice: str = "alloy" # alloy, echo, fable, onyx, nova, and shimmer
    response_format: str = "mp3" # mp3, opus, aac, flac
    speed: float = 1.0 # 0.25 - 4.0

@app.post("/v1/audio/speech") #, response_model=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
    input_text = preprocess(request.input)
    model = request.model
    voice = request.voice
    response_format = request.response_format
    speed = request.speed

    # Set the Content-Type header based on the requested format
    if response_format == "mp3":
        media_type = "audio/mpeg"
    elif response_format == "opus":
        media_type = "audio/ogg;codecs=opus"
    elif response_format == "aac":
        media_type = "audio/aac"
    elif response_format == "flac":
        media_type = "audio/x-flac"

    # Convert the output to the desired format using ffmpeg
    ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "s16le", "-ar", "22050", "-ac", "1", "-i", "-"]
    if response_format == "mp3":
        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"]) # 32k or 64k?
    elif response_format == "opus":
        ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
    elif response_format == "aac":
        ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
    elif response_format == "flac":
        ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
    #"-hwaccel:auto"

    tts_args = []
    tts_proc = None
    if model == 'tts-1':
        piper_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
        if piper_cuda:
            tts_args.extend(["--cuda"])
        if speaker:
            tts_args.extend(["--speaker", str(speaker)])
        if speed != 1.0:
            tts_args.extend(["--length-scale", f"{1.0/speed}"]) # piper speaks faster with a smaller length scale
        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))

    elif model == 'tts-1-hd':
        #tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False).to(device)
        #tts.tts_to_file(text=ttstext, file_path=output_filename, speaker_wav=self.speaker_wav)
        tts_model, speaker = map_voice_to_speaker(voice, model)
        tts_args = ["tts", "--text", input_text, "--use_cuda", "USE_CUDA", "--model_name", str(tts_model), "--language_idx", "en", "--pipe_out"]
        if speaker:
            tts_args.extend(["--speaker_wav", str(speaker)])
        if speed > 2.0: # tts has a max speed of 2.0, handle the rest with ffmpeg's atempo filter
            ffmpeg_args.extend(["-af", "atempo=2.0"])
            speed = min(speed / 2.0, 2.0)
        if speed != 1.0:
            tts_args.extend(["--speed", str(speed)])
        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    # Pipe the output from the tts engine to the input of ffmpeg
    ffmpeg_args.extend(["-"])
    ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
    tts_proc.stdin.close()
    #print(" ".join(tts_args))
    #print(" ".join(ffmpeg_args))

    return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000) # , root_path=cwd, access_log=False, log_level="info", ssl_keyfile="cert.pem", ssl_certfile="cert.pem")

pre_process_map.yaml Normal file

@@ -0,0 +1,39 @@
# regex pairs to clean the text before speaking
- - ([^.])\.$
  - \1
- - '&amp;'
  - '&'
- - '&lt;'
  - <
- - '&gt;'
  - '>'
- - '&quot;'
  - '"'
- - '&#x27;'
  - ''''
- - '&copy;'
  - '©'
- - '&reg;'
  - '®'
- - '&nbsp;'
  - ' '
- - '"'
  - ''
- - ' biases '
  - ' bias''s '
- - ex\.
  - for example
- - e\.g\.
  - for example
- - ' ESG '
  - ' E.S.G. '
- - ' FY '
  - ' F.Y. '
- - ([0-9]+)-([0-9]+)
  - \1 to \2
- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
  - ''
- - '\*\*\*'
  - '*'
- - '\*\*'
  - '*'

requirements.txt Normal file

@@ -0,0 +1,5 @@
fastapi
uvicorn
piper-tts
TTS
#onnxruntime-gpu #not needed

voice_to_speaker.yaml Normal file

@@ -0,0 +1,41 @@
tts-1:
  some_other_voice_name_you_want:
    model: voices/choose your own model.onnx
    speaker: set your own speaker
  alloy:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 118 # 118, 64, 79, 80, 101, 130
  echo:
    model: voices/en_US-libritts_r-medium.onnx # or en_US-ryan-high:0
    speaker: 52 # 52, 102, 134
  fable:
    model: voices/en_GB-northern_english_male-medium.onnx
    speaker: # default speaker
  onyx:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 55 # 55, 72, 90, 104, 132, 136, 137, 159
  nova:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 57 # 57, 61, 107, 150, 162
  shimmer:
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
tts-1-hd:
  alloy:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/alloy.wav
  echo:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/echo.wav # TODO
  fable:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/fable.wav # TODO
  onyx:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/onyx.wav # TODO
  nova:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/nova.wav # TODO
  shimmer:
    model: tts_models/multilingual/multi-dataset/xtts_v2
    speaker: voices/shimmer.wav # TODO

voices/alloy.wav Normal file
Binary file not shown.