Mirror of https://github.com/matatonic/openedai-speech, synced 2025-06-26 18:16:32 +00:00

Commit ba08f6e8f3 (parent 603ee9d54e): 0.6.0 rc1
Dockerfile
@@ -1,12 +1,13 @@
 FROM nvidia/cuda:11.8.0-base-ubuntu22.04

 RUN apt-get update && \
-    apt-get install --no-install-recommends -y python-is-python3 python3-pip ffmpeg && \
+    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
     apt-get clean && rm -rf /var/lib/apt/lists/*

 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app

 RUN pip install -r requirements.txt
Dockerfile.min
@@ -1,8 +1,7 @@
 FROM ubuntu:22.04

 # tts-1 only
 RUN apt-get update && \
-    apt-get install --no-install-recommends -y ffmpeg python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
+    apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
     apt-get clean && rm -rf /var/lib/apt/lists/*

 RUN pip install piper-tts
@@ -10,6 +9,7 @@ RUN pip install piper-tts
 #RUN git clone https://github.com/matatonic/openedai-speech /app
 RUN mkdir -p /app/voices
 COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
 COPY voices/alloy-alt.wav /app/voices/
 WORKDIR /app

 RUN ./download_voices_tts-1.sh
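Both Dockerfiles now pull in curl alongside ffmpeg. For building outside of docker compose, something like the following should work (the image tags are arbitrary placeholders, not names used by the repo):

```shell
# Full image (piper + xtts_v2); expects an NVIDIA GPU at runtime
docker build -t openedai-speech .

# Minimal piper-only image (~900MB instead of ~13GB), no GPU required
docker build -f Dockerfile.min -t openedai-speech-min .
```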
README.md
@@ -40,8 +40,8 @@ Installation instructions
 ```shell
 # Install the Python requirements
 pip install -r requirements.txt
-# install ffmpeg
-sudo apt install ffmpeg
+# install ffmpeg & curl
+sudo apt install ffmpeg curl
 # Download the voice models:
 # for tts-1
 bash download_voices_tts-1.sh
@@ -120,4 +120,4 @@ docker compose build
 docker compose up
 ```

-If you want a minimal docker image with piper only (see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
+If you want a minimal docker image with piper support only (900MB vs. 13GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
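curl, now installed in both images, is also a convenient way to exercise the running server. A hedged example, assuming the server exposes the OpenAI-compatible /v1/audio/speech route on the published port 8000 and that the default response format is mp3:

```shell
# Request speech for a short text with the tts-1 model and the alloy voice
curl -s http://localhost:8000/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "input": "Hello from openedai-speech.", "voice": "alloy"}' \
  -o speech.mp3
```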
docker-compose.yml
@@ -3,14 +3,14 @@ services:
   server:
     build:
       context: .
-      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required
-      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required
+      #dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
+      dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
     stdin_open: true
     tty: true
     ports:
       - "8000:8000"
-    volumes:
-      - .:/app/
+#    volumes:
+#      - .:/app/
     # Below can be removed if not using GPU
     runtime: nvidia
     deploy:
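The `runtime: nvidia` setting only takes effect when the NVIDIA container toolkit is installed on the host. A quick, hedged way to confirm the GPU is actually visible from the `server` service (service name taken from the compose file above):

```shell
docker compose up -d
docker compose exec server nvidia-smi   # should list the GPU if the nvidia runtime is active
docker compose logs -f server           # follow the API server as it starts
```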
main.py (14 changes)
@@ -10,7 +10,6 @@ from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
-from TTS.api import TTS

 xtts = None
 args = None
@@ -31,7 +30,7 @@ class xtts_wrapper():
             speed=speed,
             file_path=file_path,
         )

         os.unlink(file_path)
         return tf
@@ -104,7 +103,7 @@ async def generate_speech(request: GenerateSpeechRequest):
         tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
         tts_proc.stdin.close()
         tts_io_out = tts_proc.stdout

     # Use xtts_v2 for tts-1-hd
     elif model == 'tts-1-hd':
         if not xtts:
@@ -115,13 +114,13 @@ async def generate_speech(request: GenerateSpeechRequest):
         # tts speed doesn't seem to work well
         if speed < 0.5:
             speed = speed / 0.5
-            ffmpeg_args.extend(["-af", f"atempo=0.5"])
+            ffmpeg_args.extend(["-af", "atempo=0.5"])
         if speed > 1.0:
             ffmpeg_args.extend(["-af", f"atempo={speed}"])
             speed = 1.0

         tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)

     # Pipe the output from piper/xtts to the input of ffmpeg
     ffmpeg_args.extend(["-"])
     ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
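For context on the speed handling above: xtts speed alone is unreliable, so a request below 0.5x is split between xtts (which receives speed / 0.5) and an ffmpeg `atempo=0.5` filter, while a request above 1.0x is handed entirely to `atempo` and the xtts speed is reset to 1.0. ffmpeg's atempo filter cannot go below 0.5 in a single instance, which is presumably why the remainder is pushed into xtts. A standalone sketch of the filter with placeholder file names:

```shell
# Halve the tempo of a clip (the server combines this with xtts speed for <0.5x requests)
ffmpeg -i input.wav -af atempo=0.5 half_speed.wav

# Speed a clip up by 1.5x (the >1.0x path in the code above)
ffmpeg -i input.wav -af atempo=1.5 faster.wav
```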
@@ -134,7 +133,7 @@ if __name__ == "__main__":
         prog='main.py',
         description='OpenedAI Speech API Server',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)

     parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
     parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
     parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
@@ -143,6 +142,9 @@ if __name__ == "__main__":

     args = parser.parse_args()

+    if args.xtts_device != "none":
+        from TTS.api import TTS
+
     if args.preload_xtts:
         xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")

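With the module-level `from TTS.api import TTS` removed (first hunk) and re-added behind the `--xtts_device` check here, the Coqui TTS package is only imported when xtts is actually enabled, so a piper-only setup (as in Dockerfile.min) never needs it. Roughly how the two modes are selected, using only the flags defined above:

```shell
# piper-only mode: TTS.api is never imported, no GPU or Coqui TTS install needed
python main.py --xtts_device none

# GPU mode, loading the xtts_v2 model at startup instead of on first use
python main.py --xtts_device cuda --preload_xtts
```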
test_voices.sh (0 lines changed): Normal file → Executable file