0.6.0 rc1

matatonic 2023-11-27 23:25:44 -05:00
parent 603ee9d54e
commit ba08f6e8f3
6 changed files with 19 additions and 16 deletions

Dockerfile

@@ -1,12 +1,13 @@
FROM nvidia/cuda:11.8.0-base-ubuntu22.04
RUN apt-get update && \
apt-get install --no-install-recommends -y python-is-python3 python3-pip ffmpeg && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip && \
apt-get clean && rm -rf /var/lib/apt/lists/*
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
RUN pip install -r requirements.txt

Dockerfile.min

@@ -1,8 +1,7 @@
FROM ubuntu:22.04
# tts-1 only
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
apt-get install --no-install-recommends -y ffmpeg curl python-is-python3 python3-pip python3-yaml python3-fastapi python3-uvicorn && \
apt-get clean && rm -rf /var/lib/apt/lists/*
RUN pip install piper-tts
@@ -10,6 +9,7 @@ RUN pip install piper-tts
#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
RUN ./download_voices_tts-1.sh

README.md

@@ -40,8 +40,8 @@ Installation instructions
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg
sudo apt install ffmpeg
# install ffmpeg & curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
bash download_voices_tts-1.sh
@@ -120,4 +120,4 @@ docker compose build
docker compose up
```
If you want a minimal docker image with piper only (see: Dockerfile.min). You can edit the `docker-compose.yml` to change this.
If you want a minimal docker image with piper support only (900MB vs. 13GB), see `Dockerfile.min`; you can edit the `docker-compose.yml` to change this.
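
Once the server is up (for example after `docker compose up`), it can be exercised with a short client request. A minimal sketch, assuming the OpenAI-compatible `POST /v1/audio/speech` route on port 8000 (the port mapped in `docker-compose.yml`) and the standard `model`/`input`/`voice` payload; the exact field names are assumptions, not taken from this commit:

```python
# Hypothetical smoke test for the running server; route, port and payload
# fields are assumptions based on the OpenAI speech API this project mimics.
import requests

resp = requests.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "tts-1",                       # "tts-1-hd" routes to xtts_v2
        "input": "Hello from openedai-speech.",
        "voice": "alloy",
    },
)
resp.raise_for_status()

with open("speech.mp3", "wb") as out:
    out.write(resp.content)                     # body is the generated audio
```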

docker-compose.yml

@@ -3,14 +3,14 @@ services:
server:
build:
context: .
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~900MB
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, 13GB
stdin_open: true
tty: true
ports:
- "8000:8000"
volumes:
- .:/app/
# volumes:
# - .:/app/
# Below can be removed if not using GPU
runtime: nvidia
deploy:

main.py

@@ -10,7 +10,6 @@ from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel
from TTS.api import TTS
xtts = None
args = None
@@ -31,7 +30,7 @@ class xtts_wrapper():
speed=speed,
file_path=file_path,
)
os.unlink(file_path)
return tf
@@ -104,7 +103,7 @@ async def generate_speech(request: GenerateSpeechRequest):
tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
tts_proc.stdin.close()
tts_io_out = tts_proc.stdout
# Use xtts_v2 for tts-1-hd
elif model == 'tts-1-hd':
if not xtts:
@@ -115,13 +114,13 @@
# tts speed doesn't seem to work well
if speed < 0.5:
speed = speed / 0.5
ffmpeg_args.extend(["-af", f"atempo=0.5"])
ffmpeg_args.extend(["-af", "atempo=0.5"])
if speed > 1.0:
ffmpeg_args.extend(["-af", f"atempo={speed}"])
speed = 1.0
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed)
# Pipe the output from piper/xtts to the input of ffmpeg
ffmpeg_args.extend(["-"])
ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_io_out, stdout=subprocess.PIPE)
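
The speed handling in the hunk above splits the requested playback rate between the xtts `speed` parameter and ffmpeg's `atempo` filter, since (per the code comment) the xtts speed alone doesn't behave well across the whole range. A minimal sketch of that factor splitting, written as a hypothetical helper rather than code from main.py:

```python
# Illustrative helper (not part of main.py): split a requested playback rate
# into the part xtts renders and the part ffmpeg's atempo filter applies.
def split_speed(speed: float):
    """Return (xtts_speed, atempo_factor or None) for a requested rate."""
    if speed < 0.5:
        # xtts renders at twice the target rate, atempo=0.5 halves it back
        return speed / 0.5, 0.5
    if speed > 1.0:
        # xtts renders at normal rate, atempo speeds the audio up
        return 1.0, speed
    return speed, None

# 0.5 * 0.5 == 0.25 overall; 1.0 * 2.0 == 2.0 overall
assert split_speed(0.25) == (0.5, 0.5)
assert split_speed(2.0) == (1.0, 2.0)
assert split_speed(0.8) == (0.8, None)
```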
@@ -134,7 +133,7 @@ if __name__ == "__main__":
prog='main.py',
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--piper_cuda', action='store_true', default=False, help="Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough")
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload_xtts', action='store_true', default=False, help="Preload the xtts model. By default it's loaded on first use.")
@@ -143,6 +142,9 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.xtts_device != "none":
from TTS.api import TTS
if args.preload_xtts:
xtts = xtts_wrapper("tts_models/multilingual/multi-dataset/xtts_v2")
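
The last hunk defers `from TTS.api import TTS` until `--xtts_device` is set to something other than `none`, so piper-only deployments (e.g. Dockerfile.min) never import the heavy Coqui TTS stack. A sketch of the same lazy-load pattern with illustrative names (`load_xtts` is not a function in main.py; the `TTS(...).to(device)` call follows Coqui's documented usage as I understand it):

```python
# Illustrative lazy-load pattern: import and build the expensive xtts model
# only on first use, then reuse the cached instance.
_xtts = None

def load_xtts(device: str = "cuda"):
    """Import Coqui TTS and construct xtts_v2 on first call."""
    global _xtts
    if _xtts is None:
        from TTS.api import TTS  # deferred import: skipped on piper-only setups
        _xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return _xtts
```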

test_voices.sh Normal file → Executable file