Merge pull request #21 from matatonic/dev

0.13.0
matatonic 2024-06-25 17:28:02 -04:00 committed by GitHub
commit 65c03e3448
20 changed files with 492 additions and 161 deletions


@@ -55,6 +55,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -66,8 +67,9 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-and-push-alt-image:
build-and-push-min-image:
runs-on: ubuntu-latest
permissions:
@@ -113,6 +115,7 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
@@ -124,4 +127,70 @@ jobs:
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-and-push-rocm-image:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
env:
# Set up environment variables for the job
USE_ROCM: 1
DOCKER_REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}-rocm
TAG: ${{ github.sha }}
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true
# Log in to the GitHub Container Registry only when not running on a pull request event
- name: Login to Docker Registry
uses: docker/login-action@v2
with:
registry: ${{ env.DOCKER_REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4
with:
images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
# Build and push the Docker image to GHCR for the main branch or specific tags
- name: Build and Push Docker Image
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
# For tagged releases, build and push the Docker image with the corresponding tag
- name: Build and Push Docker Image (Tagged)
if: startsWith(github.ref, 'refs/tags/')
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
labels: version=${{ github.run_id }}
platforms: linux/amd64,linux/arm64
build-args: |
USE_ROCM=1
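
Once this workflow runs on `main`, the ROCm image should be pullable from GHCR (a sketch; the name follows the `IMAGE_NAME` pattern above and the tag mirrors the other jobs):

```shell
# hypothetical pull of the image produced by the job above
docker pull ghcr.io/matatonic/openedai-speech-rocm:latest
```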


@@ -1,23 +1,28 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg git && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN mkdir -p voices config
COPY requirements.txt /app/
ARG USE_ROCM
ENV USE_ROCM=${USE_ROCM}
COPY requirements*.txt /app/
RUN if [ "${USE_ROCM}" = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
ARG PRELOAD_MODEL
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
ENV COQUI_TOS_AGREED=1
CMD bash startup.sh
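
To reproduce a multi-arch build like the workflow's on a local machine, something like this should work (a sketch, assuming Docker Buildx is installed; the `-t` tags are illustrative):

```shell
# build for both platforms listed in the workflow; add --push to publish
docker buildx build --platform linux/amd64,linux/arm64 -t openedai-speech:local .
# pass the new build-arg for the ROCm variant
docker buildx build --build-arg USE_ROCM=1 -t openedai-speech:rocm-local .
```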


@@ -1,20 +1,20 @@
FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y curl ffmpeg && \
apt-get clean && rm -rf /var/lib/apt/lists/*
ARG TARGETPLATFORM
RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
WORKDIR /app
RUN mkdir -p voices config
RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn loguru numpy\<2
COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
COPY requirements*.txt /app/
RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
CMD bash startup.min.sh

README.md

@@ -19,6 +19,7 @@ Details:
* Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
* Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
* 🌐 [Multilingual](#multilingual) support with XTTS voices
* [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml`
@@ -26,6 +27,16 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
## Recent Changes
Version 0.13.0, 2024-06-25
* Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi; MPS is not supported in XTTS/torch), thanks @JakeStevenson, @hchasens
* Initial attempt at AMD GPU (ROCm 5.7) support
* Parler-tts support removed
* Move the *.default.yaml to the root folder
* Run the docker as a service by default (`restart: unless-stopped`)
* Added `audio_reader.py` for streaming text input and reading long texts
Version 0.12.3, 2024-06-17
* Additional logging details for BadRequests (400)
@@ -75,23 +86,24 @@ Version: 0.7.3, 2024-03-20
## Installation instructions
1) Copy the `sample.env` to `speech.env` (customize if needed)
### Create a `speech.env` environment file
Copy the `sample.env` to `speech.env` (customize if needed)
```bash
cp sample.env speech.env
```
2. Option: Docker (**recommended**) (prebuilt images are available)
Run the server:
```shell
docker compose up
#### Defaults
```bash
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#EXTRA_ARGS=--log-level DEBUG
#USE_ROCM=1
```
For a minimal docker image with only piper support (<1GB vs. 8GB), use `docker compose -f docker-compose.min.yml up`
To install the docker image as a service, edit the `docker-compose.yml` and uncomment `restart: unless-stopped`, then start the service with: `docker compose up -d`
2. Option: Manual installation:
### Option A: Manual installation
```shell
# install curl and ffmpeg
sudo apt install curl ffmpeg
@@ -99,38 +111,43 @@ sudo apt install curl ffmpeg
python -m venv .venv
source .venv/bin/activate
# Install the Python requirements
# - use requirements-rocm.txt for AMD GPU (ROCm support)
# - use requirements-min.txt for piper only (CPU only)
pip install -r requirements.txt
# run the server
bash startup.sh
```
> On first run, the voice models will be downloaded automatically. This might take a while depending on your network connection.
## Usage
### Option B: Docker Image (*recommended*)
```
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the log level (default: INFO)
#### Nvidia GPU (cuda)
```shell
docker compose up
```
## API Documentation
#### AMD GPU (ROCm support)
* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
```shell
docker compose -f docker-compose.rocm.yml up
```
#### ARM64 (Apple M-series, Raspberry Pi)
> XTTS only has CPU support here and will be very slow; you can use the Nvidia image for XTTS on CPU (slow), or use the piper-only image (recommended)
#### CPU only, No GPU (piper only)
> For a minimal docker image with only piper support (<1GB vs. 8GB).
```shell
docker compose -f docker-compose.min.yml up
```
### Sample API Usage
## Sample Usage
You can use it like this:
@@ -147,7 +164,7 @@ curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -
Or just like this:
```shell
curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
"input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```
@@ -175,33 +192,24 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.
```shell
python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
# play the audio, requires 'pip install playsound'
python say.py -t "The quick brown fox jumped over the lazy dog." -p
# save to a file in flac format
python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac
```
You can also try the included `audio_reader.py` for listening to longer text and streamed input.
Example usage:
```bash
python audio_reader.py -s 2 < LICENSE # read the software license - fast
```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
## OpenAI API Documentation and Guide
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
```
## Custom Voices Howto
@@ -251,13 +259,13 @@ For example:
...
tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts versions
model: xtts
speaker: voices/me.wav # this could be you
```
## Multilingual
Multilingual support was added in version 0.11.0 and is available only with the XTTS v2 model.
Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper, simply download a language-specific voice.
Coqui XTTSv2 has support for 16 languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Japanese (`ja`), Hungarian (`hu`) and Korean (`ko`).
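
The `add_voice.py` helper added in this commit can register a cloned voice for a specific language (a sketch; `voices/marie.wav` is a hypothetical sample file):

```shell
# adds a 'marie' entry with 'language: fr' under tts-1-hd
# in config/voice_to_speaker.yaml
python add_voice.py voices/marie.wav -n marie --language fr
```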
@@ -284,3 +292,24 @@ Remove:
These lines were added to the `config/pre_process_map.yaml` config file by default before version 0.11.0:
4) Your new multi-lingual speaker voice is ready to use!
## Custom Fine-Tuned Model Support
Adding a custom xtts model is simple. Here is an example of how to add a custom fine-tuned 'halo' XTTS model.
1) Save the model folder under `voices/` (all 4 files are required, including the vocab.json from the model)
```
openedai-speech$ ls voices/halo/
config.json vocab.json model.pth sample.wav
```
2) Add the custom voice entry under the `tts-1-hd` section of `config/voice_to_speaker.yaml`:
```yaml
tts-1-hd:
...
halo:
model: halo # This name is required to be unique
speaker: voices/halo/sample.wav # voice sample is required
model_path: voices/halo
```
3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
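
Once loaded, the custom voice is used like any other (a sketch following the curl examples above, with `halo` from step 2):

```shell
curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
  "model": "tts-1-hd",
  "voice": "halo",
  "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
```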

add_voice.py Executable file

@@ -0,0 +1,63 @@
#!/usr/bin/env python
import argparse
import os
import shutil
import yaml
print("!! WARNING EXPERIMENTAL !! - THIS TOOL WILL ERASE ALL COMMENTS FROM THE CONFIG FILES .. OR WORSE!!")
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('sample', action='store', help="Set the wav sample file")
parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)")
parser.add_argument('-l', '--language', action='store', default="en", help="Set the language for the voice",
choices=['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'])
parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice")
parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)")
parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model")
parser.add_argument('--config-path', action='store', default="config/voice_to_speaker.yaml", help="Set the config file path")
parser.add_argument('--voice-path', action='store', default="voices", help="Set the default voices file path")
parser.add_argument('--default-path', action='store', default="voice_to_speaker.default.yaml", help="Set the default config file path")
args = parser.parse_args()
basename = os.path.basename(args.sample)
name_noext, ext = os.path.splitext(basename)
if not args.name:
args.name = name_noext
else:
basename = f"{args.name}.wav"
dest_file = os.path.join(args.voice_path, basename)
if args.sample != dest_file:
shutil.copy2(args.sample, dest_file)
if not os.path.exists(args.config_path):
shutil.copy2(args.default_path, args.config_path)
with open(args.config_path, 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
model_conf = voice_map.get(args.openai_model, {})
model_conf[args.name] = {
'model': args.xtts_model,
'speaker': os.path.join(args.voice_path, basename),
'language': args.language,
}
if args.model_path:
model_conf[args.name]['model_path'] = args.model_path
voice_map[args.openai_model] = model_conf
with open(args.config_path, 'w', encoding='utf8') as ofile:
yaml.safe_dump(voice_map, ofile, default_flow_style=False, allow_unicode=True)
print(f"Updated: {args.config_path}")
print(f"Added voice: {args.openai_model}/{args.name}")
print(f"Added section:")
print(f"{args.openai_model}:")
print(f" {args.name}:")
print(f" model: {model_conf[args.name]['model']}")
print(f" speaker: {model_conf[args.name]['speaker']}")
print(f" language: {model_conf[args.name]['language']}")

audio_reader.py Executable file

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
try:
import dotenv
dotenv.load_dotenv()
except ImportError:
pass
import argparse
import os
import pysbd
import queue
import sys
import tempfile
import threading
import shutil
import contextlib
import openai
try:
from playsound import playsound
except ImportError:
print("Error: missing required package 'playsound'. !pip install playsound")
sys.exit(1)
@contextlib.contextmanager
def tempdir():
path = tempfile.mkdtemp()
try:
yield path
finally:
try:
shutil.rmtree(path)
except IOError:
sys.stderr.write('Failed to clean up temp dir {}'.format(path))
class SimpleAudioPlayer:
def __init__(self):
self._queue = queue.Queue()
self.running = True
self._thread = threading.Thread(target=self.__play_audio_loop, daemon=True)
self._thread.start()
def put(self, file):
self._queue.put(file)
def stop(self):
self.running = False
self._thread.join()
try:
while True:
file = self._queue.get_nowait()
if os.path.exists(file):
os.unlink(file)
except queue.Empty as e:
pass
def __play_audio_loop(self):
while self.running:
try:
while True:
file = self._queue.get(block=True, timeout=0.01)
try:
playsound(file)
finally:
os.unlink(file)
except queue.Empty as e:
continue
class OpenAI_tts:
def __init__(self, model, voice, speed, base_dir):
self.base_dir = base_dir
self.openai_client = openai.OpenAI(
# export OPENAI_API_KEY=sk-11111111111
# export OPENAI_BASE_URL=http://localhost:8000/v1
api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
)
self.params = {
'model': model,
'voice': voice,
'speed': speed
}
def speech_to_file(self, text: str) -> str:
with self.openai_client.audio.speech.with_streaming_response.create(
input=text, response_format='opus', **self.params
) as response:
tf, output_filename = tempfile.mkstemp(suffix='.wav', prefix="audio_reader_", dir=self.base_dir)
response.stream_to_file(output_filename)
return output_filename
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Text to speech player',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--model', action='store', default="tts-1", help="The OpenAI model")
parser.add_argument('-v', '--voice', action='store', default="alloy", help="The voice to use")
parser.add_argument('-s', '--speed', action='store', default=1.0, type=float, help="How fast to read the audio")
args = parser.parse_args()
try:
with tempdir() as base_dir:
player = SimpleAudioPlayer()
reader = OpenAI_tts(voice=args.voice, model=args.model, speed=args.speed, base_dir=base_dir)
seg = pysbd.Segmenter(language='en', clean=True) # text is dirty, clean it up.
for raw_line in sys.stdin:
for line in seg.segment(raw_line):
if not line:
continue
print(line)
player.put(reader.speech_to_file(line))
player.stop()
except KeyboardInterrupt:
pass
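
Besides the LICENSE example in the README, any text stream works (a sketch; options per the argument parser above):

```shell
# read a long document aloud with the hd model and the onyx voice
python audio_reader.py -m tts-1-hd -v onyx < README.md
```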


@@ -10,4 +10,4 @@ services:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
#restart: unless-stopped
restart: unless-stopped

docker-compose.rocm.yml Normal file

@@ -0,0 +1,27 @@
services:
server:
build:
dockerfile: Dockerfile
args:
- USE_ROCM=1
image: ghcr.io/matatonic/openedai-speech-rocm
env_file: speech.env
ports:
- "8000:8000"
volumes:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
restart: unless-stopped
# For AMD GPU (ROCm) Support
cap_add:
- SYS_PTRACE
devices:
- /dev/kfd
- /dev/dri
security_opt:
- seccomp=unconfined
group_add:
- video
- audio
ipc: host
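
To start the service defined above (a sketch; `up -d` keeps it running in the background, matching `restart: unless-stopped`):

```shell
docker compose -f docker-compose.rocm.yml up -d
```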


@@ -10,9 +10,7 @@ services:
- ./voices:/app/voices
- ./config:/app/config
# To install as a service
#restart: unless-stopped
# Set nvidia runtime if it's not the default
#runtime: nvidia
restart: unless-stopped
deploy:
resources:
reservations:


@@ -1,4 +1,4 @@
# regex pairs to clean the text before speaking
# regex pairs to clean the text before speaking
- - ([^.])\.$
- \1
- - '&amp;'

requirements-min.txt Normal file

@@ -0,0 +1,6 @@
pyyaml
fastapi
uvicorn
loguru
numpy<2
piper-tts==1.2.0

requirements-rocm.txt Normal file

@@ -0,0 +1,14 @@
fastapi
uvicorn
loguru
# piper-tts
piper-tts==1.2.0
# xtts
TTS
# XXX, 3.8+ has some issue for now
spacy==3.7.4
# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
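
A note on the two torch lines above: placing `--index-url` inside the `;` environment-marker section is not standard requirements-file syntax, so pip may reject these lines. If installation fails, a more conventional sketch is to install the ROCm wheels explicitly first (assuming Linux with ROCm 5.7):

```shell
pip install torch==2.2.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/rocm5.7
# then the rest (with the two torch lines removed from requirements-rocm.txt)
pip install -r requirements-rocm.txt
```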


@@ -5,10 +5,15 @@ loguru
piper-tts==1.2.0
# xtts
TTS
# Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2
# XXX, 3.8+ has some issue for now
spacy==3.7.4
# parler-tts
git+https://github.com/huggingface/parler-tts.git
# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
# Re: https://github.com/pytorch/pytorch/issues/121834
torch==2.2.2; sys_platform != "darwin"
torchaudio; sys_platform != "darwin"
# for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio
torch==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
# ROCm (Linux only) - use requirements-rocm.txt


@@ -2,4 +2,4 @@ TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
#EXTRA_ARGS=--log-level DEBUG
#USE_ROCM=1

speech.py

@@ -11,71 +11,52 @@ import uvicorn
from pydantic import BaseModel
from loguru import logger
# for parler
try:
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, logging
import torch
import soundfile as sf
logging.set_verbosity_error()
has_parler_tts = True
except ImportError:
logger.info("No parler support found")
has_parler_tts = False
from openedai import OpenAIStub, BadRequestError
from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError
xtts = None
args = None
app = OpenAIStub()
class xtts_wrapper():
def __init__(self, model_name, device):
def __init__(self, model_name, device, model_path=None):
self.model_name = model_name
self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
logger.info(f"Loading model {self.model_name} to {device}")
if model_path: # custom model # and config_path
config_path=os.path.join(model_path, 'config.json')
self.xtts = TTS(model_path=model_path, config_path=config_path).to(device)
else:
self.xtts = TTS(model_name=model_name).to(device)
def tts(self, text, speaker_wav, speed, language):
tf, file_path = tempfile.mkstemp(suffix='.wav')
tf, file_path = tempfile.mkstemp(suffix='.wav', prefix='openedai-speech-')
file_path = self.xtts.tts_to_file(
text=text,
language=language,
speaker_wav=speaker_wav,
speed=speed,
file_path=file_path,
)
try:
# TODO: support speaker= as voice id instead of just wav
file_path = self.xtts.tts_to_file(
text=text,
language=language,
speaker_wav=speaker_wav,
speed=speed,
file_path=file_path,
)
finally:
os.unlink(file_path)
os.unlink(file_path)
return tf
class parler_tts():
def __init__(self, model_name, device):
self.model_name = model_name
self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def tts(self, text, description):
input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
tf, file_path = tempfile.mkstemp(suffix='.wav')
sf.write(file_path, audio_arr, self.model.config.sampling_rate)
os.unlink(file_path)
return tf
def default_exists(filename: str):
if not os.path.exists(filename):
basename, ext = os.path.splitext(filename)
fpath, ext = os.path.splitext(filename)
basename = os.path.basename(fpath)
default = f"{basename}.default{ext}"
logger.info(f"{filename} does not exist, setting defaults from {default}")
with open(default, 'r') as from_file:
with open(filename, 'w') as to_file:
with open(default, 'r', encoding='utf8') as from_file:
with open(filename, 'w', encoding='utf8') as to_file:
to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server
@@ -97,14 +78,10 @@ def map_voice_to_speaker(voice: str, model: str):
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
try:
m = voice_map[model][voice]['model']
s = voice_map[model][voice]['speaker']
l = voice_map[model][voice].get('language', 'en')
return voice_map[model][voice]
except KeyError as e:
raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
return (m, s, l)
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
@@ -162,7 +139,15 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use piper for tts-1, and if xtts_device == none use for all models.
if model == 'tts-1' or args.xtts_device == 'none':
piper_model, speaker, not_used_language = map_voice_to_speaker(voice, 'tts-1')
voice_map = map_voice_to_speaker(voice, 'tts-1')
try:
piper_model = voice_map['model']
except KeyError as e:
raise ServiceUnavailableError(f"Configuration error: tts-1 voice '{voice}' is missing 'model:' setting. KeyError: {e}")
speaker = voice_map.get('speaker', None)
tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
if speaker:
tts_args.extend(["--speaker", str(speaker)])
@@ -177,7 +162,16 @@ async def generate_speech(request: GenerateSpeechRequest):
# Use xtts for tts-1-hd
elif model == 'tts-1-hd':
tts_model, speaker, language = map_voice_to_speaker(voice, 'tts-1-hd')
voice_map = map_voice_to_speaker(voice, 'tts-1-hd')
try:
tts_model = voice_map['model']
speaker = voice_map['speaker']
except KeyError as e:
raise ServiceUnavailableError(f"Configuration error: tts-1-hd voice '{voice}' is missing setting. KeyError: {e}")
language = voice_map.get('language', 'en')
tts_model_path = voice_map.get('model_path', None)
if xtts is not None and xtts.model_name != tts_model:
import torch, gc
@@ -186,20 +180,9 @@
gc.collect()
torch.cuda.empty_cache()
if 'parler-tts' in tts_model and has_parler_tts:
if xtts is None:
xtts = parler_tts(tts_model, device=args.xtts_device)
ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
if speed != 1:
ffmpeg_args.extend(["-af", f"atempo={speed}"])
tts_io_out = xtts.tts(text=input_text, description=speaker)
else:
if xtts is None:
xtts = xtts_wrapper(tts_model, device=args.xtts_device)
xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path)
ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
@@ -221,13 +204,21 @@
return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
# We return 'mps' but currently XTTS will not work with mps devices as the cuda support is incomplete
def auto_torch_device():
try:
import torch
return 'cuda' if torch.cuda.is_available() else 'mps' if ( torch.backends.mps.is_available() and torch.backends.mps.is_built() ) else 'cpu'
except:
return 'none'
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='OpenedAI Speech API Server',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--xtts_device', action='store', default="cuda", help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--xtts_device', action='store', default=auto_torch_device(), help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
@@ -235,6 +226,9 @@ if __name__ == "__main__":
args = parser.parse_args()
default_exists('config/pre_process_map.yaml')
default_exists('config/voice_to_speaker.yaml')
logger.remove()
logger.add(sink=sys.stderr, level=args.log_level)
@@ -242,10 +236,7 @@
from TTS.api import TTS
if args.preload:
if 'parler-tts' in args.preload:
xtts = parler_tts(args.preload, device=args.xtts_device)
else:
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
xtts = xtts_wrapper(args.preload, device=args.xtts_device)
app.register_model('tts-1')
app.register_model('tts-1-hd')


@@ -5,4 +5,4 @@ set /p < speech.env
call download_voices_tts-1.bat
call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %OPENEDAI_LOG_LEVEL:+--log-level %OPENEDAI_LOG_LEVEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %EXTRA_ARGS%


@@ -4,4 +4,4 @@
bash download_voices_tts-1.sh
python speech.py --xtts_device none ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL}
python speech.py --xtts_device none $EXTRA_ARGS $@


@@ -2,7 +2,9 @@
[ -f speech.env ] && . speech.env
echo "First startup may download 2GB of speech models. Please wait."
bash download_voices_tts-1.sh
bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL} $@
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@


@@ -48,9 +48,3 @@ tts-1-hd:
me:
model: xtts_v2.0.2 # you can specify different xtts version
speaker: voices/me.wav # this could be you
parler:
model: parler-tts/parler_tts_mini_v0.1
speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
parler2:
model: parler-tts/parler_tts_mini_v0.1
speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.