mirror of https://github.com/matatonic/openedai-speech (synced 2025-06-26 18:16:32 +00:00)

commit ea4af74e5c (parent 18c73ce827)

0.13.0 -parler, +arm64, +audio_reader
.github/workflows/build-docker.yml (vendored) | 4 changes

@@ -55,6 +55,7 @@ jobs:
           push: true
           tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
           labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64

       # For tagged releases, build and push the Docker image with the corresponding tag
       - name: Build and Push Docker Image (Tagged)
@@ -66,6 +67,7 @@ jobs:
           push: true
           tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
           labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64

   build-and-push-alt-image:
     runs-on: ubuntu-latest
@@ -113,6 +115,7 @@ jobs:
           push: true
           tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
           labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64

       # For tagged releases, build and push the Docker image with the corresponding tag
       - name: Build and Push Docker Image (Tagged)
@@ -124,4 +127,5 @@ jobs:
           push: true
           tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
           labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
Dockerfile | 12 changes

@@ -1,8 +1,11 @@
 FROM python:3.11-slim

-RUN apt-get update && \
-    apt-get install --no-install-recommends -y curl ffmpeg git && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+ARG TARGETPLATFORM
+RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*

 WORKDIR /app
 RUN mkdir -p voices config
@@ -10,8 +13,7 @@ RUN mkdir -p voices config
 COPY requirements.txt /app/
 RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt

-COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
-COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
+COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/

 ARG PRELOAD_MODEL
 ENV PRELOAD_MODEL=${PRELOAD_MODEL}

Dockerfile.min

@@ -1,17 +1,18 @@
 FROM python:3.11-slim

-RUN apt-get update && \
-    apt-get install --no-install-recommends -y curl ffmpeg && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+ARG TARGETPLATFORM
+RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN apt-get clean && rm -rf /var/lib/apt/lists/*

 WORKDIR /app
 RUN mkdir -p voices config

 RUN --mount=type=cache,target=/root/.cache/pip pip install piper-tts==1.2.0 pyyaml fastapi uvicorn loguru numpy\<2

-COPY speech.py openedai.py say.py *.sh README.md LICENSE /app/
-COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /app/config/
+COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/

 ENV TTS_HOME=voices
 ENV HF_HOME=voices
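Note on the multi-arch changes: `TARGETPLATFORM` is one of the build arguments BuildKit fills in automatically on multi-platform builds (for example `docker buildx build --platform linux/amd64,linux/arm64 .`), so it only needs the `ARG` declaration to become visible. The extra `build-essential` and rustup steps on non-amd64 targets are presumably needed because some Python dependencies ship no prebuilt arm64 wheels and must be compiled from source.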
README.md | 30 changes

@@ -19,6 +19,7 @@ Details:
 * Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
 * Custom cloned voices can be used for tts-1-hd, See: [Custom Voices Howto](#custom-voices-howto)
 * 🌐 [Multilingual](#multilingual) support with XTTS voices
+* [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
 * Occasionally, certain words or symbols may sound incorrect, you can fix them with regex via `pre_process_map.yaml`

@@ -26,6 +27,14 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s

 ## Recent Changes

+Version 0.13.0, 2024-06-22
+
+* Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
+* Initial prebuilt arm64 image support (Apple M1/2/3, Raspberry Pi), thanks @JakeStevenson, @hchasens
+* Parler-tts support removed
+* Moved the *.default.yaml files to the root folder
+* Added 'audio_reader.py' for streaming text input and reading long texts
+
 Version 0.12.3, 2024-06-17

 * Additional logging details for BadRequests (400)
@@ -284,3 +293,24 @@ Remove:
 These lines were added to the `config/pre_process_map.yaml` config file by default before version 0.11.0:

 4) Your new multi-lingual speaker voice is ready to use!
+
+
+## Custom Fine-Tuned Model Support
+
+Adding a custom xtts model is simple. Here is an example of how to add a custom fine-tuned 'halo' XTTS model.
+
+1) Save the model folder under `voices/` (all 4 files are required, including the vocab.json from the model)
+```
+openedai-speech$ ls voices/halo/
+config.json vocab.json model.pth sample.wav
+```
+2) Add the custom voice entry under the `tts-1-hd` section of `config/voice_to_speaker.yaml`:
+```yaml
+tts-1-hd:
+  ...
+  halo:
+    model: halo # This name is required to be unique
+    speaker: voices/halo/sample.wav # voice sample is required
+    model_path: voices/halo
+```
+3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
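With the config in place, a custom voice is reachable through the normal OpenAI-compatible endpoint. A minimal client sketch (not part of this commit; it assumes an openedai-speech server on localhost:8000 and the 'halo' entry above, and mirrors the streaming pattern used by audio_reader.py below):

```python
# Minimal sketch: request speech from the custom 'halo' voice.
# Assumes an openedai-speech server on localhost:8000 (any API key works).
import openai

client = openai.OpenAI(api_key="sk-ip", base_url="http://localhost:8000/v1")

with client.audio.speech.with_streaming_response.create(
    model="tts-1-hd",   # the xtts-backed model class
    voice="halo",       # the custom fine-tuned voice added above
    input="Hello from a custom fine-tuned XTTS model.",
    response_format="mp3",
) as response:
    response.stream_to_file("halo.mp3")  # save the generated audio
```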
add_voice.py (new executable file) | 63 lines

@@ -0,0 +1,63 @@
#!/usr/bin/env python

import argparse
import os
import shutil
import yaml

print("!! WARNING EXPERIMENTAL !! - THIS TOOL WILL ERASE ALL COMMENTS FROM THE CONFIG FILES .. OR WORSE!!")

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('sample', action='store', help="Set the wav sample file")
parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)")
parser.add_argument('-l', '--language', action='store', default="en", help="Set the language for the voice",
                    choices=['en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko'])
parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice")
parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)")
parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model")
parser.add_argument('--config-path', action='store', default="config/voice_to_speaker.yaml", help="Set the config file path")
parser.add_argument('--voice-path', action='store', default="voices", help="Set the default voices file path")
parser.add_argument('--default-path', action='store', default="voice_to_speaker.default.yaml", help="Set the default config file path")

args = parser.parse_args()

basename = os.path.basename(args.sample)
name_noext, ext = os.path.splitext(basename)

if not args.name:
    args.name = name_noext
else:
    basename = f"{args.name}.wav"

dest_file = os.path.join(args.voice_path, basename)
if args.sample != dest_file:
    shutil.copy2(args.sample, dest_file)

if not os.path.exists(args.config_path):
    shutil.copy2(args.default_path, args.config_path)

with open(args.config_path, 'r', encoding='utf8') as file:
    voice_map = yaml.safe_load(file)

model_conf = voice_map.get(args.openai_model, {})
model_conf[args.name] = {
    'model': args.xtts_model,
    'speaker': os.path.join(args.voice_path, basename),
    'language': args.language,
}
if args.model_path:
    model_conf[args.name]['model_path'] = args.model_path
voice_map[args.openai_model] = model_conf

with open(args.config_path, 'w', encoding='utf8') as ofile:
    yaml.safe_dump(voice_map, ofile, default_flow_style=False, allow_unicode=True)

print(f"Updated: {args.config_path}")
print(f"Added voice: {args.openai_model}/{args.name}")
print("Added section:")
print(f"{args.openai_model}:")
print(f"  {args.name}:")
print(f"    model: {model_conf[args.name]['model']}")
print(f"    speaker: {model_conf[args.name]['speaker']}")
print(f"    language: {model_conf[args.name]['language']}")
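A hypothetical invocation, based on the argument defaults above: `./add_voice.py me.wav -n me` copies the sample to `voices/me.wav` and registers a `me:` voice (model `xtts`, language `en`) under the `tts-1-hd` section of `config/voice_to_speaker.yaml`; adding `--xtts-model halo --model-path voices/halo` would register a custom fine-tuned model instead.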
audio_reader.py (new executable file) | 127 lines

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
try:
    import dotenv
    dotenv.load_dotenv()
except ImportError:
    pass

import argparse
import contextlib
import os
import queue
import shutil
import sys
import tempfile
import threading

import pysbd
import openai

try:
    from playsound import playsound
except ImportError:
    print("Error: missing required package 'playsound'. !pip install playsound")
    sys.exit(1)

@contextlib.contextmanager
def tempdir():
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        try:
            shutil.rmtree(path)
        except IOError:
            sys.stderr.write('Failed to clean up temp dir {}'.format(path))

class SimpleAudioPlayer:
    def __init__(self):
        self._queue = queue.Queue()
        self.running = True
        self._thread = threading.Thread(target=self.__play_audio_loop, daemon=True)
        self._thread.start()

    def put(self, file):
        self._queue.put(file)

    def stop(self):
        self.running = False
        self._thread.join()
        # drain anything left unplayed and remove the temp files
        try:
            while True:
                file = self._queue.get_nowait()
                if os.path.exists(file):
                    os.unlink(file)
        except queue.Empty:
            pass

    def __play_audio_loop(self):
        while self.running:
            try:
                while True:
                    file = self._queue.get(block=True, timeout=0.01)

                    try:
                        playsound(file)
                    finally:
                        os.unlink(file)

            except queue.Empty:
                continue

class OpenAI_tts:
    def __init__(self, model, voice, speed, base_dir):
        self.base_dir = base_dir
        self.openai_client = openai.OpenAI(
            # export OPENAI_API_KEY=sk-11111111111
            # export OPENAI_BASE_URL=http://localhost:8000/v1
            api_key=os.environ.get("OPENAI_API_KEY", "sk-ip"),
            base_url=os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
        )

        self.params = {
            'model': model,
            'voice': voice,
            'speed': speed,
        }

    def speech_to_file(self, text: str) -> str:
        with self.openai_client.audio.speech.with_streaming_response.create(
            input=text, response_format='opus', **self.params
        ) as response:
            tf, output_filename = tempfile.mkstemp(suffix='.wav', prefix="audio_reader_", dir=self.base_dir)
            response.stream_to_file(output_filename)
            return output_filename

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Text to speech player',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-m', '--model', action='store', default="tts-1")
    parser.add_argument('-v', '--voice', action='store', default="alloy")
    parser.add_argument('-s', '--speed', action='store', type=float, default=1.0)

    args = parser.parse_args()

    try:
        with tempdir() as base_dir:
            player = SimpleAudioPlayer()
            reader = OpenAI_tts(voice=args.voice, model=args.model, speed=args.speed, base_dir=base_dir)
            seg = pysbd.Segmenter(language='en', clean=True) # text is dirty, clean it up.

            for raw_line in sys.stdin:
                for line in seg.segment(raw_line):
                    if not line:
                        continue

                    print(line)
                    player.put(reader.speech_to_file(line))

            player.stop()

    except KeyboardInterrupt:
        pass
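Typical use is to pipe text in, e.g. `python audio_reader.py -m tts-1 -v alloy -s 1.2 < story.txt`: each stdin line is split into sentences with pysbd, each sentence is synthesized to a temp file via the server, and files are queued so playback of one sentence overlaps generation of the next.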
pre_process_map.default.yaml (moved from config/)

@@ -1,4 +1,4 @@
 # regex pairs to clean the text before speaking
 - - ([^.])\.$
   - \1
 - - '&'

requirements.txt

@@ -10,5 +10,3 @@ TTS
 torch==2.2.2
 # XXX, 3.8+ has some issue for now
 spacy==3.7.4
-# parler-tts
-git+https://github.com/huggingface/parler-tts.git

sample.env

@@ -2,4 +2,3 @@ TTS_HOME=voices
 HF_HOME=voices
 #PRELOAD_MODEL=xtts
 #PRELOAD_MODEL=xtts_v2.0.2
-#PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1
speech.py | 121 changes

@@ -11,71 +11,52 @@ import uvicorn
 from pydantic import BaseModel
 from loguru import logger

-# for parler
-try:
-    from parler_tts import ParlerTTSForConditionalGeneration
-    from transformers import AutoTokenizer, logging
-    import torch
-    import soundfile as sf
-    logging.set_verbosity_error()
-    has_parler_tts = True
-except ImportError:
-    logger.info("No parler support found")
-    has_parler_tts = False
-
-from openedai import OpenAIStub, BadRequestError
+from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError

 xtts = None
 args = None
 app = OpenAIStub()

 class xtts_wrapper():
-    def __init__(self, model_name, device):
+    def __init__(self, model_name, device, model_path=None):
         self.model_name = model_name
-        self.xtts = TTS(model_name=model_name, progress_bar=False).to(device)
+
+        logger.info(f"Loading model {self.model_name} to {device}")
+
+        if model_path: # custom model # and config_path
+            config_path = os.path.join(model_path, 'config.json')
+            self.xtts = TTS(model_path=model_path, config_path=config_path).to(device)
+        else:
+            self.xtts = TTS(model_name=model_name).to(device)

     def tts(self, text, speaker_wav, speed, language):
-        tf, file_path = tempfile.mkstemp(suffix='.wav')
+        tf, file_path = tempfile.mkstemp(suffix='.wav', prefix='openedai-speech-')

-        file_path = self.xtts.tts_to_file(
-            text=text,
-            language=language,
-            speaker_wav=speaker_wav,
-            speed=speed,
-            file_path=file_path,
-        )
+        try:
+            # TODO: support speaker= as voice id instead of just wav
+            file_path = self.xtts.tts_to_file(
+                text=text,
+                language=language,
+                speaker_wav=speaker_wav,
+                speed=speed,
+                file_path=file_path,
+            )
+        finally:
+            os.unlink(file_path)

-        os.unlink(file_path)
         return tf

-class parler_tts():
-    def __init__(self, model_name, device):
-        self.model_name = model_name
-        self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    def tts(self, text, description):
-        input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.model.device)
-        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.model.device)
-
-        generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-
-        tf, file_path = tempfile.mkstemp(suffix='.wav')
-        sf.write(file_path, audio_arr, self.model.config.sampling_rate)
-        os.unlink(file_path)
-        return tf
-
 def default_exists(filename: str):
     if not os.path.exists(filename):
-        basename, ext = os.path.splitext(filename)
+        fpath, ext = os.path.splitext(filename)
+        basename = os.path.basename(fpath)
         default = f"{basename}.default{ext}"

         logger.info(f"{filename} does not exist, setting defaults from {default}")

-        with open(default, 'r') as from_file:
-            with open(filename, 'w') as to_file:
+        with open(default, 'r', encoding='utf8') as from_file:
+            with open(filename, 'w', encoding='utf8') as to_file:
                 to_file.write(from_file.read())

 # Read pre process map on demand so it can be changed without restarting the server
@@ -97,14 +78,10 @@ def map_voice_to_speaker(voice: str, model: str):
     with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
         voice_map = yaml.safe_load(file)
     try:
-        m = voice_map[model][voice]['model']
-        s = voice_map[model][voice]['speaker']
-        l = voice_map[model][voice].get('language', 'en')
-
+        return voice_map[model][voice]
     except KeyError as e:
         raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')

-    return (m, s, l)
-
 class GenerateSpeechRequest(BaseModel):
     model: str = "tts-1" # or "tts-1-hd"
@@ -162,7 +139,15 @@ async def generate_speech(request: GenerateSpeechRequest):

     # Use piper for tts-1, and if xtts_device == none use for all models.
     if model == 'tts-1' or args.xtts_device == 'none':
-        piper_model, speaker, not_used_language = map_voice_to_speaker(voice, 'tts-1')
+        voice_map = map_voice_to_speaker(voice, 'tts-1')
+        try:
+            piper_model = voice_map['model']
+
+        except KeyError as e:
+            raise ServiceUnavailableError(f"Configuration error: tts-1 voice '{voice}' is missing 'model:' setting. KeyError: {e}")
+
+        speaker = voice_map.get('speaker', None)
+
         tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
         if speaker:
             tts_args.extend(["--speaker", str(speaker)])
@@ -177,7 +162,16 @@ async def generate_speech(request: GenerateSpeechRequest):

     # Use xtts for tts-1-hd
     elif model == 'tts-1-hd':
-        tts_model, speaker, language = map_voice_to_speaker(voice, 'tts-1-hd')
+        voice_map = map_voice_to_speaker(voice, 'tts-1-hd')
+        try:
+            tts_model = voice_map['model']
+            speaker = voice_map['speaker']
+
+        except KeyError as e:
+            raise ServiceUnavailableError(f"Configuration error: tts-1-hd voice '{voice}' is missing setting. KeyError: {e}")
+
+        language = voice_map.get('language', 'en')
+        tts_model_path = voice_map.get('model_path', None)
+
         if xtts is not None and xtts.model_name != tts_model:
             import torch, gc
@@ -186,20 +180,9 @@ async def generate_speech(request: GenerateSpeechRequest):
             gc.collect()
             torch.cuda.empty_cache()

-        if 'parler-tts' in tts_model and has_parler_tts:
-            if xtts is None:
-                xtts = parler_tts(tts_model, device=args.xtts_device)
-
-            ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate=str(xtts.model.config.sampling_rate))
-
-            if speed != 1:
-                ffmpeg_args.extend(["-af", f"atempo={speed}"])
-
-            tts_io_out = xtts.tts(text=input_text, description=speaker)
-
-        else:
-            if xtts is None:
-                xtts = xtts_wrapper(tts_model, device=args.xtts_device)
+        if xtts is None:
+            xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path)

         ffmpeg_args = build_ffmpeg_args(response_format, input_format="WAV", sample_rate="24000")
@@ -235,6 +218,9 @@ if __name__ == "__main__":

     args = parser.parse_args()

+    default_exists('config/pre_process_map.yaml')
+    default_exists('config/voice_to_speaker.yaml')
+
     logger.remove()
     logger.add(sink=sys.stderr, level=args.log_level)
@@ -242,10 +228,7 @@ if __name__ == "__main__":
     from TTS.api import TTS

     if args.preload:
-        if 'parler-tts' in args.preload:
-            xtts = parler_tts(args.preload, device=args.xtts_device)
-        else:
-            xtts = xtts_wrapper(args.preload, device=args.xtts_device)
+        xtts = xtts_wrapper(args.preload, device=args.xtts_device)

     app.register_model('tts-1')
     app.register_model('tts-1-hd')
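The `map_voice_to_speaker()` change above replaces the fixed `(model, speaker, language)` tuple with the voice's raw YAML mapping, so callers pull out the keys they need and optional settings fall back explicitly. A small illustrative sketch (values taken from the 'halo' example in the README section, not from this file):

```python
# What map_voice_to_speaker('halo', 'tts-1-hd') would now return,
# given the voice_to_speaker.yaml entry shown in the README section above:
voice_map = {
    'model': 'halo',
    'speaker': 'voices/halo/sample.wav',
    'model_path': 'voices/halo',
}

tts_model = voice_map['model']                      # required; KeyError -> ServiceUnavailableError
speaker = voice_map['speaker']                      # required for tts-1-hd voices
language = voice_map.get('language', 'en')          # optional, defaults to English
tts_model_path = voice_map.get('model_path', None)  # only set for custom fine-tunes
```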
voice_to_speaker.default.yaml (moved from config/)

@@ -48,9 +48,3 @@ tts-1-hd:
   me:
     model: xtts_v2.0.2 # you can specify different xtts version
     speaker: voices/me.wav # this could be you
-  parler:
-    model: parler-tts/parler_tts_mini_v0.1
-    speaker: A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast.
-  parler2:
-    model: parler-tts/parler_tts_mini_v0.1
-    speaker: A female voice with an Indian accent enunciates every word with precision. The speaker's voice is very close-sounding, and the recording is excellent, capturing her voice with crisp clarity.