0.12.0 - Improved errors & logging, swap alloy default voice

closes #3, re: #11
This commit is contained in:
matatonic 2024-06-16 23:35:11 -04:00
parent 07337907f9
commit f21ed56a00
9 changed files with 167 additions and 14 deletions

View File

@ -17,6 +17,7 @@ ARG PRELOAD_MODEL
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
ENV COQUI_TOS_AGREED=1
CMD bash startup.sh

View File

@ -15,5 +15,6 @@ COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /a
ENV TTS_HOME=voices
ENV HF_HOME=voices
ENV OPENEDAI_LOG_LEVEL=INFO
CMD bash startup.min.sh

View File

@ -26,6 +26,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
## Recent Changes
Version 0.12.0, 2024-06-16
* Improved error handling and logging
* Restore the original alloy tts-1-hd voice by default, use alloy-alt for the old voice.
Version 0.11.0, 2024-05-29
* 🌐 [Multilingual](#multilingual) support (16 languages) with XTTS
@ -95,7 +100,7 @@ bash startup.sh
## Usage
```
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
OpenedAI Speech API Server
@ -106,6 +111,8 @@ options:
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
Set the log level (default: INFO)
```
@ -213,7 +220,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
* Mono (single channel) 22050 Hz WAV file
* 6-30 seconds long - longer isn't always better (I've had some good results with as little as 4 seconds)
* low noise (no hiss or hum)
* No partial words, breathing, music or backgrounds sounds
* No partial words, breathing, laughing, music or background sounds
* An even speaking pace with a variety of words is best, like in interviews or audiobooks.
You can use FFmpeg to prepare your audio files, here are some examples:

View File

@ -24,12 +24,12 @@
model: voices/en_US-libritts_r-medium.onnx
speaker: 163
tts-1-hd:
alloy:
alloy-alt:
model: xtts
speaker: voices/alloy-alt.wav
alloy-orig:
alloy:
model: xtts
speaker: voices/alloy.wav # it's REALLY BAD
speaker: voices/alloy.wav
echo:
model: xtts
speaker: voices/echo.wav

View File

@ -1,12 +1,72 @@
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
from fastapi.responses import PlainTextResponse, JSONResponse
from loguru import logger
class OpenAIError(Exception):
    """Root of the exception hierarchy raised by this API stub."""


class APIError(OpenAIError):
    """Error that carries the fields reported back to the client.

    Attributes:
        message: Client-facing description of the failure.
        code: HTTP status code for the response.
        param: Name of the offending request parameter, if any.
        type: Name of the concrete exception class (a plain string).
        internal_message: Extra detail kept separate from ``message``.
    """

    message: str
    code: int = None
    param: str = None
    type: str = None

    def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''):
        super().__init__(message)
        self.message = message
        self.code = code
        self.param = param
        # No trailing comma: a trailing comma here would store a 1-tuple
        # instead of the class name string.
        self.type = self.__class__.__name__
        self.internal_message = internal_message

    def __repr__(self):
        return "%s(message=%r, code=%d, param=%s)" % (
            self.__class__.__name__,
            self.message,
            self.code,
            self.param,
        )


class InternalServerError(APIError):
    """Server-side failure; uses the APIError default status of 500."""


class ServiceUnavailableError(APIError):
    """Temporary outage, reported with status 503 by default."""

    def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''):
        # internal_message must be passed by keyword: the third positional
        # slot of APIError.__init__ is `param`, not `internal_message`.
        super().__init__(message, code, internal_message=internal_message)


class APIStatusError(APIError):
    """Base for errors whose HTTP status is fixed by the subclass.

    Subclasses override ``status_code``; callers only supply the message and,
    optionally, the offending parameter and an internal message.
    """

    status_code: int = 400

    def __init__(self, message: str, param: str = None, internal_message: str = ''):
        super().__init__(message, self.status_code, param, internal_message)


class BadRequestError(APIStatusError):
    """HTTP 400."""

    status_code: int = 400


class AuthenticationError(APIStatusError):
    """HTTP 401."""

    status_code: int = 401


class PermissionDeniedError(APIStatusError):
    """HTTP 403."""

    status_code: int = 403


class NotFoundError(APIStatusError):
    """HTTP 404."""

    status_code: int = 404


class ConflictError(APIStatusError):
    """HTTP 409."""

    status_code: int = 409


class UnprocessableEntityError(APIStatusError):
    """HTTP 422."""

    status_code: int = 422


class RateLimitError(APIStatusError):
    """HTTP 429."""

    status_code: int = 429
class OpenAIStub(FastAPI):
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.models = {}
self.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@ -15,6 +75,59 @@ class OpenAIStub(FastAPI):
allow_headers=["*"]
)
@self.exception_handler(Exception)
def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Answer any otherwise unhandled exception with a fixed HTTP 500 JSON body.

    Nothing from the exception itself is put into the response.
    """
    # Generic server errors; traceback logging is currently switched off:
    #logger.opt(exception=exc).error("Logging exception traceback")
    payload = {
        'message': 'InternalServerError',
        'code': 500,
    }
    return JSONResponse(status_code=500, content=payload)
@self.exception_handler(APIError)
def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse:
    """Convert an APIError into a JSON response and log its traceback.

    The exception's message, code, class name and param go into the body;
    a non-empty internal_message is logged at INFO level and not returned.
    """
    # Server error: record the traceback first.
    logger.opt(exception=exc).error("Logging exception traceback")

    internal_note = exc.internal_message
    if internal_note:
        logger.info(internal_note)

    payload = {
        'message': exc.message,
        'code': exc.code,
        'type': exc.__class__.__name__,
        'param': exc.param,
    }
    return JSONResponse(status_code=exc.code, content=payload)
@self.exception_handler(APIStatusError)
def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse:
    """Convert an APIStatusError into a JSON response without logging a traceback.

    The exception's message, code, class name and param go into the body;
    a non-empty internal_message is logged at INFO level and not returned.
    """
    # User error: no traceback, only the optional internal note.
    internal_note = exc.internal_message
    if internal_note:
        logger.info(internal_note)

    payload = {
        'message': exc.message,
        'code': exc.code,
        'type': exc.__class__.__name__,
        'param': exc.param,
    }
    return JSONResponse(status_code=exc.code, content=payload)
@self.middleware("http")
async def log_requests(request: Request, call_next):
    """Log request and response details at DEBUG level around each HTTP call.

    Logs the path, method, headers, query params and raw body of the request,
    forwards it with ``call_next``, then logs the response status and headers.
    """
    logger.debug(f"Request path: {request.url.path}")
    logger.debug(f"Request method: {request.method}")
    logger.debug(f"Request headers: {request.headers}")
    logger.debug(f"Request query params: {request.query_params}")
    # NOTE(review): the body is awaited here regardless of the active log
    # level, because the f-string is built before logger.debug is called.
    raw_body = await request.body()
    logger.debug(f"Request body: {raw_body}")

    result = await call_next(request)

    logger.debug(f"Response status code: {result.status_code}")
    logger.debug(f"Response headers: {result.headers}")
    return result
@self.get('/v1/billing/usage')
@self.get('/v1/dashboard/billing/usage')
async def handle_billing_usage():

View File

@ -1,5 +1,6 @@
fastapi
uvicorn
loguru
# piper-tts
piper-tts==1.2.0
# xtts

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import os
import sys
import re
import subprocess
import tempfile
@ -8,6 +9,7 @@ import yaml
from fastapi.responses import StreamingResponse
import uvicorn
from pydantic import BaseModel
from loguru import logger
# for parler
try:
@ -18,14 +20,14 @@ try:
logging.set_verbosity_error()
has_parler_tts = True
except ImportError:
print("No parler support found")
logger.info("No parler support found")
has_parler_tts = False
import openedai
from openedai import OpenAIStub, BadRequestError
xtts = None
args = None
app = openedai.OpenAIStub()
app = OpenAIStub()
class xtts_wrapper():
def __init__(self, model_name, device):
@ -69,17 +71,24 @@ def default_exists(filename: str):
if not os.path.exists(filename):
basename, ext = os.path.splitext(filename)
default = f"{basename}.default{ext}"
logger.info(f"{filename} does not exist, setting defaults from {default}")
with open(default, 'r') as from_file:
with open(filename, 'w') as to_file:
to_file.write(from_file.read())
# Read pre process map on demand so it can be changed without restarting the server
def preprocess(raw_input):
    """Apply the configured regex substitutions to *raw_input* and strip it.

    The substitution list is read from ``config/pre_process_map.yaml`` on
    every call (after ``default_exists`` has been given the chance to create
    it). Each entry is unpacked as a ``(pattern, replacement)`` pair and
    applied in file order with ``re.sub``.

    Args:
        raw_input: Text to clean up.

    Returns:
        The substituted text with leading and trailing whitespace removed.
    """
    # Wrapped in a list so the log shows the repr (escapes, quotes) of the text.
    logger.debug(f"preprocess: before: {[raw_input]}")

    default_exists('config/pre_process_map.yaml')
    with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
        # safe_load returns None for an empty document; treat that as
        # "no substitutions" instead of failing on iteration below.
        pre_process_map = yaml.safe_load(file) or []

    for a, b in pre_process_map:
        raw_input = re.sub(a, b, raw_input)

    raw_input = raw_input.strip()

    logger.debug(f"preprocess: after: {[raw_input]}")
    return raw_input
# Read voice map on demand so it can be changed without restarting the server
@ -87,7 +96,15 @@ def map_voice_to_speaker(voice: str, model: str):
default_exists('config/voice_to_speaker.yaml')
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
voice_map = yaml.safe_load(file)
return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], voice_map[model][voice].get('language', 'en'))
try:
m = voice_map[model][voice]['model']
s = voice_map[model][voice]['speaker']
l = voice_map[model][voice].get('language', 'en')
except KeyError as e:
raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
return (m, s, l)
class GenerateSpeechRequest(BaseModel):
model: str = "tts-1" # or "tts-1-hd"
@ -117,7 +134,14 @@ def build_ffmpeg_args(response_format, input_format, sample_rate):
@app.post("/v1/audio/speech", response_class=StreamingResponse)
async def generate_speech(request: GenerateSpeechRequest):
global xtts, args
if len(request.input) < 1:
raise BadRequestError("Empty Input", param='input')
input_text = preprocess(request.input)
if len(input_text) < 1:
raise BadRequestError("Input text empty after preprocess.", param='input')
model = request.model
voice = request.voice
response_format = request.response_format
@ -188,6 +212,8 @@ async def generate_speech(request: GenerateSpeechRequest):
speed = 1.0
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed, language=language)
else:
raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')
# Pipe the output from piper/xtts to the input of ffmpeg
ffmpeg_args.extend(["-"])
@ -205,9 +231,13 @@ if __name__ == "__main__":
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")
args = parser.parse_args()
logger.remove()
logger.add(sink=sys.stderr, level=args.log_level)
if args.xtts_device != "none":
from TTS.api import TTS

View File

@ -5,4 +5,4 @@ set /p < speech.env
call download_voices_tts-1.bat
call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL%
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %OPENEDAI_LOG_LEVEL:+--log-level %OPENEDAI_LOG_LEVEL%

View File

@ -5,4 +5,4 @@
bash download_voices_tts-1.sh
bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $@
# "$@" is quoted so extra arguments containing whitespace reach speech.py intact;
# the ${VAR:+...} expansions stay unquoted on purpose so each yields two words.
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL} "$@"