mirror of
https://github.com/matatonic/openedai-speech
synced 2025-06-26 18:16:32 +00:00
0.12.0 - Improved errors & logging, swap alloy default voice
closes #3, re: #11
This commit is contained in:
parent
07337907f9
commit
f21ed56a00
@ -17,6 +17,7 @@ ARG PRELOAD_MODEL
|
||||
ENV PRELOAD_MODEL=${PRELOAD_MODEL}
|
||||
ENV TTS_HOME=voices
|
||||
ENV HF_HOME=voices
|
||||
ENV OPENEDAI_LOG_LEVEL=INFO
|
||||
ENV COQUI_TOS_AGREED=1
|
||||
|
||||
CMD bash startup.sh
|
||||
|
||||
@ -15,5 +15,6 @@ COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /a
|
||||
|
||||
ENV TTS_HOME=voices
|
||||
ENV HF_HOME=voices
|
||||
ENV OPENEDAI_LOG_LEVEL=INFO
|
||||
|
||||
CMD bash startup.min.sh
|
||||
|
||||
11
README.md
11
README.md
@ -26,6 +26,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
|
||||
|
||||
## Recent Changes
|
||||
|
||||
Version 0.12.0, 2024-06-16
|
||||
|
||||
* Improved error handling and logging
|
||||
* Restore the original alloy tts-1-hd voice by default, use alloy-alt for the old voice.
|
||||
|
||||
Version 0.11.0, 2024-05-29
|
||||
|
||||
* 🌐 [Multilingual](#multilingual) support (16 languages) with XTTS
|
||||
@ -95,7 +100,7 @@ bash startup.sh
|
||||
## Usage
|
||||
|
||||
```
|
||||
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
|
||||
usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
|
||||
|
||||
OpenedAI Speech API Server
|
||||
|
||||
@ -106,6 +111,8 @@ options:
|
||||
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
|
||||
-P PORT, --port PORT Server tcp port (default: 8000)
|
||||
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
|
||||
-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
|
||||
Set the log level (default: INFO)
|
||||
|
||||
```
|
||||
|
||||
@ -213,7 +220,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
|
||||
* Mono (single channel) 22050 Hz WAV file
|
||||
* 6-30 seconds long - longer isn't always better (I've had some good results with as little as 4 seconds)
|
||||
* low noise (no hiss or hum)
|
||||
* No partial words, breathing, music or backgrounds sounds
|
||||
* No partial words, breathing, laughing, music or background sounds
|
||||
* An even speaking pace with a variety of words is best, like in interviews or audiobooks.
|
||||
|
||||
You can use FFmpeg to prepare your audio files, here are some examples:
|
||||
|
||||
@ -24,12 +24,12 @@
|
||||
model: voices/en_US-libritts_r-medium.onnx
|
||||
speaker: 163
|
||||
tts-1-hd:
|
||||
alloy:
|
||||
alloy-alt:
|
||||
model: xtts
|
||||
speaker: voices/alloy-alt.wav
|
||||
alloy-orig:
|
||||
alloy:
|
||||
model: xtts
|
||||
speaker: voices/alloy.wav # it's REALLY BAD
|
||||
speaker: voices/alloy.wav
|
||||
echo:
|
||||
model: xtts
|
||||
speaker: voices/echo.wav
|
||||
|
||||
119
openedai.py
119
openedai.py
@ -1,12 +1,72 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import PlainTextResponse
|
||||
from fastapi.responses import PlainTextResponse, JSONResponse
|
||||
from loguru import logger
|
||||
|
||||
class OpenAIError(Exception):
    """Root of the OpenAI-style exception hierarchy used by this server."""


class APIError(OpenAIError):
    """Server-side error that is reported to the client as a JSON error body.

    Attributes:
        message: Human-readable text returned to the client.
        code: HTTP status code used for the response.
        param: Name of the request parameter the error relates to, if any.
        type: Name of the concrete exception class.
        internal_message: Extra detail intended for the server log only.
    """

    message: str
    code: int = 500
    param: str = None
    type: str = None

    def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''):
        super().__init__(message)
        self.message = message
        self.code = code
        self.param = param
        # Plain string; a stray trailing comma here previously produced a 1-tuple.
        self.type = self.__class__.__name__
        self.internal_message = internal_message

    def __repr__(self):
        return "%s(message=%r, code=%d, param=%s)" % (
            self.__class__.__name__,
            self.message,
            self.code,
            self.param,
        )


class InternalServerError(APIError):
    """Generic server failure; inherits the default status code of 500."""


class ServiceUnavailableError(APIError):
    """The service cannot handle the request right now (default status 503)."""

    def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''):
        # Pass internal_message by keyword: the third positional slot of
        # APIError.__init__ is `param`, not `internal_message`.
        super().__init__(message, code, internal_message=internal_message)


class APIStatusError(APIError):
    """Client-caused error whose HTTP status is fixed per subclass via `status_code`."""

    status_code: int = 400

    def __init__(self, message: str, param: str = None, internal_message: str = ''):
        # self.status_code resolves to the subclass override, if any.
        super().__init__(message, self.status_code, param, internal_message)


class BadRequestError(APIStatusError):
    """HTTP 400."""

    status_code: int = 400


class AuthenticationError(APIStatusError):
    """HTTP 401."""

    status_code: int = 401


class PermissionDeniedError(APIStatusError):
    """HTTP 403."""

    status_code: int = 403


class NotFoundError(APIStatusError):
    """HTTP 404."""

    status_code: int = 404


class ConflictError(APIStatusError):
    """HTTP 409."""

    status_code: int = 409


class UnprocessableEntityError(APIStatusError):
    """HTTP 422."""

    status_code: int = 422


class RateLimitError(APIStatusError):
    """HTTP 429."""

    status_code: int = 429
|
||||
|
||||
class OpenAIStub(FastAPI):
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self.models = {}
|
||||
|
||||
|
||||
self.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
@ -15,6 +75,59 @@ class OpenAIStub(FastAPI):
|
||||
allow_headers=["*"]
|
||||
)
|
||||
|
||||
@self.exception_handler(Exception)
|
||||
def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse:
|
||||
# Generic server errors
|
||||
#logger.opt(exception=exc).error("Logging exception traceback")
|
||||
|
||||
return JSONResponse(status_code=500, content={
|
||||
'message': 'InternalServerError',
|
||||
'code': 500,
|
||||
})
|
||||
|
||||
@self.exception_handler(APIError)
|
||||
def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse:
|
||||
# Server error
|
||||
logger.opt(exception=exc).error("Logging exception traceback")
|
||||
|
||||
if exc.internal_message:
|
||||
logger.info(exc.internal_message)
|
||||
|
||||
return JSONResponse(status_code = exc.code, content={
|
||||
'message': exc.message,
|
||||
'code': exc.code,
|
||||
'type': exc.__class__.__name__,
|
||||
'param': exc.param,
|
||||
})
|
||||
|
||||
@self.exception_handler(APIStatusError)
|
||||
def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse:
|
||||
# User error
|
||||
if exc.internal_message:
|
||||
logger.info(exc.internal_message)
|
||||
|
||||
return JSONResponse(status_code = exc.code, content={
|
||||
'message': exc.message,
|
||||
'code': exc.code,
|
||||
'type': exc.__class__.__name__,
|
||||
'param': exc.param,
|
||||
})
|
||||
|
||||
@self.middleware("http")
|
||||
async def log_requests(request: Request, call_next):
|
||||
logger.debug(f"Request path: {request.url.path}")
|
||||
logger.debug(f"Request method: {request.method}")
|
||||
logger.debug(f"Request headers: {request.headers}")
|
||||
logger.debug(f"Request query params: {request.query_params}")
|
||||
logger.debug(f"Request body: {await request.body()}")
|
||||
|
||||
response = await call_next(request)
|
||||
|
||||
logger.debug(f"Response status code: {response.status_code}")
|
||||
logger.debug(f"Response headers: {response.headers}")
|
||||
|
||||
return response
|
||||
|
||||
@self.get('/v1/billing/usage')
|
||||
@self.get('/v1/dashboard/billing/usage')
|
||||
async def handle_billing_usage():
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
loguru
|
||||
# piper-tts
|
||||
piper-tts==1.2.0
|
||||
# xtts
|
||||
|
||||
38
speech.py
38
speech.py
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
@ -8,6 +9,7 @@ import yaml
|
||||
from fastapi.responses import StreamingResponse
|
||||
import uvicorn
|
||||
from pydantic import BaseModel
|
||||
from loguru import logger
|
||||
|
||||
# for parler
|
||||
try:
|
||||
@ -18,14 +20,14 @@ try:
|
||||
logging.set_verbosity_error()
|
||||
has_parler_tts = True
|
||||
except ImportError:
|
||||
print("No parler support found")
|
||||
logger.info("No parler support found")
|
||||
has_parler_tts = False
|
||||
|
||||
import openedai
|
||||
from openedai import OpenAIStub, BadRequestError
|
||||
|
||||
xtts = None
|
||||
args = None
|
||||
app = openedai.OpenAIStub()
|
||||
app = OpenAIStub()
|
||||
|
||||
class xtts_wrapper():
|
||||
def __init__(self, model_name, device):
|
||||
@ -69,17 +71,24 @@ def default_exists(filename: str):
|
||||
if not os.path.exists(filename):
|
||||
basename, ext = os.path.splitext(filename)
|
||||
default = f"{basename}.default{ext}"
|
||||
|
||||
logger.info(f"{filename} does not exist, setting defaults from {default}")
|
||||
|
||||
with open(default, 'r') as from_file:
|
||||
with open(filename, 'w') as to_file:
|
||||
to_file.write(from_file.read())
|
||||
|
||||
# Read pre process map on demand so it can be changed without restarting the server
|
||||
def preprocess(raw_input):
    """Run the input text through the substitutions in config/pre_process_map.yaml.

    The map file is opened and parsed on every call. Each entry is unpacked as
    a (pattern, replacement) pair and applied in order with re.sub; the result
    is stripped of leading/trailing whitespace and returned.
    """
    logger.debug(f"preprocess: before: {[raw_input]}")

    map_path = 'config/pre_process_map.yaml'
    default_exists(map_path)
    with open(map_path, 'r', encoding='utf8') as map_file:
        substitutions = yaml.safe_load(map_file)

    cleaned = raw_input
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    logger.debug(f"preprocess: after: {[cleaned]}")
    return cleaned
|
||||
|
||||
# Read voice map on demand so it can be changed without restarting the server
|
||||
@ -87,7 +96,15 @@ def map_voice_to_speaker(voice: str, model: str):
|
||||
default_exists('config/voice_to_speaker.yaml')
|
||||
with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
|
||||
voice_map = yaml.safe_load(file)
|
||||
return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], voice_map[model][voice].get('language', 'en'))
|
||||
try:
|
||||
m = voice_map[model][voice]['model']
|
||||
s = voice_map[model][voice]['speaker']
|
||||
l = voice_map[model][voice].get('language', 'en')
|
||||
|
||||
except KeyError as e:
|
||||
raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
|
||||
|
||||
return (m, s, l)
|
||||
|
||||
class GenerateSpeechRequest(BaseModel):
|
||||
model: str = "tts-1" # or "tts-1-hd"
|
||||
@ -117,7 +134,14 @@ def build_ffmpeg_args(response_format, input_format, sample_rate):
|
||||
@app.post("/v1/audio/speech", response_class=StreamingResponse)
|
||||
async def generate_speech(request: GenerateSpeechRequest):
|
||||
global xtts, args
|
||||
if len(request.input) < 1:
|
||||
raise BadRequestError("Empty Input", param='input')
|
||||
|
||||
input_text = preprocess(request.input)
|
||||
|
||||
if len(input_text) < 1:
|
||||
raise BadRequestError("Input text empty after preprocess.", param='input')
|
||||
|
||||
model = request.model
|
||||
voice = request.voice
|
||||
response_format = request.response_format
|
||||
@ -188,6 +212,8 @@ async def generate_speech(request: GenerateSpeechRequest):
|
||||
speed = 1.0
|
||||
|
||||
tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed, language=language)
|
||||
else:
|
||||
raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')
|
||||
|
||||
# Pipe the output from piper/xtts to the input of ffmpeg
|
||||
ffmpeg_args.extend(["-"])
|
||||
@ -205,9 +231,13 @@ if __name__ == "__main__":
|
||||
parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
|
||||
parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
|
||||
parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
|
||||
parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.remove()
|
||||
logger.add(sink=sys.stderr, level=args.log_level)
|
||||
|
||||
if args.xtts_device != "none":
|
||||
from TTS.api import TTS
|
||||
|
||||
|
||||
@ -5,4 +5,4 @@ set /p < speech.env
|
||||
call download_voices_tts-1.bat
|
||||
call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
|
||||
|
||||
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL%
|
||||
python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %OPENEDAI_LOG_LEVEL:+--log-level %OPENEDAI_LOG_LEVEL%
|
||||
@ -5,4 +5,4 @@
|
||||
bash download_voices_tts-1.sh
|
||||
bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
|
||||
|
||||
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $@
|
||||
python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL} $@
|
||||
|
||||
Loading…
Reference in New Issue
Block a user