0.12.0 - Improved errors & logging, swap alloy default voice

closes #3, re: #11
2025-06-26 18:16:32 +00:00 · 2024-06-16 23:35:11 -04:00 · 2024-06-16 23:35:11 -04:00 · f21ed56a00
commit f21ed56a00
parent 07337907f9
9 changed files with 167 additions and 14 deletions
--- a/1
+++ b/1
@ -17,6 +17,7 @@ ARG PRELOAD_MODEL
 ENV PRELOAD_MODEL=${PRELOAD_MODEL}
 ENV TTS_HOME=voices
 ENV HF_HOME=voices
+ENV OPENEDAI_LOG_LEVEL=INFO
 ENV COQUI_TOS_AGREED=1

 CMD bash startup.sh
--- a/Dockerfile.min
+++ b/Dockerfile.min
@ -15,5 +15,6 @@ COPY config/voice_to_speaker.default.yaml config/pre_process_map.default.yaml /a

 ENV TTS_HOME=voices
 ENV HF_HOME=voices
+ENV OPENEDAI_LOG_LEVEL=INFO

 CMD bash startup.min.sh
--- a/README.md
+++ b/README.md
@ -26,6 +26,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s

 ## Recent Changes

+Version 0.12.0, 2024-06-16
+
+* Improved error handling and logging
+* Restore the original alloy tts-1-hd voice by default, use alloy-alt for the old voice.
+
 Version 0.11.0, 2024-05-29

 * 🌐 [Multilingual](#multilingual) support (16 languages) with XTTS
@ -95,7 +100,7 @@ bash startup.sh
 ## Usage

 ```
-usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
+usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST] [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]

 OpenedAI Speech API Server

@ -106,6 +111,8 @@ options:
  --preload PRELOAD     Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
  -P PORT, --port PORT  Server tcp port (default: 8000)
  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
+  -L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+                        Set the log level (default: INFO)

 ```

@ -213,7 +220,7 @@ Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio.
 * Mono (single channel) 22050 Hz WAV file
 * 6-30 seconds long - longer isn't always better (I've had some good results with as little as 4 seconds)
 * low noise (no hiss or hum)
-* No partial words, breathing, music or backgrounds sounds
+* No partial words, breathing, laughing, music or background sounds
 * An even speaking pace with a variety of words is best, like in interviews or audiobooks.

 You can use FFmpeg to prepare your audio files, here are some examples:
--- a/config/voice_to_speaker.default.yaml
+++ b/config/voice_to_speaker.default.yaml
@ -24,12 +24,12 @@
    model: voices/en_US-libritts_r-medium.onnx
    speaker: 163
 tts-1-hd:
-  alloy:
+  alloy-alt:
    model: xtts
    speaker: voices/alloy-alt.wav
-  alloy-orig: 
+  alloy:
    model: xtts
-    speaker: voices/alloy.wav # it's REALLY BAD
+    speaker: voices/alloy.wav
  echo:
    model: xtts
    speaker: voices/echo.wav
--- a/openedai.py
+++ b/openedai.py
@ -1,12 +1,72 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import PlainTextResponse
+from fastapi.responses import PlainTextResponse, JSONResponse
+from loguru import logger
+
+class OpenAIError(Exception):
+    """Root of the exception hierarchy defined in this module."""
+    pass
+
+class APIError(OpenAIError):
+    message: str
+    code: str = None
+    param: str = None
+    type: str = None
+
+    def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+        self.param = param
+        self.type = self.__class__.__name__,
+        self.internal_message = internal_message
+
+    def __repr__(self):
+        return "%s(message=%r, code=%d, param=%s)" % (
+            self.__class__.__name__,
+            self.message,
+            self.code,
+            self.param,
+        )
+
+class InternalServerError(APIError):
+    """APIError subclass that adds nothing; it inherits the default code of 500."""
+    pass
+
+class ServiceUnavailableError(APIError):
+    def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''):
+        super().__init__(message, code, internal_message)
+
+class APIStatusError(APIError):
+    """APIError whose code comes from the class-level ``status_code``.
+
+    Subclasses override ``status_code``; the constructor forwards it to
+    APIError as ``code``, so callers only supply the message and optional
+    ``param`` / ``internal_message``.
+    """
+    status_code: int = 400
+    
+    def __init__(self, message: str, param: str = None, internal_message: str = ''):
+        super().__init__(message, self.status_code, param, internal_message)
+
+# Status-specific errors: each subclass only overrides status_code, which
+# APIStatusError.__init__ forwards to APIError as the error code.
+class BadRequestError(APIStatusError):
+    status_code: int = 400
+
+class AuthenticationError(APIStatusError):
+    status_code: int = 401
+
+class PermissionDeniedError(APIStatusError):
+    status_code: int = 403
+
+class NotFoundError(APIStatusError):
+    status_code: int = 404
+
+class ConflictError(APIStatusError):
+    status_code: int = 409
+
+class UnprocessableEntityError(APIStatusError):
+    status_code: int = 422
+
+class RateLimitError(APIStatusError):
+    status_code: int = 429

 class OpenAIStub(FastAPI):
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.models = {}
-            
+
        self.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
@ -15,6 +75,59 @@ class OpenAIStub(FastAPI):
            allow_headers=["*"]
        )

+        @self.exception_handler(Exception)
+        def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+            # Generic server errors: reply with a fixed 500 payload that does
+            # not include any detail from `exc`.
+            # NOTE(review): traceback logging is disabled below, so this handler
+            # itself records nothing - presumably the server logs unhandled
+            # exceptions on its own; confirm before relying on it.
+            #logger.opt(exception=exc).error("Logging exception traceback")
+
+            return JSONResponse(status_code=500, content={
+                'message': 'InternalServerError',
+                'code': 500,
+            })
+
+        @self.exception_handler(APIError)
+        def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse:
+            # Server error
+            logger.opt(exception=exc).error("Logging exception traceback")
+
+            if exc.internal_message:
+                logger.info(exc.internal_message)
+
+            return JSONResponse(status_code = exc.code, content={
+                'message': exc.message,
+                'code': exc.code,
+                'type': exc.__class__.__name__,
+                'param': exc.param,
+            })
+
+        @self.exception_handler(APIStatusError)
+        def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse:
+            # User error
+            if exc.internal_message:
+                logger.info(exc.internal_message)
+
+            return JSONResponse(status_code = exc.code, content={
+                'message': exc.message,
+                'code': exc.code,
+                'type': exc.__class__.__name__,
+                'param': exc.param,
+            })
+
+        @self.middleware("http")
+        async def log_requests(request: Request, call_next):
+            """Log request and response details at DEBUG level around each call.
+
+            Logs the path, method, headers, query params and body before
+            passing the request on, then the response status code and headers.
+            """
+            logger.debug(f"Request path: {request.url.path}")
+            logger.debug(f"Request method: {request.method}")
+            # NOTE(review): headers are logged verbatim; if clients send
+            # credentials in headers they will appear in DEBUG logs - confirm
+            # this is acceptable.
+            logger.debug(f"Request headers: {request.headers}")
+            logger.debug(f"Request query params: {request.query_params}")
+            # The f-string is evaluated before logger.debug is called, so the
+            # body is awaited and read on every request regardless of log level.
+            logger.debug(f"Request body: {await request.body()}")
+
+            response = await call_next(request)
+
+            logger.debug(f"Response status code: {response.status_code}")
+            logger.debug(f"Response headers: {response.headers}")
+
+            return response
+
        @self.get('/v1/billing/usage')
        @self.get('/v1/dashboard/billing/usage')
        async def handle_billing_usage():
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 fastapi
 uvicorn
+loguru
 # piper-tts
 piper-tts==1.2.0
 # xtts
--- a/speech.py
+++ b/speech.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import os
+import sys
 import re
 import subprocess
 import tempfile
@ -8,6 +9,7 @@ import yaml
 from fastapi.responses import StreamingResponse
 import uvicorn
 from pydantic import BaseModel
+from loguru import logger

 # for parler
 try:
@ -18,14 +20,14 @@ try:
    logging.set_verbosity_error()
    has_parler_tts = True
 except ImportError:
-    print("No parler support found")
+    logger.info("No parler support found")
    has_parler_tts = False

-import openedai
+from openedai import OpenAIStub, BadRequestError

 xtts = None
 args = None
-app = openedai.OpenAIStub()
+app = OpenAIStub()

 class xtts_wrapper():
    def __init__(self, model_name, device):
@ -69,17 +71,24 @@ def default_exists(filename: str):
    if not os.path.exists(filename):
        basename, ext = os.path.splitext(filename)
        default = f"{basename}.default{ext}"
+        
+        logger.info(f"{filename} does not exist, setting defaults from {default}")
+
        with open(default, 'r') as from_file:
            with open(filename, 'w') as to_file:
                to_file.write(from_file.read())

 # Read pre process map on demand so it can be changed without restarting the server
 def preprocess(raw_input):
+    """Apply the regex substitutions from config/pre_process_map.yaml to raw_input.
+
+    Each entry of the map is unpacked as a (pattern, replacement) pair and
+    applied in file order with re.sub; the result is then stripped of
+    leading and trailing whitespace and returned.
+    """
+    # Wrapping the string in a list makes the f-string show its repr, so
+    # whitespace and escape characters are visible in the log.
+    logger.debug(f"preprocess: before: {[raw_input]}")
+    # Create the map from its .default copy if it does not exist yet.
     default_exists('config/pre_process_map.yaml')
     with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
         pre_process_map = yaml.safe_load(file)
         for a, b in pre_process_map:
             raw_input = re.sub(a, b, raw_input)
+    
+    raw_input = raw_input.strip()
+    logger.debug(f"preprocess: after: {[raw_input]}")
     return raw_input

 # Read voice map on demand so it can be changed without restarting the server
@ -87,7 +96,15 @@ def map_voice_to_speaker(voice: str, model: str):
    default_exists('config/voice_to_speaker.yaml')
    with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
        voice_map = yaml.safe_load(file)
-        return (voice_map[model][voice]['model'], voice_map[model][voice]['speaker'], voice_map[model][voice].get('language', 'en'))
+        try:
+            m = voice_map[model][voice]['model']
+            s = voice_map[model][voice]['speaker']
+            l = voice_map[model][voice].get('language', 'en')
+
+        except KeyError as e:
+            raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
+        
+        return (m, s, l)

 class GenerateSpeechRequest(BaseModel):
    model: str = "tts-1" # or "tts-1-hd"
@ -117,7 +134,14 @@ def build_ffmpeg_args(response_format, input_format, sample_rate):
@app.post("/v1/audio/speech", response_class=StreamingResponse)
 async def generate_speech(request: GenerateSpeechRequest):
    global xtts, args
+    if len(request.input) < 1:
+        raise BadRequestError("Empty Input", param='input')
+
    input_text = preprocess(request.input)
+
+    if len(input_text) < 1:
+        raise BadRequestError("Input text empty after preprocess.", param='input')
+
    model = request.model
    voice = request.voice
    response_format = request.response_format
@ -188,6 +212,8 @@ async def generate_speech(request: GenerateSpeechRequest):
                speed = 1.0

            tts_io_out = xtts.tts(text=input_text, speaker_wav=speaker, speed=speed, language=language)
+    else:
+        raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')

    # Pipe the output from piper/xtts to the input of ffmpeg
    ffmpeg_args.extend(["-"])
@ -205,9 +231,13 @@ if __name__ == "__main__":
    parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
    parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
    parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
+    parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")

    args = parser.parse_args()

+    logger.remove()
+    logger.add(sink=sys.stderr, level=args.log_level)
+
    if args.xtts_device != "none":
        from TTS.api import TTS

--- a/startup.bat
+++ b/startup.bat
@ -5,4 +5,4 @@ set /p < speech.env
 call download_voices_tts-1.bat
 call download_voices_tts-1-hd.bat %PRELOAD_MODEL%

-python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL%
+python speech.py %PRELOAD_MODEL:+--preload %PRELOAD_MODEL% %OPENEDAI_LOG_LEVEL:+--log-level %OPENEDAI_LOG_LEVEL%
--- a/startup.sh
+++ b/startup.sh
@ -5,4 +5,4 @@
 bash download_voices_tts-1.sh
 bash download_voices_tts-1-hd.sh $PRELOAD_MODEL

-python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $@
+python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} ${OPENEDAI_LOG_LEVEL:+--log-level $OPENEDAI_LOG_LEVEL} $@