open-webui/backend/apps/audio/main.py

import os
import logging
from fastapi import (
    FastAPI,
    Request,
    Depends,
    HTTPException,
    status,
    UploadFile,
    File,
    Form,
)
from fastapi.responses import StreamingResponse, JSONResponse, FileResponse

from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from typing import List
import uuid
import requests
import hashlib
from pathlib import Path
import json

from constants import ERROR_MESSAGES
from utils.utils import (
    decode_token,
    get_current_user,
    get_verified_user,
    get_admin_user,
)
from utils.misc import calculate_sha256


from config import (
    SRC_LOG_LEVELS,
    CACHE_DIR,
    UPLOAD_DIR,
    WHISPER_MODEL,
    WHISPER_MODEL_DIR,
    WHISPER_MODEL_AUTO_UPDATE,
    DEVICE_TYPE,
    AUDIO_STT_OPENAI_API_BASE_URL,
    AUDIO_STT_OPENAI_API_KEY,
    AUDIO_TTS_OPENAI_API_BASE_URL,
    AUDIO_TTS_OPENAI_API_KEY,
    AUDIO_TTS_API_KEY,
    AUDIO_STT_ENGINE,
    AUDIO_STT_MODEL,
    AUDIO_TTS_ENGINE,
    AUDIO_TTS_MODEL,
    AUDIO_TTS_VOICE,
    AppConfig,
)

# Module logger; its level comes from the "AUDIO" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["AUDIO"])

# FastAPI application that owns the audio endpoints defined in this module.
# CORS is fully open here: any origin, method and header is accepted, and
# credentials are allowed.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mutable settings holder: the handlers below read it on every request and
# POST /config/update overwrites its attributes.
# NOTE(review): AppConfig is defined in `config`; whether assignments are
# persisted beyond this process cannot be told from this file — confirm.
app.state.config = AppConfig()

# Speech-to-text settings, seeded from the imported AUDIO_STT_* values.
app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
app.state.config.STT_MODEL = AUDIO_STT_MODEL

# Text-to-speech settings, seeded from the imported AUDIO_TTS_* values.
app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL
app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY
app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
app.state.config.TTS_MODEL = AUDIO_TTS_MODEL
app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY

# Device handed to the local Whisper model: "cuda" only when DEVICE_TYPE is
# exactly "cuda"; any other value (including an empty/unset one) means "cpu".
whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
log.info(f"whisper_device_type: {whisper_device_type}")

# Directory where /speech stores generated audio (<sha256>.mp3) next to the
# request that produced it (<sha256>.json). Created at import time.
SPEECH_CACHE_DIR = Path(CACHE_DIR).joinpath("./audio/speech/")
SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)


class TTSConfigForm(BaseModel):
    # Text-to-speech half of the POST /config/update payload. Every field is a
    # required string that update_audio_config copies onto app.state.config.
    OPENAI_API_BASE_URL: str  # base URL; /speech appends "/audio/speech" to it
    OPENAI_API_KEY: str  # sent as the Bearer token when ENGINE is "openai"
    API_KEY: str  # sent as the "xi-api-key" header when ENGINE is "elevenlabs"
    ENGINE: str  # the handlers in this file recognise "openai" and "elevenlabs"
    MODEL: str  # forwarded as "model" (openai) or "model_id" (elevenlabs)
    VOICE: str  # stored in the config; not read by any handler in this file


class STTConfigForm(BaseModel):
    # Speech-to-text half of the POST /config/update payload. Every field is a
    # required string that update_audio_config copies onto app.state.config.
    OPENAI_API_BASE_URL: str  # base URL; "/audio/transcriptions" is appended
    OPENAI_API_KEY: str  # sent as the Bearer token when ENGINE is "openai"
    ENGINE: str  # "" selects the local faster-whisper path, "openai" the remote API
    MODEL: str  # sent as "model" to the remote API; the local path uses WHISPER_MODEL


class AudioConfigUpdateForm(BaseModel):
    # Request body of POST /config/update: both sections are required and are
    # applied together by update_audio_config.
    tts: TTSConfigForm  # text-to-speech settings
    stt: STTConfigForm  # speech-to-text settings


from pydub import AudioSegment
from pydub.utils import mediainfo


def is_mp4_audio(file_path):
    """Report whether ``file_path`` is an existing file holding MP4 (AAC) audio.

    When the path is not a regular file a notice is printed and False is
    returned. Otherwise the file is probed with ``mediainfo`` and the result
    is True only if the codec name, codec type and codec tag are ``aac``,
    ``audio`` and ``mp4a`` respectively.
    """
    if os.path.isfile(file_path):
        probe = mediainfo(file_path)
        expected = (
            ("codec_name", "aac"),
            ("codec_type", "audio"),
            ("codec_tag_string", "mp4a"),
        )
        # all() stops at the first mismatch, like the original `and` chain.
        return all(probe.get(field) == value for field, value in expected)

    print(f"File not found: {file_path}")
    return False


def convert_mp4_to_wav(file_path, output_path):
    """Decode the MP4 audio at ``file_path`` and write it as WAV to ``output_path``."""
    AudioSegment.from_file(file_path, format="mp4").export(output_path, format="wav")
    print(f"Converted {file_path} to {output_path}")


@app.get("/config")
async def get_audio_config(user=Depends(get_admin_user)):
    # Returns the current TTS and STT settings (API keys included) exactly as
    # held in app.state.config. Access is gated by the get_admin_user dependency.
    cfg = app.state.config

    tts_settings = {
        "OPENAI_API_BASE_URL": cfg.TTS_OPENAI_API_BASE_URL,
        "OPENAI_API_KEY": cfg.TTS_OPENAI_API_KEY,
        "API_KEY": cfg.TTS_API_KEY,
        "ENGINE": cfg.TTS_ENGINE,
        "MODEL": cfg.TTS_MODEL,
        "VOICE": cfg.TTS_VOICE,
    }
    stt_settings = {
        "OPENAI_API_BASE_URL": cfg.STT_OPENAI_API_BASE_URL,
        "OPENAI_API_KEY": cfg.STT_OPENAI_API_KEY,
        "ENGINE": cfg.STT_ENGINE,
        "MODEL": cfg.STT_MODEL,
    }
    return {"tts": tts_settings, "stt": stt_settings}


@app.post("/config/update")
async def update_audio_config(
    form_data: AudioConfigUpdateForm, user=Depends(get_admin_user)
):
    # Overwrites every TTS and STT setting in app.state.config with the values
    # from the submitted form, then echoes the stored settings back in the
    # same shape GET /config uses. Gated by the get_admin_user dependency.
    cfg = app.state.config
    tts_form = form_data.tts
    stt_form = form_data.stt

    # Text-to-speech settings.
    cfg.TTS_OPENAI_API_BASE_URL = tts_form.OPENAI_API_BASE_URL
    cfg.TTS_OPENAI_API_KEY = tts_form.OPENAI_API_KEY
    cfg.TTS_API_KEY = tts_form.API_KEY
    cfg.TTS_ENGINE = tts_form.ENGINE
    cfg.TTS_MODEL = tts_form.MODEL
    cfg.TTS_VOICE = tts_form.VOICE

    # Speech-to-text settings.
    cfg.STT_OPENAI_API_BASE_URL = stt_form.OPENAI_API_BASE_URL
    cfg.STT_OPENAI_API_KEY = stt_form.OPENAI_API_KEY
    cfg.STT_ENGINE = stt_form.ENGINE
    cfg.STT_MODEL = stt_form.MODEL

    # Read the values back from the config rather than from the form so the
    # response reflects what is actually stored.
    tts_settings = {
        "OPENAI_API_BASE_URL": cfg.TTS_OPENAI_API_BASE_URL,
        "OPENAI_API_KEY": cfg.TTS_OPENAI_API_KEY,
        "API_KEY": cfg.TTS_API_KEY,
        "ENGINE": cfg.TTS_ENGINE,
        "MODEL": cfg.TTS_MODEL,
        "VOICE": cfg.TTS_VOICE,
    }
    stt_settings = {
        "OPENAI_API_BASE_URL": cfg.STT_OPENAI_API_BASE_URL,
        "OPENAI_API_KEY": cfg.STT_OPENAI_API_KEY,
        "ENGINE": cfg.STT_ENGINE,
        "MODEL": cfg.STT_MODEL,
    }
    return {"tts": tts_settings, "stt": stt_settings}


def _parse_speech_payload(body):
    """Decode the raw /speech request body into a JSON object.

    Args:
        body: The request body as bytes.

    Returns:
        The decoded payload as a dict.

    Raises:
        HTTPException: 400 when the body is not UTF-8 JSON or is not an object.
    """
    try:
        payload = json.loads(body.decode("utf-8"))
    except ValueError as e:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors.
        log.exception(e)
        raise HTTPException(status_code=400, detail="Invalid JSON payload") from e

    if not isinstance(payload, dict):
        raise HTTPException(status_code=400, detail="Invalid JSON payload")
    return payload


def _store_speech(r, file_path, file_body_path, payload):
    """Write the provider's audio and the request that produced it to the cache.

    The audio is streamed into a uniquely named temporary file next to
    ``file_path`` and moved into place with ``os.replace`` only once it is
    complete, so the cache never contains (or serves) a truncated mp3.

    Args:
        r: The successful ``requests`` response carrying the audio.
        file_path: Final ``Path`` of the cached mp3.
        file_body_path: Path of the JSON sidecar describing the request.
        payload: The JSON-serialisable request payload to record.
    """
    tmp_path = file_path.with_name(f"{file_path.name}.{uuid.uuid4().hex}.part")
    try:
        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        os.replace(tmp_path, file_path)
    except Exception:
        # Do not leave half-written temporaries behind.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    with open(file_body_path, "w") as f:
        json.dump(payload, f)


def _speech_upstream_error(r, e):
    """Build the HTTPException reported when a TTS provider call fails.

    Args:
        r: The provider response, or None when no response was received.
        e: The exception that was caught.

    Returns:
        An HTTPException whose detail is the provider's error message when one
        can be extracted, and whose status is the provider's status only when
        that status is itself an error (otherwise 500).
    """
    error_detail = "Open WebUI: Server Connection Error"
    status_code = 500
    if r is not None:
        try:
            res = r.json()
            if "error" in res:
                error_detail = f"External: {res['error']['message']}"
        except Exception:
            error_detail = f"External: {e}"
        # A failure after a successful provider call (e.g. a disk error) must
        # not be reported with the provider's 2xx status.
        if r.status_code >= 400:
            status_code = r.status_code
    return HTTPException(status_code=status_code, detail=error_detail)


@app.post("/speech")
async def speech(request: Request, user=Depends(get_verified_user)):
    # Text-to-speech endpoint. The SHA-256 of the raw request body is the cache
    # key: a cached <hash>.mp3 is returned directly, otherwise the configured
    # engine ("openai" or "elevenlabs") is called, the audio is cached together
    # with a <hash>.json record of the request, and the file is returned.
    # For any other engine value nothing is returned, as before.
    #
    # Raises HTTPException 400 for a body that is not a JSON object (or, for
    # elevenlabs, lacks "input"), and the provider's error status / 500 when
    # the provider call or the cache write fails.
    #
    # NOTE(review): the cache key ignores the configured engine and model, and
    # the blocking `requests` calls run inside an async handler.
    body = await request.body()
    name = hashlib.sha256(body).hexdigest()

    file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")
    file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")

    # Check if the file already exists in the cache
    if file_path.is_file():
        return FileResponse(file_path)

    if app.state.config.TTS_ENGINE == "openai":
        payload = _parse_speech_payload(body)
        # The configured model always wins over whatever the client sent.
        payload["model"] = app.state.config.TTS_MODEL
        body = json.dumps(payload).encode("utf-8")

        headers = {
            "Authorization": f"Bearer {app.state.config.TTS_OPENAI_API_KEY}",
            "Content-Type": "application/json",
        }

        r = None
        try:
            r = requests.post(
                url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",
                data=body,
                headers=headers,
                stream=True,
            )

            r.raise_for_status()

            _store_speech(r, file_path, file_body_path, payload)

            # Return the saved file
            return FileResponse(file_path)

        except Exception as e:
            log.exception(e)
            raise _speech_upstream_error(r, e) from e

    elif app.state.config.TTS_ENGINE == "elevenlabs":
        payload = _parse_speech_payload(body)
        if "input" not in payload:
            raise HTTPException(
                status_code=400, detail="Invalid payload: 'input' is required"
            )

        voice_id = payload.get("voice", "")
        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": app.state.config.TTS_API_KEY,
        }

        data = {
            "text": payload["input"],
            "model_id": app.state.config.TTS_MODEL,
            "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
        }

        # Initialised before the try so the handler can tell "no response"
        # apart from "error response" when the request itself fails.
        r = None
        try:
            r = requests.post(url, json=data, headers=headers)

            r.raise_for_status()

            _store_speech(r, file_path, file_body_path, payload)

            # Return the saved file
            return FileResponse(file_path)

        except Exception as e:
            log.exception(e)
            raise _speech_upstream_error(r, e) from e


@app.post("/transcriptions")
def transcribe(
    file: UploadFile = File(...),
    user=Depends(get_current_user),
):
    # Speech-to-text endpoint. Accepts an "audio/mpeg" or "audio/wav" upload,
    # stores it under CACHE_DIR/audio/transcriptions/<uuid>.<ext>, transcribes
    # it and saves the result as <uuid>.json before returning it.
    #
    # STT_ENGINE == ""       -> local faster-whisper model, returns {"text": ...}
    # STT_ENGINE == "openai" -> POST to <STT_OPENAI_API_BASE_URL>/audio/transcriptions
    #                           and return the provider's JSON unchanged
    # For any other engine value nothing is returned, as before.
    #
    # Raises HTTPException 400 for an unsupported content type or any local
    # failure; failures of the remote API keep the provider's error status and
    # message (500 when there was no error response).
    log.info(f"file.content_type: {file.content_type}")

    if file.content_type not in ["audio/mpeg", "audio/wav"]:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
        )

    try:
        ext = file.filename.split(".")[-1]

        transcription_id = uuid.uuid4()
        filename = f"{transcription_id}.{ext}"

        file_dir = f"{CACHE_DIR}/audio/transcriptions"
        os.makedirs(file_dir, exist_ok=True)
        file_path = f"{file_dir}/{filename}"

        log.debug(f"transcription upload stored as: {filename}")

        contents = file.file.read()
        with open(file_path, "wb") as f:
            f.write(contents)

        if app.state.config.STT_ENGINE == "":
            # Imported lazily so the model library is only loaded when the
            # local engine is actually used.
            from faster_whisper import WhisperModel

            whisper_kwargs = {
                "model_size_or_path": WHISPER_MODEL,
                "device": whisper_device_type,
                "compute_type": "int8",
                "download_root": WHISPER_MODEL_DIR,
                "local_files_only": not WHISPER_MODEL_AUTO_UPDATE,
            }

            log.debug(f"whisper_kwargs: {whisper_kwargs}")

            try:
                model = WhisperModel(**whisper_kwargs)
            except Exception:
                # Most likely the model is not available locally yet: retry
                # once with downloading allowed.
                log.warning(
                    "WhisperModel initialization failed, attempting download with local_files_only=False"
                )
                whisper_kwargs["local_files_only"] = False
                model = WhisperModel(**whisper_kwargs)

            segments, info = model.transcribe(file_path, beam_size=5)
            log.info(
                "Detected language '%s' with probability %f",
                info.language,
                info.language_probability,
            )

            transcript = "".join(segment.text for segment in segments)

            data = {"text": transcript.strip()}

            # save the transcript to a json file
            transcript_file = f"{file_dir}/{transcription_id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            log.debug(f"transcription result: {data}")

            return data

        elif app.state.config.STT_ENGINE == "openai":
            if is_mp4_audio(file_path):
                log.debug("upload is MP4 audio, converting to WAV")
                # Swap only the file's own extension; a plain str.replace would
                # also rewrite any ".wav" occurring earlier in the path.
                mp4_path = f"{os.path.splitext(file_path)[0]}.mp4"
                os.rename(file_path, mp4_path)
                # Convert MP4 audio file to WAV format
                convert_mp4_to_wav(mp4_path, file_path)

            headers = {"Authorization": f"Bearer {app.state.config.STT_OPENAI_API_KEY}"}
            data = {"model": app.state.config.STT_MODEL}

            log.debug(f"transcription request: file={filename} data={data}")

            r = None
            try:
                # The context manager closes the upload handle even when the
                # request fails.
                with open(file_path, "rb") as audio_file:
                    r = requests.post(
                        url=f"{app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",
                        headers=headers,
                        files={"file": (filename, audio_file)},
                        data=data,
                    )

                r.raise_for_status()

                data = r.json()

                # save the transcript to a json file
                transcript_file = f"{file_dir}/{transcription_id}.json"
                with open(transcript_file, "w") as f:
                    json.dump(data, f)

                log.debug(f"transcription result: {data}")
                return data
            except Exception as e:
                log.exception(e)
                error_detail = "Open WebUI: Server Connection Error"
                status_code = 500
                if r is not None:
                    try:
                        res = r.json()
                        if "error" in res:
                            error_detail = f"External: {res['error']['message']}"
                    except Exception:
                        error_detail = f"External: {e}"
                    # Never answer with the provider's 2xx status when the
                    # failure happened after a successful call.
                    if r.status_code >= 400:
                        status_code = r.status_code

                raise HTTPException(
                    status_code=status_code,
                    detail=error_detail,
                ) from e

    except HTTPException:
        # Already carries the right status and detail (e.g. the provider error
        # built above); do not flatten it into the generic 400 below.
        raise
    except Exception as e:
        log.exception(e)

        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )


def get_available_models() -> List[dict]:
    """List the TTS models offered by the currently configured engine.

    Returns:
        For the "openai" engine, the fixed entries ``tts-1`` and ``tts-1-hd``
        (``id`` only). For "elevenlabs", one ``{"name", "id"}`` dict per model
        returned by the ElevenLabs ``/v1/models`` endpoint. An empty list for
        any other engine, or when the ElevenLabs request fails (the failure is
        logged, not raised).
    """
    if app.state.config.TTS_ENGINE == "openai":
        return [{"id": "tts-1"}, {"id": "tts-1-hd"}]
    elif app.state.config.TTS_ENGINE == "elevenlabs":
        headers = {
            "xi-api-key": app.state.config.TTS_API_KEY,
            "Content-Type": "application/json",
        }

        try:
            response = requests.get(
                "https://api.elevenlabs.io/v1/models", headers=headers
            )
            response.raise_for_status()
            models = response.json()
            return [
                {"name": model["name"], "id": model["model_id"]} for model in models
            ]
        except requests.RequestException as e:
            # This is the models lookup; the message previously said "voices".
            log.error(f"Error fetching models: {str(e)}")
    return []


@app.get("/models")
async def get_models(user=Depends(get_verified_user)):
    # Wraps the list from get_available_models() under the "models" key.
    available = get_available_models()
    return {"models": available}


def get_available_voices() -> List[dict]:
    """List the TTS voices offered by the currently configured engine.

    Returns:
        ``{"name", "id"}`` dicts: six fixed voices for the "openai" engine, or
        the entries under ``"voices"`` from the ElevenLabs ``/v1/voices``
        endpoint for "elevenlabs". An empty list for any other engine, or when
        the ElevenLabs request fails (the failure is logged, not raised).
    """
    engine = app.state.config.TTS_ENGINE

    if engine == "openai":
        builtin = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
        return [{"name": voice, "id": voice} for voice in builtin]

    if engine == "elevenlabs":
        headers = {
            "xi-api-key": app.state.config.TTS_API_KEY,
            "Content-Type": "application/json",
        }

        try:
            response = requests.get(
                "https://api.elevenlabs.io/v1/voices", headers=headers
            )
            response.raise_for_status()
            listing = response.json()
            return [
                {"name": entry["name"], "id": entry["voice_id"]}
                for entry in listing.get("voices", [])
            ]
        except requests.RequestException as e:
            log.error(f"Error fetching voices: {str(e)}")

    return []


@app.get("/voices")
async def get_voices(user=Depends(get_verified_user)):
    # Wraps the list from get_available_voices() under the "voices" key.
    available = get_available_voices()
    return {"voices": available}
no internet connection for whisper if you use docker 2024-02-13 14:11:53 +00:00			`import os`
Migrate to python logging module with env var control. 2024-03-20 23:11:36 +00:00			`import logging`
feat: whisper support 2024-02-11 08:17:50 +00:00			`from fastapi import (`
			`FastAPI,`
			`Request,`
			`Depends,`
			`HTTPException,`
			`status,`
			`UploadFile,`
			`File,`
			`Form,`
			`)`
refac: audio 2024-04-20 20:15:59 +00:00			`from fastapi.responses import StreamingResponse, JSONResponse, FileResponse`

feat: whisper support 2024-02-11 08:17:50 +00:00			`from fastapi.middleware.cors import CORSMiddleware`
refac 2024-04-20 20:21:52 +00:00			`from pydantic import BaseModel`

refac 2024-08-02 17:24:47 +00:00			`from typing import List`
refac 2024-06-07 03:44:42 +00:00			`import uuid`
refac: audio 2024-04-20 20:15:59 +00:00			`import requests`
			`import hashlib`
			`from pathlib import Path`
			`import json`

feat: whisper support 2024-02-11 08:17:50 +00:00			`from constants import ERROR_MESSAGES`
			`from utils.utils import (`
			`decode_token,`
			`get_current_user,`
			`get_verified_user,`
			`get_admin_user,`
			`)`
			`from utils.misc import calculate_sha256`

refac 2024-08-02 17:24:47 +00:00
chore: py formatting 2024-03-31 08:13:39 +00:00			`from config import (`
			`SRC_LOG_LEVELS,`
			`CACHE_DIR,`
			`UPLOAD_DIR,`
			`WHISPER_MODEL,`
			`WHISPER_MODEL_DIR,`
Introduce Whisper model auto-update control. * Introduce WHISPER_MODEL_AUTO_UPDATE env var * Pass local_files_only to WhisperModel() * Handle cases where auto-update is disabled but model is non-existent 2024-04-11 02:30:00 +00:00			`WHISPER_MODEL_AUTO_UPDATE,`
fixes and updates 2024-04-02 12:47:52 +00:00			`DEVICE_TYPE,`
refac: audio 2024-06-08 03:18:48 +00:00			`AUDIO_STT_OPENAI_API_BASE_URL,`
			`AUDIO_STT_OPENAI_API_KEY,`
			`AUDIO_TTS_OPENAI_API_BASE_URL,`
			`AUDIO_TTS_OPENAI_API_KEY,`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`AUDIO_TTS_API_KEY,`
refac: audio 2024-06-08 03:18:48 +00:00			`AUDIO_STT_ENGINE,`
			`AUDIO_STT_MODEL,`
			`AUDIO_TTS_ENGINE,`
			`AUDIO_TTS_MODEL,`
			`AUDIO_TTS_VOICE,`
feat: switch to config proxy, remove config_get/set 2024-05-10 07:03:24 +00:00			`AppConfig,`
chore: py formatting 2024-03-31 08:13:39 +00:00			`)`
Migrate to python logging module with env var control. 2024-03-20 23:11:36 +00:00
			`log = logging.getLogger(__name__)`
			`log.setLevel(SRC_LOG_LEVELS["AUDIO"])`
feat: whisper support 2024-02-11 08:17:50 +00:00
			`app = FastAPI()`
			`app.add_middleware(`
			`CORSMiddleware,`
			`allow_origins=["*"],`
			`allow_credentials=True,`
			`allow_methods=["*"],`
			`allow_headers=["*"],`
			`)`

feat: switch to config proxy, remove config_get/set 2024-05-10 07:03:24 +00:00			`app.state.config = AppConfig()`
refac: audio 2024-06-08 03:18:48 +00:00
			`app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL`
			`app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY`
			`app.state.config.STT_ENGINE = AUDIO_STT_ENGINE`
			`app.state.config.STT_MODEL = AUDIO_STT_MODEL`

			`app.state.config.TTS_OPENAI_API_BASE_URL = AUDIO_TTS_OPENAI_API_BASE_URL`
			`app.state.config.TTS_OPENAI_API_KEY = AUDIO_TTS_OPENAI_API_KEY`
			`app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE`
			`app.state.config.TTS_MODEL = AUDIO_TTS_MODEL`
			`app.state.config.TTS_VOICE = AUDIO_TTS_VOICE`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY`
refac: audio 2024-04-20 20:15:59 +00:00
fixes and updates 2024-04-02 12:47:52 +00:00			`# setting device type for whisper model`
			`whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"`
			`log.info(f"whisper_device_type: {whisper_device_type}")`

refac: audio 2024-04-20 20:15:59 +00:00			`SPEECH_CACHE_DIR = Path(CACHE_DIR).joinpath("./audio/speech/")`
			`SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)`


refac: audio 2024-06-08 03:18:48 +00:00			`class TTSConfigForm(BaseModel):`
			`OPENAI_API_BASE_URL: str`
			`OPENAI_API_KEY: str`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`API_KEY: str`
refac: audio 2024-06-08 03:18:48 +00:00			`ENGINE: str`
			`MODEL: str`
			`VOICE: str`


			`class STTConfigForm(BaseModel):`
			`OPENAI_API_BASE_URL: str`
			`OPENAI_API_KEY: str`
			`ENGINE: str`
			`MODEL: str`


			`class AudioConfigUpdateForm(BaseModel):`
			`tts: TTSConfigForm`
			`stt: STTConfigForm`
refac 2024-04-20 20:21:52 +00:00

fix: safari audio issue 2024-06-08 09:07:19 +00:00			`from pydub import AudioSegment`
			`from pydub.utils import mediainfo`


			`def is_mp4_audio(file_path):`
			`"""Check if the given file is an MP4 audio file."""`
			`if not os.path.isfile(file_path):`
			`print(f"File not found: {file_path}")`
			`return False`

			`info = mediainfo(file_path)`
			`if (`
			`info.get("codec_name") == "aac"`
			`and info.get("codec_type") == "audio"`
			`and info.get("codec_tag_string") == "mp4a"`
			`):`
			`return True`
			`return False`


			`def convert_mp4_to_wav(file_path, output_path):`
			`"""Convert MP4 audio file to WAV format."""`
			`audio = AudioSegment.from_file(file_path, format="mp4")`
			`audio.export(output_path, format="wav")`
			`print(f"Converted {file_path} to {output_path}")`


refac 2024-04-20 20:21:52 +00:00			`@app.get("/config")`
refac: audio 2024-06-08 03:18:48 +00:00			`async def get_audio_config(user=Depends(get_admin_user)):`
refac 2024-04-20 20:21:52 +00:00			`return {`
refac: audio 2024-06-08 03:18:48 +00:00			`"tts": {`
			`"OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL,`
			`"OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY,`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`"API_KEY": app.state.config.TTS_API_KEY,`
refac: audio 2024-06-08 03:18:48 +00:00			`"ENGINE": app.state.config.TTS_ENGINE,`
			`"MODEL": app.state.config.TTS_MODEL,`
			`"VOICE": app.state.config.TTS_VOICE,`
			`},`
			`"stt": {`
			`"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,`
			`"OPENAI_API_KEY": app.state.config.STT_OPENAI_API_KEY,`
			`"ENGINE": app.state.config.STT_ENGINE,`
			`"MODEL": app.state.config.STT_MODEL,`
			`},`
refac 2024-04-20 20:21:52 +00:00			`}`


			`@app.post("/config/update")`
refac: audio 2024-06-08 03:18:48 +00:00			`async def update_audio_config(`
			`form_data: AudioConfigUpdateForm, user=Depends(get_admin_user)`
refac 2024-04-20 20:21:52 +00:00			`):`
refac: audio 2024-06-08 03:18:48 +00:00			`app.state.config.TTS_OPENAI_API_BASE_URL = form_data.tts.OPENAI_API_BASE_URL`
			`app.state.config.TTS_OPENAI_API_KEY = form_data.tts.OPENAI_API_KEY`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`app.state.config.TTS_API_KEY = form_data.tts.API_KEY`
refac: audio 2024-06-08 03:18:48 +00:00			`app.state.config.TTS_ENGINE = form_data.tts.ENGINE`
			`app.state.config.TTS_MODEL = form_data.tts.MODEL`
			`app.state.config.TTS_VOICE = form_data.tts.VOICE`
refac 2024-04-20 20:21:52 +00:00
refac: audio 2024-06-08 03:18:48 +00:00			`app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL`
			`app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY`
			`app.state.config.STT_ENGINE = form_data.stt.ENGINE`
			`app.state.config.STT_MODEL = form_data.stt.MODEL`
refac 2024-04-20 20:21:52 +00:00
			`return {`
refac: audio 2024-06-08 03:18:48 +00:00			`"tts": {`
			`"OPENAI_API_BASE_URL": app.state.config.TTS_OPENAI_API_BASE_URL,`
			`"OPENAI_API_KEY": app.state.config.TTS_OPENAI_API_KEY,`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`"API_KEY": app.state.config.TTS_API_KEY,`
refac: audio 2024-06-08 03:18:48 +00:00			`"ENGINE": app.state.config.TTS_ENGINE,`
			`"MODEL": app.state.config.TTS_MODEL,`
			`"VOICE": app.state.config.TTS_VOICE,`
			`},`
			`"stt": {`
			`"OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,`
			`"OPENAI_API_KEY": app.state.config.STT_OPENAI_API_KEY,`
			`"ENGINE": app.state.config.STT_ENGINE,`
			`"MODEL": app.state.config.STT_MODEL,`
			`},`
refac 2024-04-20 20:21:52 +00:00			`}`


refac: audio 2024-04-20 20:15:59 +00:00			`@app.post("/speech")`
			`async def speech(request: Request, user=Depends(get_verified_user)):`
feat: external openai tts support 2024-04-20 21:00:24 +00:00			`body = await request.body()`
			`name = hashlib.sha256(body).hexdigest()`

			`file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")`
			`file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")`

			`# Check if the file already exists in the cache`
			`if file_path.is_file():`
			`return FileResponse(file_path)`

feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00			`if app.state.config.TTS_ENGINE == "openai":`
			`headers = {}`
			`headers["Authorization"] = f"Bearer {app.state.config.TTS_OPENAI_API_KEY}"`
			`headers["Content-Type"] = "application/json"`

			`try:`
			`body = body.decode("utf-8")`
			`body = json.loads(body)`
			`body["model"] = app.state.config.TTS_MODEL`
			`body = json.dumps(body).encode("utf-8")`
			`except Exception as e:`
			`pass`

			`r = None`
			`try:`
			`r = requests.post(`
			`url=f"{app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",`
			`data=body,`
			`headers=headers,`
			`stream=True,`
			`)`

			`r.raise_for_status()`

			`# Save the streaming content to a file`
			`with open(file_path, "wb") as f:`
			`for chunk in r.iter_content(chunk_size=8192):`
			`f.write(chunk)`

			`with open(file_body_path, "w") as f:`
			`json.dump(json.loads(body.decode("utf-8")), f)`

			`# Return the saved file`
			`return FileResponse(file_path)`

			`except Exception as e:`
			`log.exception(e)`
			`error_detail = "Open WebUI: Server Connection Error"`
			`if r is not None:`
			`try:`
			`res = r.json()`
			`if "error" in res:`
			`error_detail = f"External: {res['error']['message']}"`
			`except:`
			`error_detail = f"External: {e}"`

			`raise HTTPException(`
			`status_code=r.status_code if r != None else 500,`
			`detail=error_detail,`
			`)`

			`elif app.state.config.TTS_ENGINE == "elevenlabs":`
			`payload = None`
			`try:`
			`payload = json.loads(body.decode("utf-8"))`
			`except Exception as e:`
			`log.exception(e)`
enh: ElevenLabs voice name -> ID 2024-07-20 06:56:00 +00:00			`raise HTTPException(status_code=400, detail="Invalid JSON payload")`

refac 2024-08-02 17:24:47 +00:00			`voice_id = payload.get("voice", "")`
enh: ElevenLabs voice name -> ID 2024-07-20 06:56:00 +00:00			`url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"`
feat: elevenlabs tts support 2024-07-19 08:35:05 +00:00
			`headers = {`
			`"Accept": "audio/mpeg",`
			`"Content-Type": "application/json",`
			`"xi-api-key": app.state.config.TTS_API_KEY,`
			`}`

			`data = {`
			`"text": payload["input"],`
			`"model_id": app.state.config.TTS_MODEL,`
			`"voice_settings": {"stability": 0.5, "similarity_boost": 0.5},`
			`}`

			`try:`
			`r = requests.post(url, json=data, headers=headers)`

			`r.raise_for_status()`

			`# Save the streaming content to a file`
			`with open(file_path, "wb") as f:`
			`for chunk in r.iter_content(chunk_size=8192):`
			`f.write(chunk)`

			`with open(file_body_path, "w") as f:`
			`json.dump(json.loads(body.decode("utf-8")), f)`

			`# Return the saved file`
			`return FileResponse(file_path)`

			`except Exception as e:`
			`log.exception(e)`
			`error_detail = "Open WebUI: Server Connection Error"`
			`if r is not None:`
			`try:`
			`res = r.json()`
			`if "error" in res:`
			`error_detail = f"External: {res['error']['message']}"`
			`except:`
			`error_detail = f"External: {e}"`

			`raise HTTPException(`
			`status_code=r.status_code if r != None else 500,`
			`detail=error_detail,`
			`)`
refac: audio 2024-04-20 20:15:59 +00:00
feat: whisper support 2024-02-11 08:17:50 +00:00
refac: audio 2024-04-20 20:15:59 +00:00			`@app.post("/transcriptions")`
feat: whisper support 2024-02-11 08:17:50 +00:00			`def transcribe(`
			`file: UploadFile = File(...),`
			`user=Depends(get_current_user),`
			`):`
Migrate to python logging module with env var control. 2024-03-20 23:11:36 +00:00			`log.info(f"file.content_type: {file.content_type}")`
feat: whisper support 2024-02-11 08:17:50 +00:00
			`if file.content_type not in ["audio/mpeg", "audio/wav"]:`
			`raise HTTPException(`
			`status_code=status.HTTP_400_BAD_REQUEST,`
			`detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,`
			`)`

			`try:`
refac 2024-06-07 03:44:42 +00:00			`ext = file.filename.split(".")[-1]`

			`id = uuid.uuid4()`
			`filename = f"{id}.{ext}"`

			`file_dir = f"{CACHE_DIR}/audio/transcriptions"`
			`os.makedirs(file_dir, exist_ok=True)`
			`file_path = f"{file_dir}/{filename}"`

fix: safari audio issue 2024-06-08 09:07:19 +00:00			`print(filename)`

feat: whisper support 2024-02-11 08:17:50 +00:00			`contents = file.file.read()`
			`with open(file_path, "wb") as f:`
			`f.write(contents)`
			`f.close()`

feat: external stt 2024-06-08 03:31:52 +00:00			`if app.state.config.STT_ENGINE == "":`
refac: lazily load faster_whisper to reduce start up memory usage 2024-07-01 00:13:02 +00:00			`from faster_whisper import WhisperModel`

feat: external stt 2024-06-08 03:31:52 +00:00			`whisper_kwargs = {`
			`"model_size_or_path": WHISPER_MODEL,`
			`"device": whisper_device_type,`
			`"compute_type": "int8",`
			`"download_root": WHISPER_MODEL_DIR,`
			`"local_files_only": not WHISPER_MODEL_AUTO_UPDATE,`
			`}`

			`log.debug(f"whisper_kwargs: {whisper_kwargs}")`

			`try:`
			`model = WhisperModel(**whisper_kwargs)`
			`except:`
			`log.warning(`
			`"WhisperModel initialization failed, attempting download with local_files_only=False"`
			`)`
			`whisper_kwargs["local_files_only"] = False`
			`model = WhisperModel(**whisper_kwargs)`

			`segments, info = model.transcribe(file_path, beam_size=5)`
			`log.info(`
			`"Detected language '%s' with probability %f"`
			`% (info.language, info.language_probability)`
Format fix 2024-04-11 02:44:44 +00:00			`)`
feat: whisper support 2024-02-11 08:17:50 +00:00
feat: external stt 2024-06-08 03:31:52 +00:00			`transcript = "".join([segment.text for segment in list(segments)])`

fix: safari audio issue 2024-06-08 09:07:19 +00:00			`data = {"text": transcript.strip()}`

feat: external stt 2024-06-08 03:31:52 +00:00			`# save the transcript to a json file`
			`transcript_file = f"{file_dir}/{id}.json"`
			`with open(transcript_file, "w") as f:`
fix: safari audio issue 2024-06-08 09:07:19 +00:00			`json.dump(data, f)`
refac 2024-06-08 07:52:19 +00:00
			`print(data)`

			`return data`
feat: whisper support 2024-02-11 08:17:50 +00:00
feat: external stt 2024-06-08 03:31:52 +00:00			`elif app.state.config.STT_ENGINE == "openai":`
fix: safari audio issue 2024-06-08 09:07:19 +00:00			`if is_mp4_audio(file_path):`
			`print("is_mp4_audio")`
			`os.rename(file_path, file_path.replace(".wav", ".mp4"))`
			`# Convert MP4 audio file to WAV format`
			`convert_mp4_to_wav(file_path.replace(".wav", ".mp4"), file_path)`

feat: external stt 2024-06-08 03:31:52 +00:00			`headers = {"Authorization": f"Bearer {app.state.config.STT_OPENAI_API_KEY}"}`
refac 2024-06-07 03:44:42 +00:00
feat: external stt 2024-06-08 03:31:52 +00:00			`files = {"file": (filename, open(file_path, "rb"))}`
Make STT model configurable 2024-06-25 21:46:12 +00:00			`data = {"model": app.state.config.STT_MODEL}`
feat: external stt 2024-06-08 03:31:52 +00:00
fix: safari audio issue 2024-06-08 09:07:19 +00:00			`print(files, data)`

feat: external stt 2024-06-08 03:31:52 +00:00			`r = None`
			`try:`
			`r = requests.post(`
			`url=f"{app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",`
			`headers=headers,`
			`files=files,`
			`data=data,`
			`)`

			`r.raise_for_status()`
refac 2024-06-08 07:52:19 +00:00
			`data = r.json()`
fix: safari audio issue 2024-06-08 09:07:19 +00:00
			`# save the transcript to a json file`
			`transcript_file = f"{file_dir}/{id}.json"`
			`with open(transcript_file, "w") as f:`
			`json.dump(data, f)`

refac 2024-06-08 07:52:19 +00:00			`print(data)`
			`return data`
feat: external stt 2024-06-08 03:31:52 +00:00			`except Exception as e:`
			`log.exception(e)`
			`error_detail = "Open WebUI: Server Connection Error"`
			`if r is not None:`
			`try:`
			`res = r.json()`
			`if "error" in res:`
			`error_detail = f"External: {res['error']['message']}"`
			`except:`
			`error_detail = f"External: {e}"`

			`raise HTTPException(`
			`status_code=r.status_code if r != None else 500,`
			`detail=error_detail,`
			`)`
feat: whisper support 2024-02-11 08:17:50 +00:00
			`except Exception as e:`
Migrate to python logging module with env var control. 2024-03-20 23:11:36 +00:00			`log.exception(e)`
feat: whisper support 2024-02-11 08:17:50 +00:00
			`raise HTTPException(`
			`status_code=status.HTTP_400_BAD_REQUEST,`
			`detail=ERROR_MESSAGES.DEFAULT(e),`
			`)`
enh: ElevenLabs voice name -> ID 2024-07-20 06:56:00 +00:00

refac 2024-08-02 17:24:47 +00:00			`def get_available_models() -> List[dict]:`
			`if app.state.config.TTS_ENGINE == "openai":`
			`return [{"id": "tts-1"}, {"id": "tts-1-hd"}]`
			`elif app.state.config.TTS_ENGINE == "elevenlabs":`
			`headers = {`
			`"xi-api-key": app.state.config.TTS_API_KEY,`
			`"Content-Type": "application/json",`
			`}`

			`try:`
			`response = requests.get(`
			`"https://api.elevenlabs.io/v1/models", headers=headers`
			`)`
			`response.raise_for_status()`
			`models = response.json()`
			`return [`
			`{"name": model["name"], "id": model["model_id"]} for model in models`
			`]`
			`except requests.RequestException as e:`
			`log.error(f"Error fetching voices: {str(e)}")`
			`return []`


			`@app.get("/models")`
			`async def get_models(user=Depends(get_verified_user)):`
			`return {"models": get_available_models()}`


			`def get_available_voices() -> List[dict]:`
			`if app.state.config.TTS_ENGINE == "openai":`
			`return [`
			`{"name": "alloy", "id": "alloy"},`
			`{"name": "echo", "id": "echo"},`
			`{"name": "fable", "id": "fable"},`
			`{"name": "onyx", "id": "onyx"},`
			`{"name": "nova", "id": "nova"},`
			`{"name": "shimmer", "id": "shimmer"},`
			`]`
			`elif app.state.config.TTS_ENGINE == "elevenlabs":`
			`headers = {`
			`"xi-api-key": app.state.config.TTS_API_KEY,`
			`"Content-Type": "application/json",`
			`}`

			`try:`
			`response = requests.get(`
			`"https://api.elevenlabs.io/v1/voices", headers=headers`
			`)`
			`response.raise_for_status()`
			`voices_data = response.json()`

			`voices = []`
			`for voice in voices_data.get("voices", []):`
			`voices.append({"name": voice["name"], "id": voice["voice_id"]})`
			`return voices`
			`except requests.RequestException as e:`
			`log.error(f"Error fetching voices: {str(e)}")`

			`return []`


enh: ElevenLabs voice name -> ID 2024-07-20 06:56:00 +00:00			`@app.get("/voices")`
			`async def get_voices(user=Depends(get_verified_user)):`
refac 2024-08-02 17:24:47 +00:00			`return {"voices": get_available_voices()}`