fixes and updates

This commit is contained in:
Jannik Streidl 2024-04-02 14:47:52 +02:00
commit 9bcb37ea10
6 changed files with 150 additions and 81 deletions

View File

@ -1,5 +1,4 @@
#
name: Create and publish a Docker image
name: Create and publish Docker images with specific build args
# Configures this workflow to run every time a change is pushed to the branch called `release`.
on:
@ -23,7 +22,7 @@ jobs:
permissions:
contents: read
packages: write
#
steps:
- name: Checkout repository
uses: actions/checkout@v4
@ -41,8 +40,8 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for Docker images
id: meta
- name: Extract metadata for Docker images (default latest tag)
id: meta-latest
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
@ -53,14 +52,31 @@ jobs:
type=sha,prefix=git-
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
flavor: |
latest=${{ github.ref == 'refs/heads/main' }}
latest=true
- name: Build and push Docker image
- name: Build and push Docker image (latest)
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta-latest.outputs.tags }}
labels: ${{ steps.meta-latest.outputs.labels }}
- name: Build and push Docker image with CUDA
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cuda
build-args: USE_CUDA=true
- name: Build and push Docker image with Ollama
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:with-ollama
build-args: USE_OLLAMA=true

View File

@ -1,77 +1,110 @@
# syntax=docker/dockerfile:1
# Initialize device type args
# use build args in the docker build commmand with --build-arg="BUILDARG=true"
ARG USE_CUDA=false
ARG USE_OLLAMA=false
# Tested with cu117 for CUDA 11 and cu121 for CUDA 12 (default)
ARG USE_CUDA_VER=cu121
# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers
# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard
# for better performance and multilangauge support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB)
# IMPORTANT: If you change the default model (all-MiniLM-L6-v2) and vice versa, you aren't able to use RAG Chat with your previous documents loaded in the WebUI! You need to re-embed them.
ARG USE_EMBEDDING_MODEL=all-MiniLM-L6-v2
FROM node:alpine as build
######## WebUI frontend ########
FROM node:21-alpine3.19 as build
WORKDIR /app
# wget embedding model weight from alpine (does not exist from slim-buster)
RUN wget "https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz" -O - | \
tar -xzf - -C /app
COPY package.json package-lock.json ./
RUN npm ci
COPY . .
RUN npm run build
######## WebUI backend ########
FROM python:3.11-slim-bookworm as base
ENV ENV=prod
ENV PORT ""
# Use args
ARG USE_CUDA
ARG USE_OLLAMA
ARG USE_CUDA_VER
ARG USE_EMBEDDING_MODEL
ENV OLLAMA_BASE_URL "/ollama"
## Basis ##
ENV ENV=prod \
PORT=8080 \
# pass build args to the build
USE_OLLAMA_DOCKER=${USE_OLLAMA} \
USE_CUDA_DOCKER=${USE_CUDA} \
USE_CUDA_DOCKER_VER=${USE_CUDA_VER} \
USE_EMBEDDING_MODEL_DOCKER=${USE_EMBEDDING_MODEL}
ENV OPENAI_API_BASE_URL ""
ENV OPENAI_API_KEY ""
## Basis URL Config ##
ENV OLLAMA_BASE_URL="/ollama" \
OPENAI_API_BASE_URL=""
ENV WEBUI_SECRET_KEY ""
## API Key and Security Config ##
ENV OPENAI_API_KEY="" \
WEBUI_SECRET_KEY="" \
SCARF_NO_ANALYTICS=true \
DO_NOT_TRACK=true
ENV SCARF_NO_ANALYTICS true
ENV DO_NOT_TRACK true
#### Other models #########################################################
## whisper TTS model settings ##
ENV WHISPER_MODEL="base" \
WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models"
######## Preloaded models ########
# whisper TTS Settings
ENV WHISPER_MODEL="base"
ENV WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models"
# RAG Embedding Model Settings
# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers
# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard
# for better persormance and multilangauge support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB)
# IMPORTANT: If you change the default model (all-MiniLM-L6-v2) and vice versa, you aren't able to use RAG Chat with your previous documents loaded in the WebUI! You need to re-embed them.
ENV RAG_EMBEDDING_MODEL="all-MiniLM-L6-v2"
# device type for whisper tts and embbeding models - "cpu" (default), "cuda" (nvidia gpu and CUDA required) or "mps" (apple silicon) - choosing this right can lead to better performance
ENV RAG_EMBEDDING_MODEL_DEVICE_TYPE="cpu"
ENV RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models"
ENV SENTENCE_TRANSFORMERS_HOME $RAG_EMBEDDING_MODEL_DIR
######## Preloaded models ########
## RAG Embedding model settings ##
ENV RAG_EMBEDDING_MODEL="$USE_EMBEDDING_MODEL_DOCKER" \
RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models" \
SENTENCE_TRANSFORMERS_HOME="/app/backend/data/cache/embedding/models"
#### Other models ##########################################################
WORKDIR /app/backend
# install python dependencies
COPY ./backend/requirements.txt ./requirements.txt
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
RUN if [ "$USE_CUDA" = "true" ]; then \
# If you use CUDA the whisper and embedding modell will be downloaded on first use
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/$USE_CUDA_DOCKER_VER --no-cache-dir && \
pip3 install -r requirements.txt --no-cache-dir && \
python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])"; \
else \
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir && \
pip3 install -r requirements.txt --no-cache-dir && \
python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])" && \
python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device='cpu')"; \
fi
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
RUN pip3 install -r requirements.txt --no-cache-dir
# Install pandoc and netcat
# RUN python -c "import pypandoc; pypandoc.download_pandoc()"
RUN apt-get update \
&& apt-get install -y pandoc netcat-openbsd \
&& rm -rf /var/lib/apt/lists/*
RUN if [ "$USE_OLLAMA" = "true" ]; then \
apt-get update && \
# Install pandoc and netcat
apt-get install -y --no-install-recommends pandoc netcat-openbsd && \
# for RAG OCR
apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \
# install helper tools
apt-get install -y --no-install-recommends curl && \
# install ollama
curl -fsSL https://ollama.com/install.sh | sh && \
# cleanup
rm -rf /var/lib/apt/lists/*; \
else \
apt-get update && \
# Install pandoc and netcat
apt-get install -y --no-install-recommends pandoc netcat-openbsd && \
# for RAG OCR
apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \
# cleanup
rm -rf /var/lib/apt/lists/*; \
fi
# preload embedding model
RUN python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device=os.environ['RAG_EMBEDDING_MODEL_DEVICE_TYPE'])"
# preload tts model
RUN python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='auto', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])"
# copy embedding weight from build
RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
COPY --from=build /app/onnx /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx
# RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
# COPY --from=build /app/onnx /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx
# copy built frontend files
COPY --from=build /app/build /app/build
@ -81,4 +114,6 @@ COPY --from=build /app/package.json /app/package.json
# copy backend files
COPY ./backend .
CMD [ "bash", "start.sh"]
EXPOSE 8080
CMD [ "bash", "start.sh"]

View File

@ -28,6 +28,7 @@ from config import (
UPLOAD_DIR,
WHISPER_MODEL,
WHISPER_MODEL_DIR,
DEVICE_TYPE,
)
log = logging.getLogger(__name__)
@ -42,6 +43,10 @@ app.add_middleware(
allow_headers=["*"],
)
# setting device type for whisper model
whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
log.info(f"whisper_device_type: {whisper_device_type}")
@app.post("/transcribe")
def transcribe(
@ -66,7 +71,7 @@ def transcribe(
model = WhisperModel(
WHISPER_MODEL,
device="auto",
device=whisper_device_type,
compute_type="int8",
download_root=WHISPER_MODEL_DIR,
)

View File

@ -59,7 +59,7 @@ from config import (
UPLOAD_DIR,
DOCS_DIR,
RAG_EMBEDDING_MODEL,
RAG_EMBEDDING_MODEL_DEVICE_TYPE,
DEVICE_TYPE,
CHROMA_CLIENT,
CHUNK_SIZE,
CHUNK_OVERLAP,
@ -71,15 +71,6 @@ from constants import ERROR_MESSAGES
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
#
# if RAG_EMBEDDING_MODEL:
# sentence_transformer_ef = SentenceTransformer(
# model_name_or_path=RAG_EMBEDDING_MODEL,
# cache_folder=RAG_EMBEDDING_MODEL_DIR,
# device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
# )
app = FastAPI()
app.state.PDF_EXTRACT_IMAGES = False
@ -92,7 +83,7 @@ app.state.TOP_K = 4
app.state.sentence_transformer_ef = (
embedding_functions.SentenceTransformerEmbeddingFunction(
model_name=app.state.RAG_EMBEDDING_MODEL,
device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
device=DEVICE_TYPE,
)
)
@ -147,10 +138,9 @@ async def update_embedding_model(
app.state.sentence_transformer_ef = (
embedding_functions.SentenceTransformerEmbeddingFunction(
model_name=app.state.RAG_EMBEDDING_MODEL,
device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
device=DEVICE_TYPE,
)
)
return {
"status": True,
"embedding_model": app.state.RAG_EMBEDDING_MODEL,

View File

@ -253,6 +253,8 @@ OLLAMA_API_BASE_URL = os.environ.get(
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "")
K8S_FLAG = os.environ.get("K8S_FLAG", "")
USE_OLLAMA_DOCKER = os.environ.get("USE_OLLAMA_DOCKER", "false")
if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "":
OLLAMA_BASE_URL = (
@ -263,7 +265,12 @@ if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "":
if ENV == "prod":
if OLLAMA_BASE_URL == "/ollama":
OLLAMA_BASE_URL = "http://host.docker.internal:11434"
if USE_OLLAMA_DOCKER.lower() == "true":
# if you use all-in-one docker container (Open WebUI + Ollama)
# with the docker build arg USE_OLLAMA=true (--build-arg="USE_OLLAMA=true") this only works with http://localhost:11434
OLLAMA_BASE_URL = "http://localhost:11434"
else:
OLLAMA_BASE_URL = "http://host.docker.internal:11434"
elif K8S_FLAG:
OLLAMA_BASE_URL = "http://ollama-service.open-webui.svc.cluster.local:11434"
@ -384,10 +391,16 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "":
CHROMA_DATA_PATH = f"{DATA_DIR}/vector_db"
# this uses the model defined in the Dockerfile ENV variable. If you dont use docker or docker based deployments such as k8s, the default embedding model will be used (all-MiniLM-L6-v2)
RAG_EMBEDDING_MODEL = os.environ.get("RAG_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
log.info(f"Embedding model set: {RAG_EMBEDDING_MODEL}"),
# device type ebbeding models - "cpu" (default), "cuda" (nvidia gpu required) or "mps" (apple silicon) - choosing this right can lead to better performance
RAG_EMBEDDING_MODEL_DEVICE_TYPE = os.environ.get(
"RAG_EMBEDDING_MODEL_DEVICE_TYPE", "cpu"
)
USE_CUDA = os.environ.get("USE_CUDA_DOCKER", "false")
if USE_CUDA.lower() == "true":
DEVICE_TYPE = "cuda"
else:
DEVICE_TYPE = "cpu"
CHROMA_CLIENT = chromadb.PersistentClient(
path=CHROMA_DATA_PATH,
settings=Settings(allow_reset=True, anonymized_telemetry=False),

View File

@ -7,16 +7,26 @@ KEY_FILE=.webui_secret_key
PORT="${PORT:-8080}"
if test "$WEBUI_SECRET_KEY $WEBUI_JWT_SECRET_KEY" = " "; then
echo No WEBUI_SECRET_KEY provided
echo "No WEBUI_SECRET_KEY provided"
if ! [ -e "$KEY_FILE" ]; then
echo Generating WEBUI_SECRET_KEY
echo "Generating WEBUI_SECRET_KEY"
# Generate a random value to use as a WEBUI_SECRET_KEY in case the user didn't provide one.
echo $(head -c 12 /dev/random | base64) > $KEY_FILE
echo $(head -c 12 /dev/random | base64) > "$KEY_FILE"
fi
echo Loading WEBUI_SECRET_KEY from $KEY_FILE
WEBUI_SECRET_KEY=`cat $KEY_FILE`
echo "Loading WEBUI_SECRET_KEY from $KEY_FILE"
WEBUI_SECRET_KEY=$(cat "$KEY_FILE")
fi
WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*'
if [ "$USE_OLLAMA_DOCKER" = "true" ]; then
echo "USE_OLLAMA is set to true, starting ollama serve."
ollama serve &
fi
if [ "$USE_CUDA_DOCKER" = "true" ]; then
echo "CUDA is enabled, appending LD_LIBRARY_PATH to include torch/cudnn & cublas libraries."
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.11/site-packages/torch/lib:/usr/local/lib/python3.11/site-packages/nvidia/cublas/lib"
fi
WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*'