diff --git a/.github/workflows/docker-build.yaml b/.github/workflows/docker-build.yaml
index ebc4cc42b..44c9c654b 100644
--- a/.github/workflows/docker-build.yaml
+++ b/.github/workflows/docker-build.yaml
@@ -1,5 +1,4 @@
-#
-name: Create and publish a Docker image
+name: Create and publish Docker images with specific build args
 
 # Configures this workflow to run every time a change is pushed to the branch called `release`.
 on:
@@ -24,7 +23,7 @@ jobs:
     permissions:
       contents: read
       packages: write
-      #
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -42,8 +41,8 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Extract metadata for Docker images
-        id: meta
+      - name: Extract metadata for Docker images (default latest tag)
+        id: meta-latest
         uses: docker/metadata-action@v5
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
@@ -54,14 +53,31 @@ jobs:
             type=sha,prefix=git-
             type=semver,pattern={{version}}
             type=semver,pattern={{major}}.{{minor}}
-          flavor: |
-            latest=${{ github.ref == 'refs/heads/main' }}
+            latest=true
 
-      - name: Build and push Docker image
+      - name: Build and push Docker image (latest)
         uses: docker/build-push-action@v5
         with:
           context: .
           push: true
           platforms: linux/amd64,linux/arm64
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta-latest.outputs.tags }}
+          labels: ${{ steps.meta-latest.outputs.labels }}
+
+      - name: Build and push Docker image with CUDA
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cuda
+          build-args: USE_CUDA=true
+
+      - name: Build and push Docker image with Ollama
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:ollama
+          build-args: USE_OLLAMA=true
diff --git a/Dockerfile b/Dockerfile
index f76f8c32e..e18915222 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,82 +1,111 @@
 # syntax=docker/dockerfile:1
+# Initialize device type args
+# use build args in the docker build commmand with --build-arg="BUILDARG=true"
+ARG USE_CUDA=false
+ARG USE_OLLAMA=false
+# Tested with cu117 for CUDA 11 and cu121 for CUDA 12 (default)
+ARG USE_CUDA_VER=cu121
+# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers
+# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard 
+# for better performance and multilangauge support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB)
+# IMPORTANT: If you change the default model (all-MiniLM-L6-v2) and vice versa, you aren't able to use RAG Chat with your previous documents loaded in the WebUI! You need to re-embed them.
+ARG USE_EMBEDDING_MODEL=all-MiniLM-L6-v2
 
-FROM node:alpine as build
+######## WebUI frontend ########
+FROM node:21-alpine3.19 as build
 
 WORKDIR /app
 
-# wget embedding model weight from alpine (does not exist from slim-buster)
-RUN wget "https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz" -O - | \
-    tar -xzf - -C /app
-
 COPY package.json package-lock.json ./
 RUN npm ci
 
 COPY . .
 RUN npm run build
 
-
+######## WebUI backend ########
 FROM python:3.11-slim-bookworm as base
 
-ENV ENV=prod
-ENV PORT ""
+# Use args
+ARG USE_CUDA
+ARG USE_OLLAMA
+ARG USE_CUDA_VER
+ARG USE_EMBEDDING_MODEL
 
-ENV OLLAMA_BASE_URL "/ollama"
+## Basis ##
+ENV ENV=prod \
+    PORT=8080 \
+    # pass build args to the build
+    USE_OLLAMA_DOCKER=${USE_OLLAMA} \
+    USE_CUDA_DOCKER=${USE_CUDA} \
+    USE_CUDA_DOCKER_VER=${USE_CUDA_VER} \
+    USE_EMBEDDING_MODEL_DOCKER=${USE_EMBEDDING_MODEL}
 
-ENV OPENAI_API_BASE_URL ""
-ENV OPENAI_API_KEY ""
+## Basis URL Config ##
+ENV OLLAMA_BASE_URL="/ollama" \
+    OPENAI_API_BASE_URL=""
 
-ENV WEBUI_SECRET_KEY ""
-ENV WEBUI_AUTH_TRUSTED_EMAIL_HEADER ""
+## API Key and Security Config ##
+ENV OPENAI_API_KEY="" \
+    WEBUI_SECRET_KEY="" \
+    SCARF_NO_ANALYTICS=true \
+    DO_NOT_TRACK=true
 
-ENV SCARF_NO_ANALYTICS true
-ENV DO_NOT_TRACK true
+#### Other models #########################################################
+## whisper TTS model settings ##
+ENV WHISPER_MODEL="base" \
+    WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models"
 
-# Use locally bundled version of the LiteLLM cost map json
-# to avoid repetitive startup connections
-ENV LITELLM_LOCAL_MODEL_COST_MAP="True"
-
-######## Preloaded models ########
-# whisper TTS Settings
-ENV WHISPER_MODEL="base"
-ENV WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models"
-
-# RAG Embedding Model Settings
-# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers
-# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard 
-# for better persormance and multilangauge support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB)
-# IMPORTANT: If you change the default model (all-MiniLM-L6-v2) and vice versa, you aren't able to use RAG Chat with your previous documents loaded in the WebUI! You need to re-embed them.
-ENV RAG_EMBEDDING_MODEL="all-MiniLM-L6-v2"
-# device type for whisper tts and embbeding models - "cpu" (default), "cuda" (nvidia gpu and CUDA required) or "mps" (apple silicon) - choosing this right can lead to better performance
-ENV RAG_EMBEDDING_MODEL_DEVICE_TYPE="cpu"
-ENV RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models"
-ENV SENTENCE_TRANSFORMERS_HOME $RAG_EMBEDDING_MODEL_DIR
-
-######## Preloaded models ########
+## RAG Embedding model settings ##
+ENV RAG_EMBEDDING_MODEL="$USE_EMBEDDING_MODEL_DOCKER" \
+    RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models" \
+    SENTENCE_TRANSFORMERS_HOME="/app/backend/data/cache/embedding/models"
+#### Other models ##########################################################
 
 WORKDIR /app/backend
-
 # install python dependencies
 COPY ./backend/requirements.txt ./requirements.txt
 
-RUN apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
+RUN if [ "$USE_CUDA" = "true" ]; then \
+        # If you use CUDA the whisper and embedding modell will be downloaded on first use
+        pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/$USE_CUDA_DOCKER_VER --no-cache-dir && \
+        pip3 install -r requirements.txt --no-cache-dir && \
+        python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])" && \
+        python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device='cpu')"; \
+    else \
+        pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir && \
+        pip3 install -r requirements.txt --no-cache-dir && \
+        python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])" && \
+        python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device='cpu')"; \
+    fi
 
-RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
-RUN pip3 install -r requirements.txt --no-cache-dir
 
-# Install pandoc and netcat
-# RUN python -c "import pypandoc; pypandoc.download_pandoc()"
-RUN apt-get update \
-    && apt-get install -y pandoc netcat-openbsd \
-    && rm -rf /var/lib/apt/lists/*
+RUN if [ "$USE_OLLAMA" = "true" ]; then \
+        apt-get update && \
+        # Install pandoc and netcat
+        apt-get install -y --no-install-recommends pandoc netcat-openbsd && \
+        # for RAG OCR
+        apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \
+        # install helper tools
+        apt-get install -y --no-install-recommends curl && \
+        # install ollama
+        curl -fsSL https://ollama.com/install.sh | sh && \
+        # cleanup
+        rm -rf /var/lib/apt/lists/*; \
+    else \
+        apt-get update && \
+        # Install pandoc and netcat
+        apt-get install -y --no-install-recommends pandoc netcat-openbsd && \
+        # for RAG OCR
+        apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \
+        # cleanup
+        rm -rf /var/lib/apt/lists/*; \
+    fi
+
 
-# preload embedding model
-RUN python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device=os.environ['RAG_EMBEDDING_MODEL_DEVICE_TYPE'])"
-# preload tts model
-RUN python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='auto', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])"
 
 # copy embedding weight from build
-RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
-COPY --from=build /app/onnx /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx
+# RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
+# COPY --from=build /app/onnx /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx
 
 # copy built frontend files
 COPY --from=build /app/build /app/build
@@ -86,4 +115,6 @@ COPY --from=build /app/package.json /app/package.json
 # copy backend files
 COPY ./backend .
 
-CMD [ "bash", "start.sh"]
+EXPOSE 8080
+
+CMD [ "bash", "start.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index e2ee284e3..3c0093e71 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,65 @@ Don't forget to explore our sibling project, [Open WebUI Community](https://open
 
 - After installation, you can access Open WebUI at [http://localhost:3000](http://localhost:3000). Enjoy! 😄
 
+- **If you want to customize your build with additional args**, use this commands:
+
+  > [!NOTE]  
+  > If you only want to use Open WebUI with Ollama included or CUDA acelleration it's recomented to use our official images with the tags :cuda or :with-ollama
+  > If you want a combination of both or more customisation options like a different embedding model and/or CUDA version you need to build the image yourself following the instructions below.
+
+  **For the build:**
+
+  ```bash
+  docker build -t open-webui
+  ```
+
+  Optional build ARGS (use them in the docker build command below if needed):
+
+      e.g.
+
+  ```bash
+  --build-arg="USE_EMBEDDING_MODEL=intfloat/multilingual-e5-large"
+  ```
+
+  For "intfloat/multilingual-e5-large" custom embedding model (default is all-MiniLM-L6-v2), only works with [sentence transforer models](https://huggingface.co/models?library=sentence-transformers). Current [Leaderbord](https://huggingface.co/spaces/mteb/leaderboard) of embedding models.
+
+  ```bash
+  --build-arg="USE_OLLAMA=true"
+  ```
+
+  For including ollama in the image.
+
+  ```bash
+  --build-arg="USE_CUDA=true"
+  ```
+
+  To use CUDA exeleration for the embedding and whisper models.
+
+  > [!NOTE]
+  > You need to install the [Nvidia CUDA container toolkit](https://docs.nvidia.com/dgx/nvidia-container-runtime-upgrade/) on your machine to be able to set CUDA as the Docker engine. Only works with Linux - use WSL for Windows!
+
+  ```bash
+  --build-arg="USE_CUDA_VER=cu117"
+  ```
+
+  For CUDA 11 (default is CUDA 12)
+
+  **To run the image:**
+
+  - **If you DID NOT use the USE_CUDA=true build ARG**, use this command:
+
+  ```bash
+    docker run -d -p 3000:8080 -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main
+  ```
+
+  - **If you DID use the USE_CUDA=true build ARG**, use this command:
+
+  ```bash
+    docker run --gpus all -d -p 3000:8080 -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main
+  ```
+
+  - After installation, you can access Open WebUI at [http://localhost:3000](http://localhost:3000). Enjoy! 😄
+
 #### Open WebUI: Server Connection Error
 
 If you're experiencing connection issues, it’s often due to the WebUI docker container not being able to reach the Ollama server at 127.0.0.1:11434 (host.docker.internal:11434) inside the container . Use the `--network=host` flag in your docker command to resolve this. Note that the port changes from 3000 to 8080, resulting in the link: `http://localhost:8080`.
diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py
index bb3cd0536..02d1f5e8e 100644
--- a/backend/apps/audio/main.py
+++ b/backend/apps/audio/main.py
@@ -28,6 +28,7 @@ from config import (
     UPLOAD_DIR,
     WHISPER_MODEL,
     WHISPER_MODEL_DIR,
+    DEVICE_TYPE,
 )
 
 log = logging.getLogger(__name__)
@@ -42,6 +43,10 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+# setting device type for whisper model
+whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
+log.info(f"whisper_device_type: {whisper_device_type}")
+
 
 @app.post("/transcribe")
 def transcribe(
@@ -66,7 +71,7 @@ def transcribe(
 
         model = WhisperModel(
             WHISPER_MODEL,
-            device="auto",
+            device=whisper_device_type,
             compute_type="int8",
             download_root=WHISPER_MODEL_DIR,
         )
diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 671429bb5..08639866f 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -59,7 +59,7 @@ from config import (
     UPLOAD_DIR,
     DOCS_DIR,
     RAG_EMBEDDING_MODEL,
-    RAG_EMBEDDING_MODEL_DEVICE_TYPE,
+    DEVICE_TYPE,
     CHROMA_CLIENT,
     CHUNK_SIZE,
     CHUNK_OVERLAP,
@@ -71,15 +71,6 @@ from constants import ERROR_MESSAGES
 log = logging.getLogger(__name__)
 log.setLevel(SRC_LOG_LEVELS["RAG"])
 
-#
-# if RAG_EMBEDDING_MODEL:
-#    sentence_transformer_ef = SentenceTransformer(
-#        model_name_or_path=RAG_EMBEDDING_MODEL,
-#        cache_folder=RAG_EMBEDDING_MODEL_DIR,
-#        device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
-#    )
-
-
 app = FastAPI()
 
 app.state.PDF_EXTRACT_IMAGES = False
@@ -92,7 +83,7 @@ app.state.TOP_K = 4
 app.state.sentence_transformer_ef = (
     embedding_functions.SentenceTransformerEmbeddingFunction(
         model_name=app.state.RAG_EMBEDDING_MODEL,
-        device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
+        device=DEVICE_TYPE,
     )
 )
 
@@ -147,10 +138,9 @@ async def update_embedding_model(
     app.state.sentence_transformer_ef = (
         embedding_functions.SentenceTransformerEmbeddingFunction(
             model_name=app.state.RAG_EMBEDDING_MODEL,
-            device=RAG_EMBEDDING_MODEL_DEVICE_TYPE,
+            device=DEVICE_TYPE,
         )
     )
-
     return {
         "status": True,
         "embedding_model": app.state.RAG_EMBEDDING_MODEL,
diff --git a/backend/config.py b/backend/config.py
index 39411d255..402a4183e 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -257,6 +257,7 @@ OLLAMA_API_BASE_URL = os.environ.get(
 
 OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "")
 K8S_FLAG = os.environ.get("K8S_FLAG", "")
+USE_OLLAMA_DOCKER = os.environ.get("USE_OLLAMA_DOCKER", "false")
 
 if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "":
     OLLAMA_BASE_URL = (
@@ -266,9 +267,13 @@ if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "":
     )
 
 if ENV == "prod":
-    if OLLAMA_BASE_URL == "/ollama":
-        OLLAMA_BASE_URL = "http://host.docker.internal:11434"
-
+    if OLLAMA_BASE_URL == "/ollama" and not K8S_FLAG:
+        if USE_OLLAMA_DOCKER.lower() == "true":
+            # if you use all-in-one docker container (Open WebUI + Ollama)
+            # with the docker build arg USE_OLLAMA=true (--build-arg="USE_OLLAMA=true") this only works with http://localhost:11434
+            OLLAMA_BASE_URL = "http://localhost:11434"
+        else:
+            OLLAMA_BASE_URL = "http://host.docker.internal:11434"
     elif K8S_FLAG:
         OLLAMA_BASE_URL = "http://ollama-service.open-webui.svc.cluster.local:11434"
 
@@ -391,10 +396,16 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "":
 CHROMA_DATA_PATH = f"{DATA_DIR}/vector_db"
 # this uses the model defined in the Dockerfile ENV variable. If you dont use docker or docker based deployments such as k8s, the default embedding model will be used (all-MiniLM-L6-v2)
 RAG_EMBEDDING_MODEL = os.environ.get("RAG_EMBEDDING_MODEL", "all-MiniLM-L6-v2")
+log.info(f"Embedding model set: {RAG_EMBEDDING_MODEL}"),
 # device type ebbeding models - "cpu" (default), "cuda" (nvidia gpu required) or "mps" (apple silicon) - choosing this right can lead to better performance
-RAG_EMBEDDING_MODEL_DEVICE_TYPE = os.environ.get(
-    "RAG_EMBEDDING_MODEL_DEVICE_TYPE", "cpu"
-)
+USE_CUDA = os.environ.get("USE_CUDA_DOCKER", "false")
+
+if USE_CUDA.lower() == "true":
+    DEVICE_TYPE = "cuda"
+else:
+    DEVICE_TYPE = "cpu"
+
+
 CHROMA_CLIENT = chromadb.PersistentClient(
     path=CHROMA_DATA_PATH,
     settings=Settings(allow_reset=True, anonymized_telemetry=False),
diff --git a/backend/start.sh b/backend/start.sh
index f9ed59485..06adf1ff8 100755
--- a/backend/start.sh
+++ b/backend/start.sh
@@ -7,16 +7,26 @@ KEY_FILE=.webui_secret_key
 
 PORT="${PORT:-8080}"
 if test "$WEBUI_SECRET_KEY $WEBUI_JWT_SECRET_KEY" = " "; then
-  echo No WEBUI_SECRET_KEY provided
+  echo "No WEBUI_SECRET_KEY provided"
 
   if ! [ -e "$KEY_FILE" ]; then
-    echo Generating WEBUI_SECRET_KEY
+    echo "Generating WEBUI_SECRET_KEY"
     # Generate a random value to use as a WEBUI_SECRET_KEY in case the user didn't provide one.
-    echo $(head -c 12 /dev/random | base64) > $KEY_FILE
+    echo $(head -c 12 /dev/random | base64) > "$KEY_FILE"
   fi
 
-  echo Loading WEBUI_SECRET_KEY from $KEY_FILE
-  WEBUI_SECRET_KEY=`cat $KEY_FILE`
+  echo "Loading WEBUI_SECRET_KEY from $KEY_FILE"
+  WEBUI_SECRET_KEY=$(cat "$KEY_FILE")
 fi
 
-WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*'
\ No newline at end of file
+if [ "$USE_OLLAMA_DOCKER" = "true" ]; then
+    echo "USE_OLLAMA is set to true, starting ollama serve."
+    ollama serve &
+fi
+
+if [ "$USE_CUDA_DOCKER" = "true" ]; then
+  echo "CUDA is enabled, appending LD_LIBRARY_PATH to include torch/cudnn & cublas libraries."
+  export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.11/site-packages/torch/lib:/usr/local/lib/python3.11/site-packages/nvidia/cudnn/lib"
+fi
+
+WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*'