diff --git a/backend/open_webui/models/feedbacks.py b/backend/open_webui/models/feedbacks.py index 6b1231e45..1e807ac8d 100644 --- a/backend/open_webui/models/feedbacks.py +++ b/backend/open_webui/models/feedbacks.py @@ -68,6 +68,13 @@ class FeedbackIdResponse(BaseModel): updated_at: int +class LeaderboardFeedbackData(BaseModel): + """Minimal feedback data for leaderboard computation (excludes snapshot/meta).""" + + id: str + data: Optional[dict] = None + + class RatingData(BaseModel): rating: Optional[str | int] = None model_id: Optional[str] = None @@ -271,6 +278,16 @@ class FeedbackTable: .all() ] + def get_feedbacks_for_leaderboard( + self, db: Optional[Session] = None + ) -> list[LeaderboardFeedbackData]: + """Fetch only id and data for leaderboard computation (excludes snapshot/meta).""" + with get_db_context(db) as db: + return [ + LeaderboardFeedbackData(id=row.id, data=row.data) + for row in db.query(Feedback.id, Feedback.data).all() + ] + def get_feedbacks_by_type( self, type: str, db: Optional[Session] = None ) -> list[FeedbackModel]: diff --git a/backend/open_webui/routers/evaluations.py b/backend/open_webui/routers/evaluations.py index 1bc6f1421..86fe02224 100644 --- a/backend/open_webui/routers/evaluations.py +++ b/backend/open_webui/routers/evaluations.py @@ -1,5 +1,7 @@ from typing import Optional +import logging from fastapi import APIRouter, Depends, HTTPException, status, Request +from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel from open_webui.models.users import Users, UserModel @@ -10,6 +12,7 @@ from open_webui.models.feedbacks import ( FeedbackForm, FeedbackUserResponse, FeedbackListResponse, + LeaderboardFeedbackData, Feedbacks, ) @@ -18,9 +21,239 @@ from open_webui.utils.auth import get_admin_user, get_verified_user from open_webui.internal.db import get_session from sqlalchemy.orm import Session +log = logging.getLogger(__name__) + + router = APIRouter() +# Leaderboard Elo Rating Computation +# +# How it 
# works:
# 1. Each model starts with a rating of 1000
# 2. When a user picks a winner between two models, ratings are adjusted:
#    - Winner gains points, loser loses points
#    - The amount depends on expected outcome (upset = bigger change)
# 3. The Elo formula: new_rating = old_rating + K * (actual - expected)
#    - K=32 controls how much ratings can change per match
#    - expected = probability of winning based on current ratings
#
# Query-based re-ranking (optional):
# When a user searches for a topic (e.g., "coding"), we want to show
# which models perform best FOR THAT TOPIC. We do this by:
# 1. Computing semantic similarity between the query and each feedback's tags
# 2. Using that similarity as a weight in the Elo calculation
# 3. Feedbacks about "coding" contribute more to the final ranking
# 4. Feedbacks about unrelated topics (e.g., "cooking") contribute less
# This gives topic-specific leaderboards without needing separate data.

EMBEDDING_MODEL_NAME = "TaylorAI/bge-micro-v2"
_embedding_model = None


def _get_embedding_model():
    """
    Return the lazily loaded sentence-transformers model, or None.

    The model is created on first use and cached in the module-level
    `_embedding_model`. If the import or the model construction fails, the
    error is logged and None is returned; the next call tries again.
    """
    global _embedding_model
    if _embedding_model is None:
        try:
            from sentence_transformers import SentenceTransformer

            _embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        except Exception as e:
            log.error(f"Embedding model load failed: {e}")
    return _embedding_model


def _feedback_tags(feedback) -> list:
    """
    Return the string tags stored on a feedback, or [] when there are none.

    Tolerates `data` being None, the "tags" key being absent or explicitly
    None, and a "tags" value that is not a list/tuple/set. Non-string entries
    are dropped because tags are used as dict keys and as embedding input.
    """
    data = feedback.data or {}
    tags = data.get("tags")
    if not isinstance(tags, (list, tuple, set)):
        return []
    return [tag for tag in tags if isinstance(tag, str)]


def _calculate_elo(
    feedbacks: list[LeaderboardFeedbackData], similarities: Optional[dict] = None
) -> dict:
    """
    Calculate Elo ratings for models based on user feedback.

    Each feedback represents a comparison where a user rated one model
    (data["model_id"]) against its opponents (data["sibling_model_ids"]).
    Rating 1 means the rated model won, rating -1 means it lost; feedbacks
    with any other rating or without a model id are ignored.

    Args:
        feedbacks: objects exposing `.id` and `.data` (dict or None).
        similarities: optional {feedback_id: weight}. When given and
            non-empty, each feedback's rating change is scaled by its weight
            (missing ids default to 1.0). Negative weights are treated as 0
            so that an unrelated feedback can never invert a result.

    Returns:
        {model_id: {"rating": float, "won": int, "lost": int}}
    """
    K_FACTOR = 32  # Standard Elo K-factor for rating volatility
    model_stats = {}

    def get_or_create_stats(model_id):
        if model_id not in model_stats:
            model_stats[model_id] = {"rating": 1000.0, "won": 0, "lost": 0}
        return model_stats[model_id]

    for feedback in feedbacks:
        data = feedback.data or {}
        rated_id = data.get("model_id")
        rating_value = str(data.get("rating", ""))
        if not rated_id or rating_value not in ("1", "-1"):
            continue

        won = rating_value == "1"
        weight = similarities.get(feedback.id, 1.0) if similarities else 1.0
        # A negative weight would flip the sign of the update (winner loses
        # points), so clamp it at zero.
        weight = max(0.0, weight)

        for opponent_id in data.get("sibling_model_ids") or []:
            # A model cannot play against itself; counting it would add both
            # a win and a loss to the same entry.
            if not opponent_id or opponent_id == rated_id:
                continue

            rated = get_or_create_stats(rated_id)
            opponent = get_or_create_stats(opponent_id)
            expected = 1 / (1 + 10 ** ((opponent["rating"] - rated["rating"]) / 400))

            # Zero-sum update: whatever the rated model gains, the opponent loses.
            delta = K_FACTOR * ((1 if won else 0) - expected) * weight
            rated["rating"] += delta
            opponent["rating"] -= delta

            if won:
                rated["won"] += 1
                opponent["lost"] += 1
            else:
                rated["lost"] += 1
                opponent["won"] += 1

    return model_stats


def _get_top_tags(feedbacks: list[LeaderboardFeedbackData], limit: int = 5) -> dict:
    """
    Count tag occurrences per model and return the most frequent ones.

    Tags are read from data["tags"] of every feedback that has a
    data["model_id"]. Feedbacks whose tags are missing or None are skipped,
    and models without any tag do not appear in the result. Tags with equal
    counts keep the order in which they were first seen.

    Returns: {model_id: [{"tag": str, "count": int}, ...]}
    """
    from collections import Counter, defaultdict

    tag_counts = defaultdict(Counter)

    for feedback in feedbacks:
        model_id = (feedback.data or {}).get("model_id")
        if not model_id:
            continue
        tags = _feedback_tags(feedback)
        if tags:  # do not create empty entries for untagged models
            tag_counts[model_id].update(tags)

    return {
        model_id: [
            {"tag": tag, "count": count} for tag, count in counts.most_common(limit)
        ]
        for model_id, counts in tag_counts.items()
    }


def _compute_similarities(feedbacks: list[LeaderboardFeedbackData], query: str) -> dict:
    """
    Compute how relevant each feedback is to a search query.

    Every distinct tag is embedded once, compared with the query embedding
    using cosine similarity, and each feedback receives the best score among
    its tags. Scores are clamped to [0, 1] because they are used as Elo
    weights; feedbacks without tags score 0.

    Returns:
        {feedback_id: similarity_score (0-1)}, or {} when the embedding model
        is unavailable, no feedback has tags, or encoding fails.
    """
    import numpy as np

    embedding_model = _get_embedding_model()
    if embedding_model is None:
        return {}

    tags_by_feedback = {
        feedback.id: _feedback_tags(feedback) for feedback in feedbacks
    }
    # Sorted so the encode order does not depend on set iteration order.
    all_tags = sorted({tag for tags in tags_by_feedback.values() for tag in tags})
    if not all_tags:
        return {}

    try:
        tag_embeddings = np.asarray(embedding_model.encode(all_tags), dtype=float)
        query_embedding = np.asarray(embedding_model.encode([query])[0], dtype=float)
    except Exception as e:
        log.error(f"Embedding error: {e}")
        return {}

    # Vectorized cosine similarity; the epsilon guards against zero vectors.
    tag_norms = np.linalg.norm(tag_embeddings, axis=1)
    query_norm = np.linalg.norm(query_embedding)
    similarities = np.dot(tag_embeddings, query_embedding) / (
        tag_norms * query_norm + 1e-9
    )
    # Cosine similarity ranges over [-1, 1]; negative values must not become
    # negative Elo weights.
    similarities = np.clip(similarities, 0.0, 1.0)
    tag_similarity_map = dict(zip(all_tags, similarities.tolist()))

    return {
        feedback_id: max((tag_similarity_map[tag] for tag in tags), default=0.0)
        for feedback_id, tags in tags_by_feedback.items()
    }
model_id: str + rating: int + won: int + lost: int + count: int + top_tags: list[dict] + + +class LeaderboardResponse(BaseModel): + entries: list[LeaderboardEntry] + + +@router.get("/leaderboard", response_model=LeaderboardResponse) +async def get_leaderboard( + query: Optional[str] = None, + user=Depends(get_admin_user), + db: Session = Depends(get_session), +): + """Get model leaderboard with Elo ratings. Query filters by tag similarity.""" + feedbacks = Feedbacks.get_feedbacks_for_leaderboard(db=db) + + similarities = None + if query and query.strip(): + similarities = await run_in_threadpool( + _compute_similarities, feedbacks, query.strip() + ) + + elo_stats = _calculate_elo(feedbacks, similarities) + tags_by_model = _get_top_tags(feedbacks) + + entries = sorted( + [ + LeaderboardEntry( + model_id=mid, + rating=round(s["rating"]), + won=s["won"], + lost=s["lost"], + count=s["won"] + s["lost"], + top_tags=tags_by_model.get(mid, []), + ) + for mid, s in elo_stats.items() + ], + key=lambda e: e.rating, + reverse=True, + ) + + return LeaderboardResponse(entries=entries) + + ############################ # GetConfig ############################ diff --git a/src/lib/apis/evaluations/index.ts b/src/lib/apis/evaluations/index.ts index 1f48c7bfb..315315fb0 100644 --- a/src/lib/apis/evaluations/index.ts +++ b/src/lib/apis/evaluations/index.ts @@ -93,6 +93,40 @@ export const getAllFeedbacks = async (token: string = '') => { return res; }; +export const getLeaderboard = async (token: string = '', query: string = '') => { + let error = null; + + const searchParams = new URLSearchParams(); + if (query) searchParams.append('query', query); + + const res = await fetch( + `${WEBUI_API_BASE_URL}/evaluations/leaderboard?${searchParams.toString()}`, + { + method: 'GET', + headers: { + Accept: 'application/json', + 'Content-Type': 'application/json', + authorization: `Bearer ${token}` + } + } + ) + .then(async (res) => { + if (!res.ok) throw await res.json(); + return res.json(); + 
}) + .catch((err) => { + error = err.detail; + console.error(err); + return null; + }); + + if (error) { + throw error; + } + + return res; +}; + export const getFeedbackItems = async (token: string = '', orderBy, direction, page) => { let error = null; diff --git a/src/lib/components/admin/Evaluations.svelte b/src/lib/components/admin/Evaluations.svelte index e3984af11..89b7664d8 100644 --- a/src/lib/components/admin/Evaluations.svelte +++ b/src/lib/components/admin/Evaluations.svelte @@ -6,8 +6,7 @@ import Leaderboard from './Evaluations/Leaderboard.svelte'; import Feedbacks from './Evaluations/Feedbacks.svelte'; - import { getAllFeedbacks } from '$lib/apis/evaluations'; - + const i18n = getContext('i18n'); let selectedTab; @@ -30,12 +29,8 @@ }; let loaded = false; - let feedbacks = []; onMount(async () => { - // TODO: feedbacks elo rating calculation should be done in the backend; remove below line later - feedbacks = await getAllFeedbacks(localStorage.token); - loaded = true; const containerElement = document.getElementById('users-tabs-container'); @@ -117,7 +112,7 @@
{#if selectedTab === 'leaderboard'} - + {:else if selectedTab === 'feedback'} {/if} diff --git a/src/lib/components/admin/Evaluations/Leaderboard.svelte b/src/lib/components/admin/Evaluations/Leaderboard.svelte index 36d857722..0fdfd6f68 100644 --- a/src/lib/components/admin/Evaluations/Leaderboard.svelte +++ b/src/lib/components/admin/Evaluations/Leaderboard.svelte @@ -1,559 +1,198 @@ - + -
-
-
- {$i18n.t('Leaderboard')} -
- -
- {rankedModels.length} -
-
- -
- -
-
- -
- { - loadEmbeddingModel(); - }} - /> -
-
+
+
+ {$i18n.t('Leaderboard')} + {rankedModels.length}
+ +
+ + +
+
-
- {#if loadingLeaderboard} -
-
- -
+
+ {#if loading} +
+
{/if} - {#if (rankedModels ?? []).length === 0} -
- {$i18n.t('No models found')} -
- {:else} - + + {#if !rankedModels.length && !loading} +
{$i18n.t('No models found')}
+ {:else if rankedModels.length} +
- - - - - - + + {#each [ + { key: 'rating', label: 'RK', class: 'w-3' }, + { key: 'name', label: 'Model', class: '' }, + { key: 'rating', label: 'Rating', class: 'text-right w-fit' }, + { key: 'won', label: 'Won', class: 'text-right w-5' }, + { key: 'lost', label: 'Lost', class: 'text-right w-5' } + ] as col} + + {/each} - - {#each sortedModels as model, modelIdx (model.id)} + + {#each sortedModels as model, idx (model.id)} openLeaderboardModelModal(model)} + class="bg-white dark:bg-gray-900 text-xs group cursor-pointer hover:bg-gray-50 dark:hover:bg-gray-850/50 transition" + on:click={() => openModal(model)} > - - - - - - - {/each} @@ -562,15 +201,11 @@ {/if} -
-
+
+
- ⓘ {$i18n.t( - 'The evaluation leaderboard is based on the Elo rating system and is updated in real-time.' - )} + ⓘ {$i18n.t('The evaluation leaderboard is based on the Elo rating system and is updated in real-time.')}
- {$i18n.t( - 'The leaderboard is currently in beta, and we may adjust the rating calculations as we refine the algorithm.' - )} + {$i18n.t('The leaderboard is currently in beta, and we may adjust the rating calculations as we refine the algorithm.')}
diff --git a/src/lib/components/admin/Evaluations/LeaderboardModal.svelte b/src/lib/components/admin/Evaluations/LeaderboardModal.svelte index b3d784979..0b82f9a6f 100644 --- a/src/lib/components/admin/Evaluations/LeaderboardModal.svelte +++ b/src/lib/components/admin/Evaluations/LeaderboardModal.svelte @@ -3,7 +3,6 @@ import { getContext } from 'svelte'; export let show = false; export let model = null; - export let feedbacks = []; export let onClose: () => void = () => {}; const i18n = getContext('i18n'); import XMark from '$lib/components/icons/XMark.svelte'; @@ -13,22 +12,8 @@ onClose(); }; - $: topTags = model ? getTopTagsForModel(model.id, feedbacks) : []; - - const getTopTagsForModel = (modelId: string, feedbacks: any[], topN = 5) => { - const tagCounts = new Map(); - feedbacks - .filter((fb) => fb.data.model_id === modelId) - .forEach((fb) => { - (fb.data.tags || []).forEach((tag) => { - tagCounts.set(tag, (tagCounts.get(tag) || 0) + 1); - }); - }); - return Array.from(tagCounts.entries()) - .sort((a, b) => b[1] - a[1]) - .slice(0, topN) - .map(([tag, count]) => ({ tag, count })); - }; + // Use top_tags from backend response (already computed) + $: topTags = model?.top_tags ?? [];
setSortKey('rating')} - > -
- {$i18n.t('RK')} - {#if orderBy === 'rating'} - - {#if direction === 'asc'} - - {:else} - - {/if} - - {:else} - - {/if} -
-
setSortKey('name')} - > -
- {$i18n.t('Model')} - {#if orderBy === 'name'} - - {#if direction === 'asc'} - - {:else} - - {/if} - - {:else} - - {/if} -
-
setSortKey('rating')} - > -
- {$i18n.t('Rating')} - {#if orderBy === 'rating'} - - {#if direction === 'asc'} - - {:else} - - {/if} - - {:else} - - {/if} -
-
setSortKey('won')} - > -
- {$i18n.t('Won')} - {#if orderBy === 'won'} - - {#if direction === 'asc'} - - {:else} - - {/if} - - {:else} - - {/if} -
-
setSortKey('lost')} - > -
- {$i18n.t('Lost')} - {#if orderBy === 'lost'} - - {#if direction === 'asc'} - - {:else} - - {/if} - - {:else} - - {/if} -
-
toggleSort(col.key)} + > +
+ {$i18n.t(col.label)} + {#if orderBy === col.key} + {#if direction === 'asc'}{:else}{/if} + {:else} + + {/if} +
+
-
- {model?.rating !== '-' ? modelIdx + 1 : '-'} -
+
+ {model.rating !== '-' ? idx + 1 : '-'} +
-
- {model.name} -
- -
- {model.name} -
+ {model.name} + {model.name}
+ {model.rating} -
- {#if model.stats.won === '-'} - - - {:else} - - {model.stats.won} - {/if} -
+
+ {#if model.stats.won === '-'}-{:else} + + {model.stats.won} + {/if} -
- {#if model.stats.lost === '-'} - - - {:else} - - {model.stats.lost} - {/if} -
+
+ {#if model.stats.lost === '-'}-{:else} + + {model.stats.lost} + {/if}