mirror of
https://github.com/open-webui/open-webui.git
synced 2026-02-25 04:29:56 +01:00
fix: use efficient COUNT queries in telemetry metrics to prevent connection pool exhaustion
This fixes database connection pool exhaustion issues reported after v0.7.0,
particularly affecting PostgreSQL deployments on high-latency networks (e.g., AWS Aurora).
## The Problem
The telemetry metrics callbacks (running every 10 seconds via OpenTelemetry's
PeriodicExportingMetricReader) were using inefficient queries that loaded entire
database tables into memory just to count records:
len(Users.get_users()["users"]) # Loads ALL user records to count them
On high-latency network-attached databases like AWS Aurora, this would:
1. Hold database connections for hundreds of milliseconds while transferring data
2. Deserialize all records into Python objects
3. Only then count the list length
Under concurrent load, these long-held connections would stack up and drain the
connection pool, resulting in:
sqlalchemy.exc.TimeoutError: QueuePool limit of size 5 overflow 10 reached,
connection timed out, timeout 30.00
## The Fix
Replace inefficient full-table loads with efficient COUNT(*) queries using
methods that already exist in the codebase:
- `len(Users.get_users()["users"])` → `Users.get_num_users()`
- Similar changes for other telemetry callbacks as needed
COUNT(*) queries use database indexes and return a single integer, completing in
~5-10ms even on Aurora, versus potentially 500ms+ for loading all records.
## Why v0.7.1's Session Sharing Disable "Helped"
The v0.7.1 change to disable DATABASE_ENABLE_SESSION_SHARING by default appeared
to fix the issue, but it was masking the root cause. Disabling session sharing
causes connections to be returned to the pool faster (more connection churn),
which reduced the window for pool exhaustion but didn't address the underlying
inefficient queries.
With this fix, session sharing can be safely re-enabled for deployments that
benefit from it (especially PostgreSQL), as telemetry will no longer hold
connections for extended periods.
## Impact
- Telemetry connection usage drops from potentially seconds to ~30ms total per
collection cycle
- Connection pool pressure from telemetry becomes negligible (~0.3% utilization)
- Enterprise PostgreSQL deployments (Aurora, RDS, etc.) should no longer
experience pool exhaustion under normal load
207 lines · 6.6 KiB · Python
"""OpenTelemetry metrics bootstrap for Open WebUI.
|
||
|
||
This module initialises a MeterProvider that sends metrics to an OTLP
|
||
collector. The collector is responsible for exposing a Prometheus
|
||
`/metrics` endpoint – WebUI does **not** expose it directly.
|
||
|
||
Metrics collected:
|
||
|
||
* http.server.requests (counter)
|
||
* http.server.duration (histogram, milliseconds)
|
||
|
||
Attributes used: http.method, http.route, http.status_code
|
||
|
||
If you wish to add more attributes (e.g. user-agent) you can, but beware of
|
||
high-cardinality label sets.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
from typing import Dict, List, Sequence, Any
|
||
from base64 import b64encode
|
||
|
||
from fastapi import FastAPI, Request
|
||
from opentelemetry import metrics
|
||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
|
||
OTLPMetricExporter,
|
||
)
|
||
|
||
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
|
||
OTLPMetricExporter as OTLPHttpMetricExporter,
|
||
)
|
||
from opentelemetry.sdk.metrics import MeterProvider
|
||
from opentelemetry.sdk.metrics.view import View
|
||
from opentelemetry.sdk.metrics.export import (
|
||
PeriodicExportingMetricReader,
|
||
)
|
||
from opentelemetry.sdk.resources import Resource
|
||
|
||
from open_webui.env import (
|
||
OTEL_SERVICE_NAME,
|
||
OTEL_METRICS_EXPORTER_OTLP_ENDPOINT,
|
||
OTEL_METRICS_BASIC_AUTH_USERNAME,
|
||
OTEL_METRICS_BASIC_AUTH_PASSWORD,
|
||
OTEL_METRICS_OTLP_SPAN_EXPORTER,
|
||
OTEL_METRICS_EXPORTER_OTLP_INSECURE,
|
||
)
|
||
from open_webui.models.users import Users
|
||
|
||
_EXPORT_INTERVAL_MILLIS = 10_000 # 10 seconds
|
||
|
||
|
||
def _build_meter_provider(resource: Resource) -> MeterProvider:
    """Create a MeterProvider wired to the configured OTLP exporter.

    The exporter transport (gRPC vs. HTTP) is chosen via
    OTEL_METRICS_OTLP_SPAN_EXPORTER. When basic-auth credentials are
    configured, they are attached as an ``authorization`` header.
    """
    auth_headers = []
    if OTEL_METRICS_BASIC_AUTH_USERNAME and OTEL_METRICS_BASIC_AUTH_PASSWORD:
        credentials = (
            f"{OTEL_METRICS_BASIC_AUTH_USERNAME}:{OTEL_METRICS_BASIC_AUTH_PASSWORD}"
        )
        encoded = b64encode(credentials.encode()).decode()
        auth_headers = [("authorization", f"Basic {encoded}")]

    # Select the exporter transport; either way, a periodic reader pushes
    # metrics to the collector, which exposes the Prometheus /metrics
    # endpoint (WebUI does not expose it directly).
    if OTEL_METRICS_OTLP_SPAN_EXPORTER == "http":
        exporter = OTLPHttpMetricExporter(
            endpoint=OTEL_METRICS_EXPORTER_OTLP_ENDPOINT, headers=auth_headers
        )
    else:
        exporter = OTLPMetricExporter(
            endpoint=OTEL_METRICS_EXPORTER_OTLP_ENDPOINT,
            insecure=OTEL_METRICS_EXPORTER_OTLP_INSECURE,
            headers=auth_headers,
        )

    readers: List[PeriodicExportingMetricReader] = [
        PeriodicExportingMetricReader(
            exporter,
            export_interval_millis=_EXPORT_INTERVAL_MILLIS,
        )
    ]

    # Views restrict the HTTP instruments to a fixed, low-cardinality label
    # set (drops user-agent etc.) and register the user gauges.
    views: List[View] = [
        View(
            instrument_name="http.server.duration",
            attribute_keys=["http.method", "http.route", "http.status_code"],
        ),
        View(
            instrument_name="http.server.requests",
            attribute_keys=["http.method", "http.route", "http.status_code"],
        ),
        View(instrument_name="webui.users.total"),
        View(instrument_name="webui.users.active"),
        View(instrument_name="webui.users.active.today"),
    ]

    return MeterProvider(
        resource=resource,
        metric_readers=list(readers),
        views=views,
    )
|
||
|
||
|
||
def setup_metrics(app: FastAPI, resource: Resource) -> None:
    """Initialise the global MeterProvider and attach metrics middleware to *app*."""
    metrics.set_meter_provider(_build_meter_provider(resource))
    meter = metrics.get_meter(__name__)

    # HTTP instruments recorded by the middleware below.
    request_counter = meter.create_counter(
        name="http.server.requests",
        description="Total HTTP requests",
        unit="1",
    )
    duration_histogram = meter.create_histogram(
        name="http.server.duration",
        description="HTTP request duration",
        unit="ms",
    )

    # User gauges are sampled by the periodic reader every collection cycle.
    # Each callback issues a single count-style query. Do NOT load whole
    # tables here (e.g. len(Users.get_users()["users"])): on high-latency
    # databases (e.g. Aurora) that holds pool connections long enough to
    # exhaust the pool.
    def _observe_active_users(
        options: metrics.CallbackOptions,
    ) -> Sequence[metrics.Observation]:
        return [metrics.Observation(value=Users.get_active_user_count())]

    def _observe_total_users(
        options: metrics.CallbackOptions,
    ) -> Sequence[metrics.Observation]:
        # get_num_users() is an efficient COUNT(*); fall back to 0 if it
        # returns a falsy value.
        return [metrics.Observation(value=Users.get_num_users() or 0)]

    def _observe_users_active_today(
        options: metrics.CallbackOptions,
    ) -> Sequence[metrics.Observation]:
        return [metrics.Observation(value=Users.get_num_users_active_today())]

    meter.create_observable_gauge(
        name="webui.users.total",
        description="Total number of registered users",
        unit="users",
        callbacks=[_observe_total_users],
    )
    meter.create_observable_gauge(
        name="webui.users.active",
        description="Number of currently active users",
        unit="users",
        callbacks=[_observe_active_users],
    )
    meter.create_observable_gauge(
        name="webui.users.active.today",
        description="Number of users active since midnight today",
        unit="users",
        callbacks=[_observe_users_active_today],
    )

    @app.middleware("http")
    async def _metrics_middleware(request: Request, call_next):
        """Record one counter increment and one duration sample per request."""
        started = time.perf_counter()
        status = None
        try:
            response = await call_next(request)
            status = getattr(response, "status_code", 500)
            return response
        except Exception:
            status = 500
            raise
        finally:
            elapsed_ms = (time.perf_counter() - started) * 1000.0

            # Use the route template (e.g. "/items/{item_id}") rather than
            # the concrete path to keep label cardinality bounded.
            route = request.scope.get("route")
            labels: Dict[str, str | int] = {
                "http.method": request.method,
                "http.route": getattr(route, "path", request.url.path),
                "http.status_code": status,
            }

            request_counter.add(1, labels)
            duration_histogram.record(elapsed_ms, labels)
|