Version 0.4.0
All checks were successful
Run linters on applied template / Python 3.13 lint and build (push) Successful in 1m40s

Changes:
- put ObservabilityMiddleware before ExceptionHandlerMiddleware to avoid repetitive code
- add application startup time and last-metrics-update metrics, along with a CPU usage metric and thread count
- move host and port to new uvicorn section at config along with new reload and forwarded_allow_ips
- add request_id and remove trace_id/span_id generation if tracing is disabled
- move logging logic from utils to observability
- pass trace_id/span_id in HEX form
This commit is contained in:
2026-01-03 11:01:43 +03:00
parent b8acb017fd
commit 53f14a8624
26 changed files with 901 additions and 730 deletions

View File

@@ -0,0 +1,49 @@
"""Observability config is defined here."""
from dataclasses import dataclass, field
from typing import Literal
LoggingLevel = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@dataclass
class ExporterConfig:
    """OTLP log exporter settings."""

    endpoint: str  # OTLP gRPC endpoint logs are exported to
    level: LoggingLevel = "INFO"  # minimum level of exported log records
    tls_insecure: bool = False  # passed as `insecure` to the OTLP exporter
@dataclass
class FileLogger:
    """A single log file destination with its own minimum level."""

    filename: str  # path to the log file; parent directories are created on setup
    level: LoggingLevel
@dataclass
class LoggingConfig:
    """Logging configuration: root level plus optional stderr, file and OTLP exporter sinks."""

    root_logger_level: LoggingLevel = "INFO"
    stderr_level: LoggingLevel | None = None  # None disables the stderr handler
    exporter: ExporterConfig | None = None  # None disables the OTLP log exporter
    files: list[FileLogger] = field(default_factory=list)

    def __post_init__(self):
        # Config may be loaded from a plain mapping (e.g. parsed YAML/JSON), in
        # which case nested entries arrive as dicts. Coerce each element
        # individually instead of only checking the first one, and coerce the
        # exporter as well for consistency.
        self.files = [FileLogger(**f) if isinstance(f, dict) else f for f in self.files]
        if isinstance(self.exporter, dict):
            self.exporter = ExporterConfig(**self.exporter)
@dataclass
class PrometheusConfig:
    """Address the Prometheus metrics server listens on."""

    host: str
    port: int
@dataclass
class JaegerConfig:
    """Tracing exporter settings."""

    endpoint: str  # endpoint spans are exported to
@dataclass
class ObservabilityConfig:
    """Top-level observability settings: logging plus optional metrics and tracing."""

    logging: LoggingConfig
    prometheus: PrometheusConfig | None = None  # None disables the metrics server
    jaeger: JaegerConfig | None = None  # None disables tracing

    def __post_init__(self):
        # Coerce nested dicts (e.g. from parsed YAML/JSON) into their
        # dataclasses, consistent with LoggingConfig.__post_init__.
        if isinstance(self.logging, dict):
            self.logging = LoggingConfig(**self.logging)
        if isinstance(self.prometheus, dict):
            self.prometheus = PrometheusConfig(**self.prometheus)
        if isinstance(self.jaeger, dict):
            self.jaeger = JaegerConfig(**self.jaeger)

View File

@@ -0,0 +1,165 @@
"""Observability helper functions are defined here."""
import json
import logging
import sys
from pathlib import Path
from typing import Any
import structlog
from opentelemetry import trace
from opentelemetry._logs import set_logger_provider
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import (
LoggerProvider,
LoggingHandler,
LogRecordProcessor,
ReadWriteLogRecord,
)
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.util.types import Attributes
from {{project_slug}}.observability.otel_agent import get_resource
from .config import ExporterConfig, FileLogger, LoggingConfig, LoggingLevel
def configure_logging(
    config: LoggingConfig,
    tracing_enabled: bool,
) -> structlog.stdlib.BoundLogger:
    """Set up structlog and stdlib logging, then return the application logger.

    Builds the structlog processor chain (adding trace/span ids only when
    tracing is enabled) and attaches stderr / file / OTLP-exporter handlers
    to the root logger according to *config*.
    """
    chain: list = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
    ]
    if tracing_enabled:
        chain.append(_add_open_telemetry_spans)
    # The formatter wrapper must remain the last processor in the chain.
    chain.append(structlog.stdlib.ProcessorFormatter.wrap_for_formatter)

    structlog.configure(
        processors=chain,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(config.root_logger_level)
    if config.stderr_level is not None:
        _configure_stderr_logger(root_logger, config.stderr_level)
    if config.files:
        _configure_file_loggers(root_logger, config.files)
    if config.exporter is not None:
        _configure_otel_exporter(root_logger, config.exporter)

    logger: structlog.stdlib.BoundLogger = structlog.get_logger("{{project_name}}")
    logger.setLevel(_level_name_mapping[config.root_logger_level])
    return logger
# Map LoggingLevel literal names onto the stdlib numeric logging levels.
_level_name_mapping: dict[LoggingLevel, int] = {
    level_name: getattr(logging, level_name)
    for level_name in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
}
def _configure_stderr_logger(root_logger: logging.Logger, level: LoggingLevel) -> None:
    """Attach a colorized console-rendering handler writing to stderr."""
    handler = logging.StreamHandler(sys.stderr)
    renderer = structlog.dev.ConsoleRenderer(colors=True)
    handler.setFormatter(structlog.stdlib.ProcessorFormatter(processor=renderer))
    handler.setLevel(_level_name_mapping[level])
    root_logger.addHandler(handler)
def _configure_file_loggers(root_logger: logging.Logger, config_files: list[FileLogger]) -> None:
    """Attach a JSON-rendering file handler for every configured log file.

    Duplicate filenames are collapsed; the last configured level wins.
    """
    files = {logger_config.filename: logger_config.level for logger_config in config_files}
    for filename, level in files.items():
        try:
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
        except Exception as exc:  # pylint: disable=broad-except
            # Logging is not configured yet, so fall back to print(). The
            # FileHandler below will still raise if the directory is unusable.
            # Fixed: include the actual filename instead of "(unknown)".
            print(f"Cannot create directory for log file ({filename}), application will crash most likely. {exc!r}")
        file_handler = logging.FileHandler(filename=filename, encoding="utf-8")
        file_handler.setFormatter(structlog.stdlib.ProcessorFormatter(processor=structlog.processors.JSONRenderer()))
        file_handler.setLevel(_level_name_mapping[level])
        root_logger.addHandler(file_handler)
def _configure_otel_exporter(root_logger: logging.Logger, config: ExporterConfig) -> None:
    """Attach an OTLP gRPC log-exporting handler to the root logger."""
    provider = LoggerProvider(resource=get_resource())
    set_logger_provider(provider)
    # The preparation processor is registered before the batch exporter so
    # structured bodies are flattened into attributes before being exported.
    provider.add_log_record_processor(OtelLogPreparationProcessor())
    provider.add_log_record_processor(
        BatchLogRecordProcessor(OTLPLogExporter(endpoint=config.endpoint, insecure=config.tls_insecure))
    )
    handler = AttrFilteredLoggingHandler(level=config.level, logger_provider=provider)
    handler.setLevel(_level_name_mapping[config.level])
    root_logger.addHandler(handler)
def _add_open_telemetry_spans(_, __, event_dict: dict):
    """structlog processor: attach the current trace/span ids (hex) to the event."""
    current_span = trace.get_current_span()
    if not current_span or not current_span.is_recording():
        return event_dict
    span_context = current_span.get_span_context()
    event_dict["trace_id"] = format(span_context.trace_id, "032x")
    event_dict["span_id"] = format(span_context.span_id, "016x")
    return event_dict
class AttrFilteredLoggingHandler(LoggingHandler):
    """LoggingHandler that strips internal attributes before export."""

    # Attribute names removed from every exported record.
    DROP_ATTRIBUTES = ["_logger"]

    @staticmethod
    def _get_attributes(record: logging.LogRecord) -> Attributes:
        attributes = LoggingHandler._get_attributes(record)
        for dropped in AttrFilteredLoggingHandler.DROP_ATTRIBUTES:
            attributes.pop(dropped, None)
        return attributes
class OtelLogPreparationProcessor(LogRecordProcessor):
    """Processor which moves everything except message from log record body to attributes."""

    def on_emit(self, log_record: ReadWriteLogRecord) -> None:
        # Only dict bodies (structured event dicts) need flattening; scalar
        # bodies pass through untouched.
        if not isinstance(log_record.log_record.body, dict):
            return
        for key in log_record.log_record.body:
            if key == "event":
                continue
            save_key = key
            # Avoid clobbering an attribute that already exists under this key.
            if key in log_record.log_record.attributes:
                save_key = f"{key}__body"
            log_record.log_record.attributes[save_key] = self._format_value(log_record.log_record.body[key])
        # NOTE(review): assumes a dict body always carries an "event" key
        # (the message field) — this raises KeyError otherwise; confirm.
        log_record.log_record.body = log_record.log_record.body["event"]

    def _format_value(self, value: Any) -> str:
        # Containers are serialized as JSON; everything else via str().
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        return str(value)

    def force_flush(self, timeout_millis=30000):
        # This processor does no buffering of its own — nothing to flush.
        pass

    def shutdown(self):
        # No resources held.
        pass

View File

@@ -1,54 +0,0 @@
"""Application metrics are defined here."""
from dataclasses import dataclass
from opentelemetry import metrics
from opentelemetry.sdk.metrics import Counter, Histogram
@dataclass
class HTTPMetrics:
    """Container for per-request HTTP metric instruments."""

    request_processing_duration: Histogram
    """Processing time histogram in seconds by `["method", "path"]`."""

    requests_started: Counter
    """Total started requests counter by `["method", "path"]`."""

    requests_finished: Counter
    """Total finished requests counter by `["method", "path", "status_code"]`."""

    errors: Counter
    """Total errors (exceptions) counter by `["method", "path", "error_type", "status_code"]`."""
@dataclass
class Metrics:
    """Root container aggregating all application metric groups."""

    http: HTTPMetrics
def init_metrics() -> Metrics:
    """Create and register all application metric instruments on the global meter."""
    meter = metrics.get_meter("{{project_name}}")
    return Metrics(
        http=HTTPMetrics(
            request_processing_duration=meter.create_histogram(
                "request_processing_duration",
                "sec",
                "Request processing duration time in seconds",
                # Buckets cover request latencies from 50 ms up to 2 minutes.
                explicit_bucket_boundaries_advisory=[
                    0.05,
                    0.2,
                    0.3,
                    0.7,
                    1.0,
                    1.5,
                    2.5,
                    5.0,
                    10.0,
                    20.0,
                    40.0,
                    60.0,
                    120.0,
                ],
            ),
            requests_started=meter.create_counter("requests_started_total", "1", "Total number of started requests"),
            # NOTE(review): "request_finished_total" (singular "request") is
            # inconsistent with "requests_started_total"; renaming would change
            # the emitted metric name, so it is only flagged here.
            requests_finished=meter.create_counter("request_finished_total", "1", "Total number of finished requests"),
            errors=meter.create_counter("request_errors_total", "1", "Total number of errors (exceptions) in requests"),
        )
    )

View File

@@ -0,0 +1,113 @@
"""Application metrics are defined here."""
import threading
import time
from dataclasses import dataclass
from typing import Callable, Iterator
import psutil
from opentelemetry import metrics
from opentelemetry.metrics import CallbackOptions, Observation
from opentelemetry.sdk.metrics import Counter, Histogram, UpDownCounter
from {{project_slug}}.version import VERSION
@dataclass
class HTTPMetrics:
    """Container for per-request HTTP metric instruments."""

    request_processing_duration: Histogram
    """Processing time histogram in seconds by `["method", "path"]`."""

    requests_started: Counter
    """Total started requests counter by `["method", "path"]`."""

    requests_finished: Counter
    """Total finished requests counter by `["method", "path", "status_code"]`."""

    errors: Counter
    """Total errors (exceptions) counter by `["method", "path", "error_type", "status_code"]`."""

    inflight_requests: UpDownCounter
    """Current number of requests handled simultaneously."""
@dataclass
class Metrics:
    """Root container aggregating all application metric groups."""

    http: HTTPMetrics
def setup_metrics() -> Metrics:
    """Create and register all application metric instruments on the global meter.

    Also registers the callback-based observable gauges (process CPU usage,
    startup / last-update timestamps, active thread count).
    """
    meter = metrics.get_meter("{{project_name}}")
    _setup_callback_metrics(meter)
    return Metrics(
        http=HTTPMetrics(
            request_processing_duration=meter.create_histogram(
                "request_processing_duration",
                "sec",
                "Request processing duration time in seconds",
                # Buckets cover request latencies from 50 ms up to 2 minutes.
                explicit_bucket_boundaries_advisory=[
                    0.05,
                    0.2,
                    0.3,
                    0.7,
                    1.0,
                    1.5,
                    2.5,
                    5.0,
                    10.0,
                    20.0,
                    40.0,
                    60.0,
                    120.0,
                ],
            ),
            requests_started=meter.create_counter("requests_started_total", "1", "Total number of started requests"),
            # NOTE(review): "request_finished_total" (singular "request") is
            # inconsistent with "requests_started_total"; renaming would change
            # the emitted metric name, so it is only flagged here.
            requests_finished=meter.create_counter("request_finished_total", "1", "Total number of finished requests"),
            errors=meter.create_counter("request_errors_total", "1", "Total number of errors (exceptions) in requests"),
            # Typo fixed in description text: "simultaniously" -> "simultaneously".
            inflight_requests=meter.create_up_down_counter(
                "inflight_requests", "1", "Current number of requests handled simultaneously"
            ),
        )
    )
def _setup_callback_metrics(meter: metrics.Meter) -> None:
    """Register observable gauges whose values are collected via callbacks."""
    gauge_specs = (
        ("system_resource_usage", "System resource utilization", _get_system_metrics_callback()),
        ("application_metrics", "Application-specific metrics", _get_application_metrics_callback()),
    )
    for gauge_name, description, callback in gauge_specs:
        meter.create_observable_gauge(
            name=gauge_name,
            description=description,
            unit="1",
            callbacks=[callback],
        )
def _get_system_metrics_callback() -> Callable[[CallbackOptions], Iterator[Observation]]:
    """Build the observable-gauge callback reporting process CPU usage.

    Return annotation fixed: the callback is a generator yielding
    Observations, not a function returning None.
    """

    def system_metrics_callback(options: CallbackOptions) -> Iterator[Observation]:  # pylint: disable=unused-argument
        """Collect system metrics on each scrape."""
        # Process CPU time split by mode, a bit more information than
        # `process_cpu_seconds_total`.
        cpu_times = psutil.Process().cpu_times()
        yield Observation(cpu_times.user, {"resource": "cpu", "mode": "user"})
        yield Observation(cpu_times.system, {"resource": "cpu", "mode": "system"})

    return system_metrics_callback
def _get_application_metrics_callback() -> Callable[[CallbackOptions], Iterator[Observation]]:
    """Build the observable-gauge callback for startup time, scrape time and thread count.

    Return annotation fixed: the callback is a generator yielding
    Observations, not a function returning None.
    """
    # Captured once when metrics are set up; reported as the startup timestamp.
    startup_time = time.time()

    def application_metrics_callback(options: CallbackOptions) -> Iterator[Observation]:  # pylint: disable=unused-argument
        """Collect application-specific metrics on each scrape."""
        yield Observation(startup_time, {"metric": "startup_time", "version": VERSION})
        # Timestamp of the current scrape — lets dashboards spot stale data.
        yield Observation(time.time(), {"metric": "last_update_time", "version": VERSION})
        yield Observation(threading.active_count(), {"metric": "active_threads"})

    return application_metrics_callback

View File

@@ -1,10 +1,13 @@
"""Open Telemetry agent initialization is defined here"""
import platform
from functools import cache
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
from opentelemetry.sdk.resources import SERVICE_INSTANCE_ID, SERVICE_NAME, SERVICE_VERSION, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
@@ -14,14 +17,16 @@ from {{project_slug}}.version import VERSION as APP_VERSION
from .metrics_server import PrometheusServer
@cache
def get_resource() -> Resource:
    """Return the cached OTel Resource identifying this service instance."""
    # SERVICE_INSTANCE_ID is the host name, so parallel instances are distinguishable.
    return Resource.create(
        attributes={SERVICE_NAME: "{{project_slug}}", SERVICE_VERSION: APP_VERSION, SERVICE_INSTANCE_ID: platform.node()}
    )
class OpenTelemetryAgent: # pylint: disable=too-few-public-methods
def __init__(self, prometheus_config: PrometheusConfig | None, jaeger_config: JaegerConfig | None):
self._resource = Resource.create(
attributes={
SERVICE_NAME: "{{project_name}}",
SERVICE_VERSION: APP_VERSION,
}
)
self._resource = get_resource()
self._prometheus: PrometheusServer | None = None
self._span_exporter: OTLPSpanExporter | None = None

View File

@@ -0,0 +1,64 @@
"""Observability-related utility functions and classes are located here."""
import re
from collections import defaultdict
import fastapi
import structlog
from opentelemetry import trace
class URLsMapper:
"""Helper to change URL from given regex pattern to the given static value.
For example, with map {"GET": {"/api/debug/.*": "/api/debug/*"}} all GET-requests with URL
starting with "/api/debug/" will be placed in path "/api/debug/*" in metrics.
"""
def __init__(self, urls_map: dict[str, dict[str, str]] | None = None):
self._map: dict[str, dict[re.Pattern, str]] = defaultdict(dict)
"""[method -> [pattern -> mapped_to]]"""
if urls_map is not None:
for method, patterns in urls_map.items():
for pattern, value in patterns.items():
self.add(method, pattern, value)
def add(self, method: str, pattern: str, mapped_to: str) -> None:
"""Add entry to the map. If pattern compilation is failed, ValueError is raised."""
regexp = re.compile(pattern)
self._map[method.upper()][regexp] = mapped_to
def add_routes(self, routes: list[fastapi.routing.APIRoute]) -> None:
"""Add full route regexes to the map."""
logger: structlog.stdlib.BoundLogger = structlog.get_logger(__name__)
for route in routes:
if not hasattr(route, "path_regex") or not hasattr(route, "path"):
logger.warning("route has no 'path_regex' or 'path' attribute", route=route)
continue
if "{" not in route.path: # ignore simple routes
continue
route_path = route.path
while "{" in route_path:
lbrace = route_path.index("{")
rbrace = route_path.index("}", lbrace + 1)
route_path = route_path[:lbrace] + "*" + route_path[rbrace + 1 :]
for method in route.methods:
self._map[method.upper()][route.path_regex] = route_path
def map(self, method: str, url: str) -> str:
"""Check every map entry with `re.match` and return matched value. If not found, return original string."""
for regexp, mapped_to in self._map[method.upper()].items():
if regexp.match(url) is not None:
return mapped_to
return url
def get_tracing_headers() -> dict[str, str]:
    """Return X-Span-Id/X-Trace-Id headers (hex) for the current span, or {} when no trace is active."""
    span_context = trace.get_current_span().get_span_context()
    if span_context.trace_id == 0:
        # A zero trace id means there is no valid span context to propagate.
        return {}
    return {
        "X-Span-Id": f"{span_context.span_id:016x}",
        "X-Trace-Id": f"{span_context.trace_id:032x}",
    }