Version 0.4.0
All checks were successful
Run linters on applied template / Python 3.13 lint and build (push) Successful in 1m40s
Changes:
- put ObservabilityMiddleware before ExceptionHandlerMiddleware to avoid repetitive code
- add application startup and last-metrics-update metrics, along with a CPU usage metric and thread count
- move host and port to the new uvicorn section of the config, along with the new reload and forwarded_allow_ips options
- add request_id and remove trace_id/span_id generation when tracing is disabled
- move logging logic from utils to observability
- pass trace_id/span_id in hex form
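As an illustration of the config change, the new uvicorn section could be modeled as below. This is a sketch only: the changelog names host, port, reload and forwarded_allow_ips, while the defaults and the class name are assumptions.

# Hypothetical sketch of the new `uvicorn` config section; defaults are assumptions.
from dataclasses import dataclass


@dataclass
class UvicornConfig:
    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False  # auto-reload on code change, for local development
    forwarded_allow_ips: str = "*"  # proxies trusted to set X-Forwarded-* headers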
{{project_slug}}/observability/config.py (new file, 49 lines)
@@ -0,0 +1,49 @@
"""Observability config is defined here."""

from dataclasses import dataclass, field
from typing import Literal

LoggingLevel = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]


@dataclass
class ExporterConfig:
    endpoint: str
    level: LoggingLevel = "INFO"
    tls_insecure: bool = False


@dataclass
class FileLogger:
    filename: str
    level: LoggingLevel


@dataclass
class LoggingConfig:
    root_logger_level: LoggingLevel = "INFO"
    stderr_level: LoggingLevel | None = None
    exporter: ExporterConfig | None = None
    files: list[FileLogger] = field(default_factory=list)

    def __post_init__(self):
        if len(self.files) > 0 and isinstance(self.files[0], dict):
            self.files = [FileLogger(**f) for f in self.files]


@dataclass
class PrometheusConfig:
    host: str
    port: int


@dataclass
class JaegerConfig:
    endpoint: str


@dataclass
class ObservabilityConfig:
    logging: LoggingConfig
    prometheus: PrometheusConfig | None = None
    jaeger: JaegerConfig | None = None
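For reference, a minimal sketch of constructing this config directly; endpoint and port values are examples only. Note that __post_init__ also accepts plain dicts for files, so the class can be fed parsed YAML/TOML as-is.

config = ObservabilityConfig(
    logging=LoggingConfig(
        root_logger_level="INFO",
        stderr_level="DEBUG",
        files=[FileLogger(filename="logs/app.log", level="INFO")],
        exporter=ExporterConfig(endpoint="otel-collector:4317", tls_insecure=True),
    ),
    prometheus=PrometheusConfig(host="0.0.0.0", port=9090),
    jaeger=JaegerConfig(endpoint="http://jaeger:4318/v1/traces"),
)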
{{project_slug}}/observability/logging.py.jinja (new file, 165 lines)
@@ -0,0 +1,165 @@
"""Observability helper functions are defined here."""

import json
import logging
import sys
from pathlib import Path
from typing import Any

import structlog
from opentelemetry import trace
from opentelemetry._logs import set_logger_provider
from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
from opentelemetry.sdk._logs import (
    LoggerProvider,
    LoggingHandler,
    LogRecordProcessor,
    ReadWriteLogRecord,
)
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.util.types import Attributes

from {{project_slug}}.observability.otel_agent import get_resource

from .config import ExporterConfig, FileLogger, LoggingConfig, LoggingLevel


def configure_logging(
    config: LoggingConfig,
    tracing_enabled: bool,
) -> structlog.stdlib.BoundLogger:
    processors = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
    ]

    if tracing_enabled:
        processors.insert(len(processors) - 1, _add_open_telemetry_spans)

    structlog.configure(
        processors=processors,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(config.root_logger_level)

    if config.stderr_level is not None:
        _configure_stderr_logger(root_logger, config.stderr_level)

    if len(config.files) > 0:
        _configure_file_loggers(root_logger, config.files)

    if config.exporter is not None:
        _configure_otel_exporter(root_logger, config.exporter)

    logger: structlog.stdlib.BoundLogger = structlog.get_logger("{{project_name}}")
    logger.setLevel(_level_name_mapping[config.root_logger_level])

    return logger


_level_name_mapping: dict[LoggingLevel, int] = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}


def _configure_stderr_logger(root_logger: logging.Logger, level: LoggingLevel) -> None:
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(
        structlog.stdlib.ProcessorFormatter(processor=structlog.dev.ConsoleRenderer(colors=True))
    )
    stderr_handler.setLevel(_level_name_mapping[level])
    root_logger.addHandler(stderr_handler)


def _configure_file_loggers(root_logger: logging.Logger, config_files: list[FileLogger]) -> None:
    files = {logger_config.filename: logger_config.level for logger_config in config_files}
    for filename, level in files.items():
        try:
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
        except Exception as exc:  # pylint: disable=broad-except
            print(f"Cannot create directory for log file {filename}, application will most likely crash. {exc!r}")
        file_handler = logging.FileHandler(filename=filename, encoding="utf-8")
        file_handler.setFormatter(structlog.stdlib.ProcessorFormatter(processor=structlog.processors.JSONRenderer()))
        file_handler.setLevel(_level_name_mapping[level])
        root_logger.addHandler(file_handler)


def _configure_otel_exporter(root_logger: logging.Logger, config: ExporterConfig) -> None:
    logger_provider = LoggerProvider(resource=get_resource())
    set_logger_provider(logger_provider)

    otlp_exporter = OTLPLogExporter(endpoint=config.endpoint, insecure=config.tls_insecure)
    logger_provider.add_log_record_processor(OtelLogPreparationProcessor())
    logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter))

    exporter_handler = AttrFilteredLoggingHandler(
        level=config.level,
        logger_provider=logger_provider,
    )
    exporter_handler.setLevel(_level_name_mapping[config.level])
    root_logger.addHandler(exporter_handler)


def _add_open_telemetry_spans(_, __, event_dict: dict):
    span = trace.get_current_span()
    if not span or not span.is_recording():
        return event_dict

    ctx = span.get_span_context()

    event_dict["span_id"] = format(ctx.span_id, "016x")
    event_dict["trace_id"] = format(ctx.trace_id, "032x")

    return event_dict


class AttrFilteredLoggingHandler(LoggingHandler):
    DROP_ATTRIBUTES = ["_logger"]

    @staticmethod
    def _get_attributes(record: logging.LogRecord) -> Attributes:
        attributes = LoggingHandler._get_attributes(record)
        for attr in AttrFilteredLoggingHandler.DROP_ATTRIBUTES:
            if attr in attributes:
                del attributes[attr]
        return attributes


class OtelLogPreparationProcessor(LogRecordProcessor):
    """Processor which moves everything except the message from the log record body to attributes."""

    def on_emit(self, log_record: ReadWriteLogRecord) -> None:
        if not isinstance(log_record.log_record.body, dict):
            return
        for key in log_record.log_record.body:
            if key == "event":
                continue
            save_key = key
            if key in log_record.log_record.attributes:
                save_key = f"{key}__body"
            log_record.log_record.attributes[save_key] = self._format_value(log_record.log_record.body[key])
        log_record.log_record.body = log_record.log_record.body["event"]

    def _format_value(self, value: Any) -> str:
        if isinstance(value, (dict, list)):
            return json.dumps(value)
        return str(value)

    def force_flush(self, timeout_millis=30000):
        pass

    def shutdown(self):
        pass
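A minimal usage sketch, assuming the config dataclasses from config.py above. With tracing_enabled=True and an active recording span, _add_open_telemetry_spans adds trace_id/span_id in hex form to every event.

logger = configure_logging(
    LoggingConfig(root_logger_level="INFO", stderr_level="INFO"),
    tracing_enabled=False,
)
logger.info("application started", port=8000)  # extra kwargs become structured log fields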
@@ -1,54 +0,0 @@
"""Application metrics are defined here."""

from dataclasses import dataclass

from opentelemetry import metrics
from opentelemetry.sdk.metrics import Counter, Histogram


@dataclass
class HTTPMetrics:
    request_processing_duration: Histogram
    """Processing time histogram in seconds by `["method", "path"]`."""
    requests_started: Counter
    """Total started requests counter by `["method", "path"]`."""
    requests_finished: Counter
    """Total finished requests counter by `["method", "path", "status_code"]`."""
    errors: Counter
    """Total errors (exceptions) counter by `["method", "path", "error_type", "status_code"]`."""


@dataclass
class Metrics:
    http: HTTPMetrics


def init_metrics() -> Metrics:
    meter = metrics.get_meter("{{project_name}}")
    return Metrics(
        http=HTTPMetrics(
            request_processing_duration=meter.create_histogram(
                "request_processing_duration",
                "sec",
                "Request processing duration time in seconds",
                explicit_bucket_boundaries_advisory=[
                    0.05,
                    0.2,
                    0.3,
                    0.7,
                    1.0,
                    1.5,
                    2.5,
                    5.0,
                    10.0,
                    20.0,
                    40.0,
                    60.0,
                    120.0,
                ],
            ),
            requests_started=meter.create_counter("requests_started_total", "1", "Total number of started requests"),
            requests_finished=meter.create_counter("request_finished_total", "1", "Total number of finished requests"),
            errors=meter.create_counter("request_errors_total", "1", "Total number of errors (exceptions) in requests"),
        )
    )
{{project_slug}}/observability/metrics.py.jinja (new file, 113 lines)
@@ -0,0 +1,113 @@
"""Application metrics are defined here."""

import threading
import time
from dataclasses import dataclass
from typing import Callable, Iterable

import psutil
from opentelemetry import metrics
from opentelemetry.metrics import CallbackOptions, Observation
from opentelemetry.sdk.metrics import Counter, Histogram, UpDownCounter

from {{project_slug}}.version import VERSION


@dataclass
class HTTPMetrics:
    request_processing_duration: Histogram
    """Processing time histogram in seconds by `["method", "path"]`."""
    requests_started: Counter
    """Total started requests counter by `["method", "path"]`."""
    requests_finished: Counter
    """Total finished requests counter by `["method", "path", "status_code"]`."""
    errors: Counter
    """Total errors (exceptions) counter by `["method", "path", "error_type", "status_code"]`."""
    inflight_requests: UpDownCounter
    """Current number of requests handled simultaneously."""


@dataclass
class Metrics:
    http: HTTPMetrics


def setup_metrics() -> Metrics:
    meter = metrics.get_meter("{{project_name}}")

    _setup_callback_metrics(meter)

    return Metrics(
        http=HTTPMetrics(
            request_processing_duration=meter.create_histogram(
                "request_processing_duration",
                "sec",
                "Request processing duration time in seconds",
                explicit_bucket_boundaries_advisory=[
                    0.05,
                    0.2,
                    0.3,
                    0.7,
                    1.0,
                    1.5,
                    2.5,
                    5.0,
                    10.0,
                    20.0,
                    40.0,
                    60.0,
                    120.0,
                ],
            ),
            requests_started=meter.create_counter("requests_started_total", "1", "Total number of started requests"),
            requests_finished=meter.create_counter("request_finished_total", "1", "Total number of finished requests"),
            errors=meter.create_counter("request_errors_total", "1", "Total number of errors (exceptions) in requests"),
            inflight_requests=meter.create_up_down_counter(
                "inflight_requests", "1", "Current number of requests handled simultaneously"
            ),
        )
    )


def _setup_callback_metrics(meter: metrics.Meter) -> None:
    # Observable gauges pull values from the callbacks below at collection time
    meter.create_observable_gauge(
        name="system_resource_usage",
        description="System resource utilization",
        unit="1",
        callbacks=[_get_system_metrics_callback()],
    )
    meter.create_observable_gauge(
        name="application_metrics",
        description="Application-specific metrics",
        unit="1",
        callbacks=[_get_application_metrics_callback()],
    )


def _get_system_metrics_callback() -> Callable[[CallbackOptions], Iterable[Observation]]:
    def system_metrics_callback(options: CallbackOptions):  # pylint: disable=unused-argument
        """Callback function to collect system metrics."""

        # Process CPU time, a bit more information than `process_cpu_seconds_total`
        cpu_times = psutil.Process().cpu_times()
        yield Observation(cpu_times.user, {"resource": "cpu", "mode": "user"})
        yield Observation(cpu_times.system, {"resource": "cpu", "mode": "system"})

    return system_metrics_callback


def _get_application_metrics_callback() -> Callable[[CallbackOptions], Iterable[Observation]]:
    startup_time = time.time()

    def application_metrics_callback(options: CallbackOptions):  # pylint: disable=unused-argument
        """Callback function to collect application-specific metrics."""
        # Startup and last-update timestamps
        yield Observation(startup_time, {"metric": "startup_time", "version": VERSION})
        yield Observation(time.time(), {"metric": "last_update_time", "version": VERSION})

        # Active threads
        active_threads = threading.active_count()
        yield Observation(active_threads, {"metric": "active_threads"})

    return application_metrics_callback
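A sketch of how the counters and histogram above might be driven from request-handling code. ObservabilityMiddleware itself is not part of this diff, so the surrounding plumbing and label values here are assumptions; the label keys follow the HTTPMetrics docstrings.

metrics_ = setup_metrics()  # requires a configured MeterProvider (see otel_agent below)
labels = {"method": "GET", "path": "/api/items"}

metrics_.http.requests_started.add(1, labels)
metrics_.http.inflight_requests.add(1, labels)
start = time.perf_counter()
status_code = 200  # ... the actual request handler would run here ...
metrics_.http.request_processing_duration.record(time.perf_counter() - start, labels)
metrics_.http.requests_finished.add(1, {**labels, "status_code": status_code})
metrics_.http.inflight_requests.add(-1, labels)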
@@ -1,10 +1,13 @@
 """Open Telemetry agent initialization is defined here"""

+import platform
+from functools import cache
+
 from opentelemetry import metrics, trace
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.exporter.prometheus import PrometheusMetricReader
 from opentelemetry.sdk.metrics import MeterProvider
-from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
+from opentelemetry.sdk.resources import SERVICE_INSTANCE_ID, SERVICE_NAME, SERVICE_VERSION, Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
@@ -14,14 +17,16 @@ from {{project_slug}}.version import VERSION as APP_VERSION
 from .metrics_server import PrometheusServer


+@cache
+def get_resource() -> Resource:
+    return Resource.create(
+        attributes={SERVICE_NAME: "{{project_slug}}", SERVICE_VERSION: APP_VERSION, SERVICE_INSTANCE_ID: platform.node()}
+    )
+
+
 class OpenTelemetryAgent:  # pylint: disable=too-few-public-methods
     def __init__(self, prometheus_config: PrometheusConfig | None, jaeger_config: JaegerConfig | None):
-        self._resource = Resource.create(
-            attributes={
-                SERVICE_NAME: "{{project_name}}",
-                SERVICE_VERSION: APP_VERSION,
-            }
-        )
+        self._resource = get_resource()
         self._prometheus: PrometheusServer | None = None
         self._span_exporter: OTLPSpanExporter | None = None
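Since get_resource() is wrapped in functools.cache, every subsystem (logs in logging.py.jinja above, traces and metrics here) shares a single Resource instance. A small sketch of the resulting behavior:

r1 = get_resource()
r2 = get_resource()
assert r1 is r2  # @cache returns the same Resource on every call
assert r1.attributes[SERVICE_NAME] == "{{project_slug}}"  # SERVICE_NAME is "service.name"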
{{project_slug}}/observability/utils.py (new file, 64 lines)
@@ -0,0 +1,64 @@
"""Observability-related utility functions and classes are located here."""

import re
from collections import defaultdict

import fastapi
import structlog
from opentelemetry import trace


class URLsMapper:
    """Helper to map URLs matching a given regex pattern to a given static value.

    For example, with map {"GET": {"/api/debug/.*": "/api/debug/*"}} all GET requests with a URL
    starting with "/api/debug/" will be placed in path "/api/debug/*" in metrics.
    """

    def __init__(self, urls_map: dict[str, dict[str, str]] | None = None):
        self._map: dict[str, dict[re.Pattern, str]] = defaultdict(dict)
        """[method -> [pattern -> mapped_to]]"""

        if urls_map is not None:
            for method, patterns in urls_map.items():
                for pattern, value in patterns.items():
                    self.add(method, pattern, value)

    def add(self, method: str, pattern: str, mapped_to: str) -> None:
        """Add an entry to the map. If pattern compilation fails, ValueError is raised."""
        regexp = re.compile(pattern)
        self._map[method.upper()][regexp] = mapped_to

    def add_routes(self, routes: list[fastapi.routing.APIRoute]) -> None:
        """Add full route regexes to the map."""
        logger: structlog.stdlib.BoundLogger = structlog.get_logger(__name__)
        for route in routes:
            if not hasattr(route, "path_regex") or not hasattr(route, "path"):
                logger.warning("route has no 'path_regex' or 'path' attribute", route=route)
                continue
            if "{" not in route.path:  # ignore simple routes
                continue
            route_path = route.path
            # Replace each "{param}" segment with "*" to get a stable metrics label
            while "{" in route_path:
                lbrace = route_path.index("{")
                rbrace = route_path.index("}", lbrace + 1)
                route_path = route_path[:lbrace] + "*" + route_path[rbrace + 1 :]
            for method in route.methods:
                self._map[method.upper()][route.path_regex] = route_path

    def map(self, method: str, url: str) -> str:
        """Check every map entry with `re.match` and return the matched value. If none match, return the original string."""
        for regexp, mapped_to in self._map[method.upper()].items():
            if regexp.match(url) is not None:
                return mapped_to
        return url


def get_tracing_headers() -> dict[str, str]:
    ctx = trace.get_current_span().get_span_context()
    if ctx.trace_id == 0:
        return {}
    return {
        "X-Span-Id": format(ctx.span_id, "016x"),
        "X-Trace-Id": format(ctx.trace_id, "032x"),
    }
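A usage sketch for URLsMapper; the patterns and URLs are illustrative, and the behavior follows the docstrings above. Note that get_tracing_headers() returns an empty dict when there is no active trace (trace_id == 0), and otherwise the X-Trace-Id/X-Span-Id headers carry the ids in hex form, matching the changelog entry.

mapper = URLsMapper({"GET": {"/api/debug/.*": "/api/debug/*"}})
mapper.add("GET", r"/api/items/\d+", "/api/items/*")

assert mapper.map("get", "/api/debug/vars") == "/api/debug/*"  # method is case-insensitive
assert mapper.map("GET", "/api/items/42") == "/api/items/*"
assert mapper.map("GET", "/healthcheck") == "/healthcheck"  # unmatched URLs pass through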