How would you implement structured logging in a Gen AI application?
#gen-ai#python#mlops
Answer
Structured Logging in Gen AI Applications
Structured logging outputs logs as machine-readable JSON rather than plain text strings — enabling search, filtering, alerting, and dashboards in tools like Datadog, CloudWatch, and Grafana.
Why Structured Logging Matters
python# ❌ Unstructured — hard to search and parse print(f"User asked: {question}, LLM took 3.2s, used 512 tokens") # ✅ Structured — queryable, parseable, alertable logger.info("llm_request_completed", extra={ "question": question, "latency_ms": 3200, "tokens_used": 512, "model": "gpt-4o", "user_id": "user_123", })
Setup with textstructlog
text
structlogpythonimport structlog import logging import sys def configure_logging(environment: str = "development"): # Configure Python logging backend logging.basicConfig( stream=sys.stdout, level=logging.INFO, format="%(message)s", ) # Configure structlog processors = [ structlog.contextvars.merge_contextvars, structlog.processors.add_log_level, structlog.processors.TimeStamper(fmt="iso"), structlog.processors.StackInfoRenderer(), ] if environment == "production": processors.append(structlog.processors.JSONRenderer()) else: processors.append(structlog.dev.ConsoleRenderer()) # Pretty for dev structlog.configure( processors=processors, wrapper_class=structlog.make_filtering_bound_logger(logging.INFO), context_class=dict, logger_factory=structlog.PrintLoggerFactory(), ) logger = structlog.get_logger()
LLM Request Logging
pythonimport time import structlog from openai import OpenAI from typing import Optional logger = structlog.get_logger() client = OpenAI() def tracked_llm_call( messages: list[dict], model: str = "gpt-4o", user_id: Optional[str] = None, request_id: Optional[str] = None, ) -> str: start = time.perf_counter() # Bind context for this request (all logs in this call include these fields) bound_logger = logger.bind( model=model, user_id=user_id, request_id=request_id, message_count=len(messages), ) bound_logger.info("llm_request_started") try: response = client.chat.completions.create( model=model, messages=messages, ) latency_ms = int((time.perf_counter() - start) * 1000) usage = response.usage bound_logger.info( "llm_request_completed", latency_ms=latency_ms, prompt_tokens=usage.prompt_tokens, completion_tokens=usage.completion_tokens, total_tokens=usage.total_tokens, cost_usd=estimate_cost(usage, model), ) return response.choices[0].message.content except Exception as e: latency_ms = int((time.perf_counter() - start) * 1000) bound_logger.error( "llm_request_failed", error=str(e), error_type=type(e).__name__, latency_ms=latency_ms, ) raise def estimate_cost(usage, model: str) -> float: rates = { "gpt-4o": {"prompt": 0.0025, "completion": 0.01}, "gpt-4o-mini": {"prompt": 0.00015, "completion": 0.0006}, } r = rates.get(model, {"prompt": 0, "completion": 0}) return (usage.prompt_tokens * r["prompt"] + usage.completion_tokens * r["completion"]) / 1000
RAG Pipeline Logging
pythonimport uuid def rag_query(question: str, user_id: str) -> dict: request_id = str(uuid.uuid4()) # Context vars propagate to all log calls within this request structlog.contextvars.bind_contextvars( request_id=request_id, user_id=user_id, ) logger.info("rag_query_started", question=question) # Retrieval t0 = time.perf_counter() docs = retriever.retrieve(question) retrieval_ms = int((time.perf_counter() - t0) * 1000) logger.info("retrieval_completed", docs_retrieved=len(docs), retrieval_ms=retrieval_ms, top_score=docs[0].score if docs else None, ) # Generation answer = tracked_llm_call( messages=build_messages(question, docs), user_id=user_id, request_id=request_id, ) logger.info("rag_query_completed", answer_length=len(answer)) structlog.contextvars.clear_contextvars() return {"answer": answer, "sources": [d.source for d in docs]}
JSON Output in Production
json{"event": "llm_request_completed", "model": "gpt-4o", "user_id": "user_123", "request_id": "abc-123", "latency_ms": 3241, "prompt_tokens": 512, "completion_tokens": 128, "total_tokens": 640, "cost_usd": 0.00256, "timestamp": "2026-03-20T10:23:45Z", "level": "info"}
This JSON can be ingested by Datadog, CloudWatch Logs, Grafana Loki, or Elasticsearch for dashboards and alerts.
Key metrics to log and alert on: p99 latency > 10s, cost_usd > $0.10 per request, error rate > 1%, token usage approaching model limits.