Concept #34Mediumpython-for-gen-ai

How would you implement structured logging in a Gen AI application?

#gen-ai#python#mlops

Answer

Structured Logging in Gen AI Applications

Structured logging outputs logs as machine-readable JSON rather than plain text strings — enabling search, filtering, alerting, and dashboards in tools like Datadog, CloudWatch, and Grafana.

Why Structured Logging Matters

python
# ❌ Unstructured — hard to search and parse
print(f"User asked: {question}, LLM took 3.2s, used 512 tokens")

# ✅ Structured — queryable, parseable, alertable
logger.info("llm_request_completed", extra={
    "question": question,
    "latency_ms": 3200,
    "tokens_used": 512,
    "model": "gpt-4o",
    "user_id": "user_123",
})

Setup with
text
structlog

python
import structlog
import logging
import sys

def configure_logging(environment: str = "development"):
    # Configure Python logging backend
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(message)s",
    )

    # Configure structlog
    processors = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
    ]

    if environment == "production":
        processors.append(structlog.processors.JSONRenderer())
    else:
        processors.append(structlog.dev.ConsoleRenderer())  # Pretty for dev

    structlog.configure(
        processors=processors,
        wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
    )

logger = structlog.get_logger()

LLM Request Logging

python
import time
import structlog
from openai import OpenAI
from typing import Optional

logger = structlog.get_logger()
client = OpenAI()

def tracked_llm_call(
    messages: list[dict],
    model: str = "gpt-4o",
    user_id: Optional[str] = None,
    request_id: Optional[str] = None,
) -> str:
    start = time.perf_counter()

    # Bind context for this request (all logs in this call include these fields)
    bound_logger = logger.bind(
        model=model,
        user_id=user_id,
        request_id=request_id,
        message_count=len(messages),
    )

    bound_logger.info("llm_request_started")

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )

        latency_ms = int((time.perf_counter() - start) * 1000)
        usage = response.usage

        bound_logger.info(
            "llm_request_completed",
            latency_ms=latency_ms,
            prompt_tokens=usage.prompt_tokens,
            completion_tokens=usage.completion_tokens,
            total_tokens=usage.total_tokens,
            cost_usd=estimate_cost(usage, model),
        )

        return response.choices[0].message.content

    except Exception as e:
        latency_ms = int((time.perf_counter() - start) * 1000)
        bound_logger.error(
            "llm_request_failed",
            error=str(e),
            error_type=type(e).__name__,
            latency_ms=latency_ms,
        )
        raise

def estimate_cost(usage, model: str) -> float:
    rates = {
        "gpt-4o": {"prompt": 0.0025, "completion": 0.01},
        "gpt-4o-mini": {"prompt": 0.00015, "completion": 0.0006},
    }
    r = rates.get(model, {"prompt": 0, "completion": 0})
    return (usage.prompt_tokens * r["prompt"] + usage.completion_tokens * r["completion"]) / 1000

RAG Pipeline Logging

python
import uuid

def rag_query(question: str, user_id: str) -> dict:
    request_id = str(uuid.uuid4())

    # Context vars propagate to all log calls within this request
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        user_id=user_id,
    )

    logger.info("rag_query_started", question=question)

    # Retrieval
    t0 = time.perf_counter()
    docs = retriever.retrieve(question)
    retrieval_ms = int((time.perf_counter() - t0) * 1000)

    logger.info("retrieval_completed",
        docs_retrieved=len(docs),
        retrieval_ms=retrieval_ms,
        top_score=docs[0].score if docs else None,
    )

    # Generation
    answer = tracked_llm_call(
        messages=build_messages(question, docs),
        user_id=user_id,
        request_id=request_id,
    )

    logger.info("rag_query_completed", answer_length=len(answer))
    structlog.contextvars.clear_contextvars()

    return {"answer": answer, "sources": [d.source for d in docs]}

JSON Output in Production

json
{"event": "llm_request_completed", "model": "gpt-4o", "user_id": "user_123",
 "request_id": "abc-123", "latency_ms": 3241, "prompt_tokens": 512,
 "completion_tokens": 128, "total_tokens": 640, "cost_usd": 0.00256,
 "timestamp": "2026-03-20T10:23:45Z", "level": "info"}

This JSON can be ingested by Datadog, CloudWatch Logs, Grafana Loki, or Elasticsearch for dashboards and alerts.

Key metrics to log and alert on: p99 latency > 10s, cost_usd > $0.10 per request, error rate > 1%, token usage approaching model limits.