Concept #47 · Medium · production-mlops

What's your strategy for handling model updates in production?

#gen-ai #mlops

Answer

Model Update Strategy in Production

Updating LLM models in production (e.g., migrating from GPT-4 to GPT-4o, or deploying a fine-tuned model) requires careful validation to avoid regressions.

The Model Update Framework

Step 1: Offline Evaluation on Golden Set

python
from dataclasses import dataclass

@dataclass
class ModelEvalResult:
    '''Summary metrics from one offline evaluation run of a single model.'''

    # Name of the evaluated model (e.g. "gpt-4").
    model: str
    # Fraction of golden-set items judged correct, between 0.0 and 1.0.
    accuracy: float
    # Average faithfulness score; presumably produced by RAGAS — scale is not
    # shown here, TODO confirm.
    avg_faithfulness: float
    # 95th-percentile response latency, in milliseconds.
    p95_latency_ms: float
    # Estimated cost per 1,000 queries; presumably in USD — confirm against
    # estimate_cost.
    cost_per_1k_queries: float

def evaluate_model(model_name: str, golden_set: list[dict]) -> ModelEvalResult:
    '''Evaluate a model offline against a golden question set.

    Every golden item must provide a ``"question"`` string and a
    ``"keywords"`` iterable of strings. An answer counts as correct when it
    contains at least one keyword, compared case-insensitively.

    Args:
        model_name: Model identifier passed to ``call_model`` and
            ``estimate_cost``.
        golden_set: Non-empty list of golden items.

    Returns:
        A ``ModelEvalResult`` holding keyword accuracy, p95 latency in
        milliseconds and the estimated cost. ``avg_faithfulness`` is a 0.0
        placeholder until RAGAS scoring is wired in.

    Raises:
        ValueError: If ``golden_set`` is empty.
    '''
    import time

    # Fail with a clear message instead of a ZeroDivisionError further down.
    if not golden_set:
        raise ValueError("golden_set must contain at least one item")

    results = []
    latencies = []

    for item in golden_set:
        start = time.perf_counter()
        answer = call_model(model_name, item["question"])
        latencies.append((time.perf_counter() - start) * 1000)

        # Lowercase both sides: lowering only the answer would make any
        # keyword containing a capital letter impossible to match.
        answer_lower = answer.lower()
        correct = any(kw.lower() in answer_lower for kw in item["keywords"])
        results.append({"correct": correct, "answer": answer, "question": item["question"]})

    accuracy = sum(r["correct"] for r in results) / len(results)
    latencies.sort()
    # int(n * 0.95) is always < n for n >= 1, so the index is in range.
    p95 = latencies[int(len(latencies) * 0.95)]

    return ModelEvalResult(
        model=model_name,
        accuracy=accuracy,
        # TODO: fill with RAGAS faithfulness scores computed from `results`.
        avg_faithfulness=0.0,
        p95_latency_ms=p95,
        cost_per_1k_queries=estimate_cost(model_name, golden_set),
    )

# Compare the current production model with the candidate before deployment.
current = evaluate_model("gpt-4", golden_set)
candidate = evaluate_model("gpt-4o", golden_set)

# Each line prints "<current> → <candidate>" for one metric.
print(f"Accuracy: {current.accuracy:.2%} → {candidate.accuracy:.2%}")
print(f"p95 Latency: {current.p95_latency_ms:.0f}ms → {candidate.p95_latency_ms:.0f}ms")
print(f"Cost/1K: ${current.cost_per_1k_queries:.3f} → ${candidate.cost_per_1k_queries:.3f}")

Step 2: Shadow Mode Testing

python
import threading
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

def shadow_call(question: str, primary_model: str, shadow_model: str) -> str:
    '''Answer with the primary model and mirror the request to a shadow model.

    The primary call is synchronous and its answer is returned to the caller.
    The shadow call then runs in a daemon thread; its answer is only logged
    (as a ``shadow_comparison`` event) and never returned.

    Args:
        question: User question sent to both models.
        primary_model: Model whose answer is returned.
        shadow_model: Candidate model evaluated in the background.

    Returns:
        The primary model's answer.
    '''

    def shadow_worker(primary_answer: str) -> None:
        try:
            shadow_answer = call_model(shadow_model, question)
        except Exception as e:
            # Best-effort: a failing shadow model must never affect the user.
            logger.exception("Shadow call failed: %s", e)
            return

        # The stdlib logger rejects arbitrary keyword arguments, so the
        # structured fields travel via `extra`; a formatter or handler must
        # be configured to emit them.
        logger.info(
            "shadow_comparison",
            extra={
                "question": question[:100],
                "primary_model": primary_model,
                "shadow_model": shadow_model,
                "primary_answer": primary_answer[:200],
                "shadow_answer": shadow_answer[:200],
            },
        )

    # Primary call (synchronous — user waits for this)
    answer = call_model(primary_model, question)

    # Shadow call (async — user doesn't wait). Daemon thread, so a pending
    # shadow call does not block interpreter shutdown.
    thread = threading.Thread(target=shadow_worker, args=(answer,), daemon=True)
    thread.start()

    return answer

Step 3: Canary Deployment

python
import random

# Canary traffic split: each entry names a model and its relative weight.
MODEL_CONFIG = {
    "primary": dict(model="gpt-4", weight=90),
    "canary": dict(model="gpt-4o", weight=10),
}

def route_request(question: str) -> str:
    '''Route a request to a model picked by weighted random selection.

    Each ``MODEL_CONFIG`` entry is chosen with probability proportional to
    its ``"weight"``. With the default 90/10 split, about 10% of traffic
    goes to the canary model.

    Args:
        question: User question forwarded to the selected model.

    Returns:
        The selected model's answer.

    Raises:
        ValueError: If the configured weights total zero (raised by
            ``random.choices``).
    '''
    models = [config["model"] for config in MODEL_CONFIG.values()]
    weights = [config["weight"] for config in MODEL_CONFIG.values()]

    # random.choices normalizes by the total weight, so a model is always
    # selected even when the weights do not sum to exactly 100.
    model = random.choices(models, weights=weights, k=1)[0]

    # The stdlib logger rejects arbitrary keyword arguments; structured
    # fields must be passed through `extra`.
    logger.info("model_routing", extra={"model": model, "question": question[:50]})
    return call_model(model, question)

Rollback Strategy

python
# Feature-flag rollback switch: changing it needs no redeployment.
MODEL_OVERRIDE = None  # e.g. "gpt-4" forces a rollback to that model

def get_active_model() -> str:
    '''Return the override model when one is set, otherwise the default.'''
    # Any falsy override (None or "") falls through to the current default.
    return MODEL_OVERRIDE or "gpt-4o"

# To roll back, set MODEL_OVERRIDE = "gpt-4" via the config API.
# No code deployment is needed — the change takes effect immediately.

Promotion Criteria

| Gate | Threshold | Measurement Period |
|---|---|---|
| Accuracy vs golden set | ≥ 95% of baseline | Pre-deployment |
| Faithfulness score | ≥ 4.0/5.0 | 24h in canary |
| p95 latency | ≤ 110% of baseline | 24h in canary |
| Error rate | ≤ baseline | 24h in canary |
| User feedback | ≥ baseline CSAT | 48h in canary |

Key principle: Never promote a new model based on offline metrics alone. Always validate with real production traffic through shadow testing and canary deployment.