Concept #50 · Medium · production-mlops

What's your testing strategy for Gen AI applications?

#gen-ai #mlops

Answer

Testing Strategy for Gen AI Applications

Gen AI testing is harder than traditional software because outputs are non-deterministic and quality is subjective. You need a multi-layer testing pyramid.

The Gen AI Testing Pyramid

text
           /\
          /  \
         / E2E \          ← Full pipeline tests (few, slow)
        /--------\
       /  Quality  \      ← LLM-as-judge + RAGAS (medium)
      /   Evals    \
     /--------------\
    /  Integration  \     ← Component integration tests (medium)
   /------------------\
  /  Unit Tests        \  ← Deterministic functions (many, fast)
 /----------------------\

Layer 1: Unit Tests (Deterministic Functions)

python
import pytest
from src.chunking import chunk_text
from src.similarity import cosine_similarity
import numpy as np

def test_chunk_text_basic():
    '''Chunking 100 words at size 20 yields several chunks, none above the limit.'''
    sample = "word " * 100
    pieces = chunk_text(sample, chunk_size=20, overlap=5)
    assert len(pieces) > 1
    # Check every chunk individually so a failure points at the offender.
    for piece in pieces:
        assert piece["word_count"] <= 20

def test_chunk_text_overlap():
    '''The last words of the first chunk reappear at the start of the second.'''
    overlap = 3
    source = " ".join([f"w{i}" for i in range(30)])
    chunks = chunk_text(source, chunk_size=10, overlap=overlap)
    # The tail of chunk 0 and the head of chunk 1 must be the same words.
    tail_of_first = chunks[0]["text"].split()[-overlap:]
    head_of_second = chunks[1]["text"].split()[:overlap]
    assert tail_of_first == head_of_second

def test_cosine_similarity_identical():
    '''A vector compared with itself scores 1.0 (within 1e-6).'''
    unit_x = np.array([1.0, 0.0, 0.0])
    score = cosine_similarity(unit_x, unit_x)
    assert abs(score - 1.0) < 1e-6

def test_cosine_similarity_orthogonal():
    '''Perpendicular vectors score 0.0 (within 1e-6).'''
    x_axis, y_axis = np.array([1.0, 0.0]), np.array([0.0, 1.0])
    score = cosine_similarity(x_axis, y_axis)
    assert abs(score) < 1e-6

def test_chunk_text_empty():
    '''Empty input produces an empty list of chunks.'''
    chunks = chunk_text("", chunk_size=512)
    assert chunks == []

def test_chunk_text_overlap_too_large():
    '''An overlap larger than the chunk size is rejected with ValueError.'''
    invalid_settings = {"chunk_size": 10, "overlap": 15}
    with pytest.raises(ValueError):
        chunk_text("some text", **invalid_settings)

Layer 2: Integration Tests with Real API

python
import pytest

@pytest.mark.integration
def test_embedding_returns_correct_shape():
    '''Embedding one document gives one successful result of length 1536.'''
    from src.embedder import BatchEmbedder

    documents = ["test document"]
    outcome = BatchEmbedder(model="text-embedding-3-small").embed(documents)

    assert len(outcome) == 1
    first = outcome[0]
    assert first.success
    assert len(first.embedding) == 1536

@pytest.mark.integration
def test_rag_returns_grounded_answer():
    '''Check that a RAG answer is non-empty, cites sources, and states the known fact.'''
    from src.rag_pipeline import RAGPipeline

    response = RAGPipeline().query("What is our return policy?")

    answer = response["answer"]
    assert answer
    assert len(response["sources"]) > 0
    # "30 days" is a fact known to be present in the test knowledge base.
    assert "30 days" in answer

Layer 3: LLM Output Quality Tests

python
import pytest
from openai import OpenAI

# Shared client, constructed once at import time and used by llm_judge.
client = OpenAI()

def llm_judge(question: str, answer: str, criterion: str) -> bool:
    '''Ask GPT-4o-mini whether an answer satisfies a criterion.

    Sends one user message containing the criterion, the question and the
    answer, instructs the model to respond with only YES or NO, and uses
    temperature 0.

    Args:
        question: The question that was asked.
        answer: The answer under evaluation.
        criterion: The requirement the answer is judged against.

    Returns:
        True only when the reply starts with "yes" (case-insensitive,
        surrounding whitespace ignored). Anything else, including an
        empty or missing reply, is treated as a failed judgement.
    '''
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content":
            f'Does this answer satisfy the criterion "{criterion}"?\n'
            f'Question: {question}\n'
            f'Answer: {answer}\n'
            f'Respond with only YES or NO.'
        }],
        temperature=0,
    )
    # The message content may be None; treat that as an empty reply
    # instead of raising AttributeError on .lower().
    reply = response.choices[0].message.content or ""
    # Match the leading token rather than any substring: a reply such as
    # "No, the answer never says yes" must not count as a pass.
    return reply.strip().lower().startswith("yes")

@pytest.mark.llm_eval
@pytest.mark.parametrize("question,criterion", [
    ("What is RAG?", "mentions retrieval and generation"),
    ("How does fine-tuning work?", "mentions model weights or training"),
    ("What is a transformer?", "mentions attention mechanism"),
])
def test_answer_quality(question: str, criterion: str):
    '''The pipeline's answer to each question must pass llm_judge for its criterion.'''
    # NOTE(review): call_rag_pipeline is not defined in this snippet;
    # presumably a project helper that returns the answer text - confirm.
    answer = call_rag_pipeline(question)
    verdict = llm_judge(question, answer, criterion)
    assert verdict, (
        f"Answer failed criterion: {criterion}\nAnswer: {answer}"
    )

Layer 4: Regression Testing (Golden Set)

python
# Golden regression cases. Each entry holds a question, the phrases
# expected in its answer ("must_contain"), and optionally phrases that
# should be absent from it ("must_not_contain").
GOLDEN_SET = [
    {"question": "What is the refund policy?",
     "must_contain": ["30 days"],
     "must_not_contain": ["competitor", "sorry, I don't know"]},
    {"question": "How do I reset my password?",
     "must_contain": ["email", "link"]},
]

def run_regression_suite(golden_set=None, pipeline=None) -> dict:
    '''Run each golden-set question through the pipeline and tally the results.

    An item passes when its answer contains every "must_contain" phrase and
    none of its "must_not_contain" phrases. Matching is case-insensitive.
    Only the first violation per item is recorded, and required phrases
    are checked before forbidden ones.

    Args:
        golden_set: Optional iterable of case dicts with a "question" key
            and optional "must_contain" / "must_not_contain" phrase lists.
            Defaults to the module-level GOLDEN_SET.
        pipeline: Optional callable taking a question string and returning
            the answer string. Defaults to call_pipeline.

    Returns:
        A dict with "passed" and "failed" counts and a "failures" list.
        Each failure holds the "question" plus either "missing" (a
        required phrase that was absent) or "forbidden" (a banned phrase
        that was present).
    '''
    cases = GOLDEN_SET if golden_set is None else golden_set
    ask = call_pipeline if pipeline is None else pipeline

    results = {"passed": 0, "failed": 0, "failures": []}
    for item in cases:
        # Lower-case the answer once instead of once per phrase.
        haystack = ask(item["question"]).lower()

        missing = next(
            (p for p in item.get("must_contain", []) if p.lower() not in haystack),
            None,
        )
        # Forbidden phrases were previously declared in the golden set but
        # never checked, so they could not fail the suite.
        forbidden = next(
            (p for p in item.get("must_not_contain", []) if p.lower() in haystack),
            None,
        )

        if missing is not None:
            results["failed"] += 1
            results["failures"].append({"question": item["question"], "missing": missing})
        elif forbidden is not None:
            results["failed"] += 1
            results["failures"].append({"question": item["question"], "forbidden": forbidden})
        else:
            results["passed"] += 1
    return results

CI/CD Integration

yaml
# .github/workflows/test.yml
# Steps excerpt: unit tests first, then the steps that exercise the pipeline.
- name: Unit tests
  run: pytest tests/unit/ -v

- name: Integration tests
  # --timeout is provided by the pytest-timeout plugin, which must be installed.
  run: pytest tests/integration/ -m integration --timeout=30
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

- name: Regression suite
  # NOTE(review): no OPENAI_API_KEY is passed to this step - confirm the
  # script does not need one.
  run: python scripts/run_regression.py
  # Fails CI if regression score drops > 5%

Testing budget: Unit tests (free, run on every commit), integration tests (cheap, run on PR), LLM-as-judge quality tests (moderate cost, run nightly), full RAGAS evaluation (most expensive, run weekly).