Concept #54 · Hard · advanced-topics

How would you run a blind evaluation of two LLM models?

#gen-ai #evaluation

Answer

Blind Evaluation of Two LLM Models

A blind evaluation (a blinded, pairwise A/B comparison) presents outputs from two models without revealing which model produced which output, to eliminate bias. This is the gold standard for comparing LLM quality.

Why Blind Evaluation?

Evaluators — human or LLM — are biased toward outputs from models they believe are "better" (e.g. GPT-4 vs. an unknown model). Blind evaluation removes this bias by hiding model identity.

Method 1: Human Blind Evaluation

python
import random
import uuid
from typing import Literal

class BlindEvaluationFramework:
    '''Collect blind pairwise preferences between two models.

    Each question is answered by both models and the two answers are shown
    in random order as "response_1" / "response_2". The position-to-model
    mapping is kept on the instance and only resolved when a preference is
    recorded, so the evaluator never sees which model wrote which answer.

    Args:
        model_a_fn: Callable taking a question string and returning model A's response.
        model_b_fn: Callable taking a question string and returning model B's response.
    '''

    def __init__(self, model_a_fn, model_b_fn):
        self.models = {"A": model_a_fn, "B": model_b_fn}
        # One {"eval_id", "winner"} dict per recorded preference;
        # winner is "A", "B" or "tie".
        self.results = []
        # eval_id -> {"1": model_key, "2": model_key}. Must exist before
        # create_eval_pair() writes to it.
        self._mappings = {}

    def create_eval_pair(self, question: str) -> dict:
        '''Generate a response pair with randomised position.

        Returns:
            A dict with "eval_id", "question", "response_1" and "response_2".
            The mapping back to the models is deliberately not included.
        '''
        responses = {
            "A": self.models["A"](question),
            "B": self.models["B"](question),
        }

        # Randomise which is shown as "Response 1" vs "Response 2"
        eval_id = str(uuid.uuid4())
        if random.random() > 0.5:
            mapping = {"1": "A", "2": "B"}
        else:
            mapping = {"1": "B", "2": "A"}

        # Store mapping secretly
        self._mappings[eval_id] = mapping

        return {
            "eval_id": eval_id,
            "question": question,
            "response_1": responses[mapping["1"]],
            "response_2": responses[mapping["2"]],
        }

    def record_preference(self, eval_id: str, preferred: Literal["1", "2", "tie"]) -> None:
        '''Record the evaluator's choice for a previously created pair.

        Args:
            eval_id: The "eval_id" returned by create_eval_pair().
            preferred: "1" or "2" for the shown position, or "tie".

        Raises:
            KeyError: If eval_id was not produced by this instance.
            ValueError: If preferred is not "1", "2" or "tie".
        '''
        mapping = self._mappings[eval_id]
        if preferred == "tie":
            winner = "tie"
        elif preferred in mapping:
            winner = mapping[preferred]
        else:
            # A mistyped vote must not be silently counted as a tie.
            raise ValueError(
                f"preferred must be '1', '2' or 'tie', got {preferred!r}"
            )
        self.results.append({"eval_id": eval_id, "winner": winner})

    def compute_results(self) -> dict:
        '''Aggregate recorded preferences into win counts and win rates.

        Win rates are fractions of all recorded comparisons (ties included
        in the denominator) and are 0.0 when nothing has been recorded yet.
        '''
        wins_a = sum(1 for r in self.results if r["winner"] == "A")
        wins_b = sum(1 for r in self.results if r["winner"] == "B")
        ties = sum(1 for r in self.results if r["winner"] == "tie")
        total = len(self.results)

        return {
            "model_a_wins": wins_a,
            "model_b_wins": wins_b,
            "ties": ties,
            # Guard against ZeroDivisionError before any preference is recorded.
            "model_a_win_rate": wins_a / total if total else 0.0,
            "model_b_win_rate": wins_b / total if total else 0.0,
        }

Method 2: LLM-as-Judge (Automated Blind Eval)

python
from openai import OpenAI
import json

# Module-level OpenAI client, constructed with no explicit arguments and
# used by llm_blind_judge() for the judge calls.
client = OpenAI()

def llm_blind_judge(question: str, response_1: str, response_2: str) -> dict:
    '''Ask GPT-4o to choose between two anonymised responses.

    The judge only ever sees "Response 1" and "Response 2"; the caller is
    responsible for knowing which model produced which. The request uses
    JSON response mode at temperature 0, and the reply is returned as the
    parsed JSON object.
    '''
    judge_prompt = f'''You are evaluating two AI responses to a question. Judge which is better.

Question: {question}

Response 1:
{response_1}

Response 2:
{response_2}

Evaluate based on:
1. Accuracy and factual correctness
2. Completeness and helpfulness
3. Clarity and conciseness
4. Appropriate tone

Respond as JSON:
{{"winner": "1" or "2" or "tie",
  "reason": "brief explanation",
  "scores": {{"response_1": {{"accuracy": 1-5, "helpfulness": 1-5}},
              "response_2": {{"accuracy": 1-5, "helpfulness": 1-5}}}}}}'''

    # Single-turn conversation: the whole rubric lives in one user message.
    messages = [{"role": "user", "content": judge_prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw_verdict = completion.choices[0].message.content
    return json.loads(raw_verdict)


def run_automated_blind_eval(
    test_questions: list[str],
    model_a_fn,
    model_b_fn,
) -> dict:
    '''Full automated blind evaluation pipeline.

    For every question both models are queried, the two responses are shown
    to the LLM judge in random order, and the judge's positional verdict is
    mapped back to model "A" / "B" (anything else counts as a tie).

    Args:
        test_questions: Questions to evaluate both models on.
        model_a_fn: Callable taking a question and returning model A's response.
        model_b_fn: Callable taking a question and returning model B's response.

    Returns:
        A dict with "model_a_win_rate", "model_b_win_rate" and "tie_rate"
        (fractions of all comparisons, 0.0 when there are none), "p_value"
        (two-sided exact binomial test of A's wins among non-tied
        comparisons against 0.5; 1.0 when there are no non-tied
        comparisons), "statistically_significant" (p_value < 0.05) and
        "total_comparisons".
    '''
    judgements = []

    for question in test_questions:
        # Generate responses
        response_a = model_a_fn(question)
        response_b = model_b_fn(question)

        # Randomise order to prevent position bias
        if random.random() > 0.5:
            r1, r2, map_ = response_a, response_b, {"1": "A", "2": "B"}
        else:
            r1, r2, map_ = response_b, response_a, {"1": "B", "2": "A"}

        # Judge
        judgement = llm_blind_judge(question, r1, r2)
        # str() so a judge that answers with the JSON number 1 / 2 is still
        # attributed to a model instead of silently becoming a tie.
        winner_model = map_.get(str(judgement.get("winner")), "tie")

        judgements.append({
            "question": question,
            "winner": winner_model,
            "reason": judgement.get("reason", ""),
        })

    wins_a = sum(1 for j in judgements if j["winner"] == "A")
    wins_b = sum(1 for j in judgements if j["winner"] == "B")
    ties = sum(1 for j in judgements if j["winner"] == "tie")
    total = len(judgements)
    decisive = wins_a + wins_b  # ties carry no information for the sign test

    # Statistical significance (exact binomial test)
    if decisive:
        # binom_test() was removed from SciPy; binomtest() is its replacement
        # and returns a result object carrying the p-value.
        from scipy.stats import binomtest
        p_value = float(binomtest(wins_a, decisive, 0.5).pvalue)
    else:
        # binomtest() requires n >= 1; with no decisive comparison there is
        # no evidence against the null hypothesis.
        p_value = 1.0

    def _rate(count: int) -> float:
        # Avoid ZeroDivisionError on an empty question list.
        return count / total if total else 0.0

    return {
        "model_a_win_rate": _rate(wins_a),
        "model_b_win_rate": _rate(wins_b),
        "tie_rate": _rate(ties),
        "p_value": p_value,
        "statistically_significant": p_value < 0.05,
        "total_comparisons": total,
    }

Statistical Significance

python
# binom_test() was removed from SciPy; binomtest() is its replacement and
# returns a result object whose .pvalue holds the p-value.
from scipy.stats import binomtest

# With 100 comparisons: Model A wins 65, Model B wins 35
p = binomtest(65, 100, 0.5, alternative='two-sided').pvalue
print(f"p-value: {p:.4f}")  # ~0.0035 (exact two-sided test) — statistically significant!
print(f"Significant at p<0.05: {p < 0.05}")

# Rule of thumb: need at least 50-100 comparisons for statistical power

Sample Size Requirements

| Desired Win Rate Difference | Min Comparisons Needed |
|-----------------------------|------------------------|
| 70% vs 30%                  | ~30                    |
| 60% vs 40%                  | ~100                   |
| 55% vs 45%                  | ~400                   |
| 52% vs 48%                  | ~2,500                 |

Evaluation Dimensions

Structure your evaluation questions to cover:

  • Factual accuracy — questions with known correct answers
  • Reasoning — multi-step problems
  • Format compliance — instructions requiring specific output format
  • Edge cases — ambiguous, trick, or out-of-scope queries
  • Domain coverage — representative sample of real user queries

Best practice: Use a stratified sample of your actual production queries as the test set — not hand-crafted examples. Real production diversity reveals true model differences.