Monday, August 11, 2025

LLM as a Judge EXPLAINED! 🏆 Fair AI Rankings with BTL, Elo & Bias Busting Secrets!

 

Content Summary

🔥 Learn how to make Large Language Models (LLMs) your ultimate fair judges!
In this step-by-step tutorial, we’ll go from beginner-friendly basics to research-grade techniques for building an unbiased, mathematically grounded evaluation pipeline.

You’ll learn:

  • What LLM-as-a-Judge is and why it’s a game-changer for model evaluation.

  • Bradley–Terry–Luce (BTL) for global rankings from pairwise matches.

  • Elo Rating for live, online leaderboards.

  • Wilson Score Confidence Interval to measure ranking reliability (the BTL, Elo, and Wilson CI formulas are sketched right after this list).

  • Bias detection & mitigation — position bias, verbosity bias, self-enhancement, and more.

  • Working Python Code using the Groq API with llama3-70b-8192.

  • How to combine BTL + Elo + Wilson CI in a complete evaluation pipeline (a multi-model extension sketch follows the working code below).

  • Visual explanations, analogies, and a flowchart for your own projects.
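
For quick reference, the three tools above rest on these standard formulas, written the same way the code below uses them (base-10 Elo with a 400-point scale; z = 1.96 for a 95% Wilson interval):

  % BTL: probability that candidate i beats candidate j, given latent strengths beta
  P(i \succ j) = \frac{e^{\beta_i}}{e^{\beta_i} + e^{\beta_j}}

  % Elo: expected score for A against B, then the K-factor rating update
  E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}, \qquad R_A \leftarrow R_A + K\,(S_A - E_A)

  % Wilson 95% score interval for a win rate p = wins / n over decisive trials
  \frac{p + \frac{z^2}{2n} \pm z \sqrt{\frac{p(1-p)}{n} + \frac{z^2}{4n^2}}}{1 + \frac{z^2}{n}}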

Working Code:

import os
import json
import math
import time
import random
import statistics
import dataclasses
from typing import Dict, List, Tuple, Any, Optional
import requests

# -------- Groq API Config --------
USE_REAL_LLM = True # Set False for mock/test (no network)
ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
MODEL = "llama3-70b-8192" # Or "llama3-8b-8192"
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "Use your own key here")

# ---------- Utilities ----------

def seed_everything(seed: int = 123):
    random.seed(seed)

def wilson_ci(wins: int, n: int, z: float = 1.96) -> Tuple[float, float]:
    # Wilson score interval for a binomial win rate; z = 1.96 gives a 95% CI.
    if n == 0:
        return (0.0, 1.0)
    p = wins / n
    denom = 1.0 + z**2 / n
    num = p + z**2 / (2 * n)
    rad = z * math.sqrt((p * (1 - p) + z**2 / (4 * n)) / n)
    lo = (num - rad) / denom
    hi = (num + rad) / denom
    return (max(0.0, lo), min(1.0, hi))

# ---------- Judge prompt ----------

JUDGE_SYSTEM_PROMPT = """You are a strict, fair evaluation judge.
Follow the rubric exactly. Do NOT reward verbosity.
If both candidates are poor or indistinguishable, output TIE.
Always produce VALID JSON ONLY (no extra text)."""

def build_pairwise_user_prompt(task: str, rubric: Dict[str, str], candA: str, candB: str) -> str:
    rubric_lines = "\n".join([f"- {k} (0-5): {v}" for k, v in rubric.items()])
    prompt = f"""TASK:
{task}

RUBRIC:
{rubric_lines}

CANDIDATE A:
{candA}

CANDIDATE B:
{candB}

INSTRUCTIONS:
1) Briefly justify per dimension (1–2 sentences each).
2) Output JSON ONLY with fields:
{{
  "scores": {{
    "A": {{"Correctness": x, "Faithfulness": x, "Completeness": x, "Clarity": x, "Safety": x}},
    "B": {{"Correctness": x, "Faithfulness": x, "Completeness": x, "Clarity": x, "Safety": x}}
  }},
  "winner": "A" | "B" | "TIE",
  "rationale": "<1-3 sentence summary>"
}}
Note: x must be numbers in [0,5]."""
    return prompt

def default_rubric() -> Dict[str, str]:
    return {
        "Correctness": "Factually correct and logically sound.",
        "Faithfulness": "Grounded in the given input/context; no hallucinations.",
        "Completeness": "Covers all requested aspects and edge cases.",
        "Clarity": "Clear, concise, well-structured writing.",
        "Safety": "No policy-violating or harmful content.",
    }

# ---------- Groq LLM call ----------

class LLMError(Exception):
    pass

def call_groq_chat(messages: List[Dict[str, str]], temperature: float = 0.0, max_retries: int = 3, timeout: int = 60) -> str:
    if not USE_REAL_LLM:
        return mock_llm(messages)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}",
    }
    payload = {
        "model": MODEL,
        "messages": messages,
        "temperature": temperature,
        "response_format": {"type": "text"},
    }

    # Retry on HTTP errors or network exceptions, backing off a little longer each attempt.
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.post(ENDPOINT, headers=headers, json=payload, timeout=timeout)
            if resp.status_code != 200:
                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
                time.sleep(1.2 * attempt)
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return content
        except Exception as e:
            last_error = str(e)
            time.sleep(1.2 * attempt)

    raise LLMError(f"Groq API failed after {max_retries} attempts. Last error: {last_error}")

def mock_llm(messages: List[Dict[str, str]]) -> str:
    # Offline stand-in for the judge: favors the candidate that mentions causes ("because") more often.
    user_msg = [m for m in messages if m["role"] == "user"][-1]["content"]
    a_start = user_msg.find("CANDIDATE A:")
    b_start = user_msg.find("CANDIDATE B:")
    a_text = user_msg[a_start:b_start]
    b_text = user_msg[b_start:]
    a_because = a_text.lower().count("because")
    b_because = b_text.lower().count("because")

    if a_because == b_because:
        winner = "TIE"
    else:
        winner = "A" if a_because > b_because else "B"

    j = {
        "scores": {
            "A": {"Correctness": 4, "Faithfulness": 3, "Completeness": 3, "Clarity": 5, "Safety": 5},
            "B": {"Correctness": 5, "Faithfulness": 5, "Completeness": 4, "Clarity": 4, "Safety": 5},
        },
        "winner": winner,
        "rationale": "Choice based on cause-focused content; minor clarity tradeoffs."
    }
    return json.dumps(j)

# ---------- Parsing & validation ----------

def parse_judge_json(s: str) -> Dict[str, Any]:
    try:
        obj = json.loads(s)
    except json.JSONDecodeError as e:
        # Fallback: salvage the outermost {...} block if the judge wrapped the JSON in extra text.
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end != -1 and end > start:
            obj = json.loads(s[start:end + 1])
        else:
            raise ValueError(f"Judge did not return valid JSON. Raw:\n{s[:500]}") from e

    if "scores" not in obj or "winner" not in obj:
        raise ValueError(f"Missing keys in judge JSON. Got keys: {list(obj.keys())}")

    if obj["winner"] not in ["A", "B", "TIE"]:
        raise ValueError(f"Invalid winner: {obj['winner']}")

    for side in ["A", "B"]:
        if side not in obj["scores"]:
            raise ValueError(f"Missing scores for {side}")
        for k in ["Correctness", "Faithfulness", "Completeness", "Clarity", "Safety"]:
            v = obj["scores"][side].get(k, None)
            if not isinstance(v, (int, float)):
                raise ValueError(f"Score for {side}.{k} must be a number, got {v}")
            if not (0 <= float(v) <= 5):
                raise ValueError(f"Score for {side}.{k} out of range [0,5]: {v}")

    return obj

# ---------- Judging core ----------

@dataclasses.dataclass
class JudgeResult:
    winner: str
    scores: Dict[str, Dict[str, float]]
    rationale: str
    order: Tuple[str, str]

def judge_pair(task: str, rubric: Dict[str, str], A: str, B: str,
               temperature: float = 0.0, swap: bool = False) -> JudgeResult:
    # Optionally swap the candidates' positions to counter position bias, then map the verdict back.
    candA, candB = (B, A) if swap else (A, B)
    order = ("B", "A") if swap else ("A", "B")

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": build_pairwise_user_prompt(task, rubric, candA, candB)},
    ]
    raw = call_groq_chat(messages, temperature=temperature)
    obj = parse_judge_json(raw)

    if swap:
        winner_map = {"A": "B", "B": "A", "TIE": "TIE"}
        obj["winner"] = winner_map[obj["winner"]]
        obj["scores"] = {"A": obj["scores"]["B"], "B": obj["scores"]["A"]}

    rationale = obj.get("rationale", "")
    return JudgeResult(winner=obj["winner"], scores=obj["scores"], rationale=rationale, order=order)

def run_pairwise_trials(task: str, rubric: Dict[str, str], A: str, B: str,
                        trials: int = 10, seed: int = 123) -> List[JudgeResult]:
    seed_everything(seed)
    results: List[JudgeResult] = []
    for t in range(trials):
        swap = (random.random() < 0.5)
        res = judge_pair(task, rubric, A, B, temperature=0.0, swap=swap)
        results.append(res)
    return results

# ---------- Aggregation: Elo & Bradley–Terry ----------

def elo_update(Ra: float, Rb: float, Sa: float, K: float = 16.0) -> Tuple[float, float]:
    # Expected score from the base-10 logistic curve (400-point scale), then the standard K-factor update.
    Ea = 1.0 / (1.0 + 10 ** ((Rb - Ra) / 400.0))
    Ra_new = Ra + K * (Sa - Ea)
    Rb_new = Rb + K * ((1 - Sa) - (1 - Ea))
    return Ra_new, Rb_new

def aggregate_elo(results: List[JudgeResult], R_init: float = 1500.0, K: float = 16.0) -> Tuple[float, float]:
    Ra, Rb = R_init, R_init
    for r in results:
        if r.winner == "A":
            Sa = 1.0
        elif r.winner == "B":
            Sa = 0.0
        else:
            Sa = 0.5
        Ra, Rb = elo_update(Ra, Rb, Sa, K=K)
    return Ra, Rb

def fit_btl_from_pairwise(results: List[JudgeResult]) -> Tuple[float, float]:
    # Closed-form two-candidate BTL fit: the log win-odds split symmetrically, with +0.5 smoothing.
    w_AB = sum(1 for r in results if r.winner == "A")
    w_BA = sum(1 for r in results if r.winner == "B")
    n = w_AB + w_BA
    if n == 0:
        return 0.0, 0.0
    num = w_BA + 0.5
    den = w_AB + 0.5
    delta = math.log(num / den)
    beta_A = -0.5 * delta
    beta_B = +0.5 * delta
    return beta_A, beta_B

# ---------- Reporting ----------

@dataclasses.dataclass
class PairwiseReport:
    n: int
    wins_A: int
    wins_B: int
    ties: int
    winrate_B: float
    ci_B: Tuple[float, float]
    elo_A: float
    elo_B: float
    btl_A: float
    btl_B: float

def summarize_results(results: List[JudgeResult]) -> PairwiseReport:
    n = len(results)
    wins_A = sum(1 for r in results if r.winner == "A")
    wins_B = sum(1 for r in results if r.winner == "B")
    ties = n - wins_A - wins_B
    # Win rate and its Wilson CI are computed over decisive (non-tie) trials only.
    winrate_B = 0.0 if (wins_A + wins_B) == 0 else wins_B / (wins_A + wins_B)
    ci_B = wilson_ci(wins_B, wins_A + wins_B)
    elo_A, elo_B = aggregate_elo(results, R_init=1500.0, K=16.0)
    btl_A, btl_B = fit_btl_from_pairwise(results)
    return PairwiseReport(n, wins_A, wins_B, ties, winrate_B, ci_B, elo_A, elo_B, btl_A, btl_B)

def print_report(report: PairwiseReport, example_rationales: List[str]):
    print("\n=== Pairwise Judge Summary ===")
    print(f"Total trials: {report.n}")
    print(f"Wins A / Wins B / Ties: {report.wins_A} / {report.wins_B} / {report.ties}")
    wr = f"{100*report.winrate_B:.1f}%" if report.wins_A + report.wins_B > 0 else "NA"
    print(f"Win rate (B over A): {wr} (Wilson 95% CI: [{100*report.ci_B[0]:.1f}%, {100*report.ci_B[1]:.1f}%])")
    print(f"Elo ratings: A={report.elo_A:.1f}, B={report.elo_B:.1f}")
    print(f"BTL latent scores: A={report.btl_A:.3f}, B={report.btl_B:.3f}\n")
    if example_rationales:
        print("Example judge rationales:")
        for r in example_rationales[:3]:
            print(f"- {r}")

# ---------- Example tasks & runner ----------

EXAMPLE_TASK = (
    "Summarize the paragraph in 1–2 sentences focusing on the *causes*."
)
EXAMPLE_PARAGRAPH = (
    "Yesterday, heavy rain flooded several city streets. "
    "Drainage systems were clogged due to poor maintenance. "
    "As a result, traffic delays lasted for hours."
)
CANDIDATE_A = "Several city streets flooded yesterday, causing long traffic delays."
CANDIDATE_B = "Streets flooded yesterday because drainage systems were poorly maintained and clogged, causing long delays."

def evaluate_one_pair(task: str, context: Optional[str], A: str, B: str,
                      trials: int = 10, seed: int = 123) -> None:
    rubric = default_rubric()
    task_full = f"{task}\n\nCONTEXT:\n{context}" if context else task
    print("\n" + "="*80)
    print(f"Task:\n{task_full}\n")
    print("Candidate A:\n", A, "\n")
    print("Candidate B:\n", B, "\n")
    results = run_pairwise_trials(task_full, rubric, A, B, trials=trials, seed=seed)
    report = summarize_results(results)
    rationales = [r.rationale for r in results if r.rationale][:5]
    print_report(report, rationales)
    print("="*80 + "\n")

if __name__ == "__main__":
    USE_REAL_LLM = True  # Flip to False to use the offline mock judge instead of the Groq API.
    evaluate_one_pair(EXAMPLE_TASK, EXAMPLE_PARAGRAPH, CANDIDATE_A, CANDIDATE_B, trials=6, seed=42)
    print("Done.")

References:

  1. Zheng, L. et al. (2023). “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena.” https://arxiv.org/abs/2306.05685
  2. Shi, L. et al. (2024). “Judging the Judges: A Systematic Study of Position Bias in LLM-as-a-Judge.” https://arxiv.org/abs/2406.07791
  3. Gu, J. et al. (2024). “A Survey on LLM-as-a-Judge.” https://arxiv.org/abs/2411.15594
  4. Bradley, R. A., & Terry, M. E. (1952). “Rank Analysis of Incomplete Block Designs: The Method of Paired Comparisons.” Biometrika. https://academic.oup.com/biomet/article-abstract/39/3-4/324/326091
  5. Elo, A. E. (1978). The Rating of Chessplayers, Past and Present. https://archive.org/details/ratingofchesspla00unse
  6. Wilson, E. B. (1927). “Probable Inference, the Law of Succession, and Statistical Inference.” JASA. https://www.med.mcgill.ca/epidemiology/Hanley/bios601/Proportion/wilson_jasa_1927.pdf
  7. Huang, T.-K., & Lin, C.-J. (2006). “Generalized Bradley–Terry Models and Multi-class Probability Estimates.” https://www.csie.ntu.edu.tw/~cjlin/papers/generalBT.pdf
  8. Li, D. et al. (2024). “From Generation to Judgment: Opportunities and Challenges of LLM-as-a-Judge.” https://arxiv.org/abs/2411.16594