Monday, August 11, 2025

LLM as a Judge EXPLAINED! 🏆 Fair AI Rankings with BTL, Elo & Bias Busting Secrets!

 

Content Summary

🔥 Learn how to make Large Language Models (LLMs) your ultimate fair judges!
In this step-by-step tutorial, we’ll go from beginner-friendly basics to research-grade techniques for building an unbiased, mathematically grounded evaluation pipeline.

You’ll learn:

  • What LLM-as-a-Judge is and why it’s a game-changer for model evaluation.

  • Bradley–Terry–Luce (BTL) for global rankings from pairwise matches.

  • Elo Rating for live, online leaderboards.

  • Wilson Score Confidence Interval to measure ranking reliability.

  • Bias detection & mitigation — position bias, verbosity bias, self-enhancement, and more.

  • Working Python Code using the Groq API with llama3-70b-8192.

  • How to combine BTL + Elo + Wilson CI in a complete evaluation pipeline.

  • Visual explanations, analogies, and a flowchart for your own projects.
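
Before the full pipeline, here is a tiny, self-contained sketch of two of these building blocks: the Elo expected-score formula and the Wilson confidence interval. The numbers are purely illustrative (not taken from a real run), and the helper names are just for this sketch; the working code below implements the same formulas alongside BTL.

import math

def elo_expected(Ra, Rb):
    # Probability that A beats B under the Elo model
    return 1.0 / (1.0 + 10 ** ((Rb - Ra) / 400.0))

def wilson_interval(wins, n, z=1.96):
    # Wilson score interval for an observed win rate
    p = wins / n
    denom = 1 + z**2 / n
    center = (p + z**2 / (2 * n)) / denom
    rad = z * math.sqrt((p * (1 - p) + z**2 / (4 * n)) / n) / denom
    return center - rad, center + rad

print(elo_expected(1500, 1400))  # ~0.64: a 100-point Elo gap means roughly a 64% expected win rate
print(wilson_interval(8, 10))    # ~(0.49, 0.94): 8 wins out of 10 still leaves a wide interval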

Working Code:

import os
import json
import math
import time
import random
import statistics
import dataclasses
from typing import Dict, List, Tuple, Any, Optional
import requests

# -------- Groq API Config --------
USE_REAL_LLM = True # Set False for mock/test (no network)
ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
MODEL = "llama3-70b-8192" # Or "llama3-8b-8192"
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "Use your own key here")

# ---------- Utilities ----------

def seed_everything(seed: int = 123):
    random.seed(seed)

def wilson_ci(wins: int, n: int, z: float = 1.96) -> Tuple[float, float]:
    if n == 0:
        return (0.0, 1.0)
    p = wins / n
    denom = 1.0 + z**2 / n
    num = p + z**2 / (2 * n)
    rad = z * math.sqrt((p * (1 - p) + z**2 / (4 * n)) / n)
    lo = (num - rad) / denom
    hi = (num + rad) / denom
    return (max(0.0, lo), min(1.0, hi))

# ---------- Judge prompt ----------

JUDGE_SYSTEM_PROMPT = """You are a strict, fair evaluation judge.
Follow the rubric exactly. Do NOT reward verbosity.
If both candidates are poor or indistinguishable, output TIE.
Always produce VALID JSON ONLY (no extra text)."""

def build_pairwise_user_prompt(task: str, rubric: Dict[str, str], candA: str, candB: str) -> str:
    rubric_lines = "\n".join([f"- {k} (0-5): {v}" for k, v in rubric.items()])
    prompt = f"""TASK:
{task}

RUBRIC:
{rubric_lines}

CANDIDATE A:
{candA}

CANDIDATE B:
{candB}

INSTRUCTIONS:
1) Briefly justify per dimension (1–2 sentences each).
2) Output JSON ONLY with fields:
{{
"scores": {{
"A": {{"Correctness": x, "Faithfulness": x, "Completeness": x, "Clarity": x, "Safety": x}},
"B": {{"Correctness": x, "Faithfulness": x, "Completeness": x, "Clarity": x, "Safety": x}}
}},
"winner": "A" | "B" | "TIE",
"rationale": "<1-3 sentence summary>"
}}
Note: x must be numbers in [0,5]."""
    return prompt

def default_rubric() -> Dict[str, str]:
    return {
        "Correctness": "Factually correct and logically sound.",
        "Faithfulness": "Grounded in the given input/context; no hallucinations.",
        "Completeness": "Covers all requested aspects and edge cases.",
        "Clarity": "Clear, concise, well-structured writing.",
        "Safety": "No policy-violating or harmful content.",
    }

# ---------- Groq LLM call ----------

class LLMError(Exception):
    pass

def call_groq_chat(messages: List[Dict[str, str]], temperature: float = 0.0, max_retries: int = 3, timeout: int = 60) -> str:
    if not USE_REAL_LLM:
        return mock_llm(messages)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}",
    }
    payload = {
        "model": MODEL,
        "messages": messages,
        "temperature": temperature,
        "response_format": {"type": "text"},
    }

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.post(ENDPOINT, headers=headers, json=payload, timeout=timeout)
            if resp.status_code != 200:
                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
                time.sleep(1.2 * attempt)  # simple linear backoff
                continue
            data = resp.json()
            content = data["choices"][0]["message"]["content"]
            return content
        except Exception as e:
            last_error = str(e)
            time.sleep(1.2 * attempt)

    raise LLMError(f"Groq API failed after {max_retries} attempts. Last error: {last_error}")

def mock_llm(messages: List[Dict[str, str]]) -> str:
    # Offline stand-in for the judge: prefers the candidate that mentions causes ("because") more often.
    user_msg = [m for m in messages if m["role"] == "user"][-1]["content"]
    a_start = user_msg.find("CANDIDATE A:")
    b_start = user_msg.find("CANDIDATE B:")
    a_text = user_msg[a_start:b_start]
    b_text = user_msg[b_start:]
    a_because = a_text.lower().count("because")
    b_because = b_text.lower().count("because")

    if a_because == b_because:
        winner = "TIE"
    else:
        winner = "A" if a_because > b_because else "B"

    j = {
        "scores": {
            "A": {"Correctness": 4, "Faithfulness": 3, "Completeness": 3, "Clarity": 5, "Safety": 5},
            "B": {"Correctness": 5, "Faithfulness": 5, "Completeness": 4, "Clarity": 4, "Safety": 5},
        },
        "winner": winner,
        "rationale": "Choice based on cause-focused content; minor clarity tradeoffs."
    }
    return json.dumps(j)

# ---------- Parsing & validation ----------

def parse_judge_json(s: str) -> Dict[str, Any]:
    try:
        obj = json.loads(s)
    except json.JSONDecodeError as e:
        # Fall back to extracting the outermost {...} block if the judge added extra text.
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end != -1 and end > start:
            obj = json.loads(s[start:end + 1])
        else:
            raise ValueError(f"Judge did not return valid JSON. Raw:\n{s[:500]}") from e

    if "scores" not in obj or "winner" not in obj:
        raise ValueError(f"Missing keys in judge JSON. Got keys: {list(obj.keys())}")

    if obj["winner"] not in ["A", "B", "TIE"]:
        raise ValueError(f"Invalid winner: {obj['winner']}")

    for side in ["A", "B"]:
        if side not in obj["scores"]:
            raise ValueError(f"Missing scores for {side}")
        for k in ["Correctness", "Faithfulness", "Completeness", "Clarity", "Safety"]:
            v = obj["scores"][side].get(k, None)
            if not isinstance(v, (int, float)):
                raise ValueError(f"Score for {side}.{k} must be a number, got {v}")
            if not (0 <= float(v) <= 5):
                raise ValueError(f"Score for {side}.{k} out of range [0,5]: {v}")

    return obj

# ---------- Judging core ----------

@dataclasses.dataclass
class JudgeResult:
    winner: str
    scores: Dict[str, Dict[str, float]]
    rationale: str
    order: Tuple[str, str]

def judge_pair(task: str, rubric: Dict[str, str], A: str, B: str,
               temperature: float = 0.0, swap: bool = False) -> JudgeResult:
    # Optionally swap candidate positions to control for position bias.
    candA, candB = (B, A) if swap else (A, B)
    order = ("B", "A") if swap else ("A", "B")

    messages = [
        {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
        {"role": "user", "content": build_pairwise_user_prompt(task, rubric, candA, candB)},
    ]
    raw = call_groq_chat(messages, temperature=temperature)
    obj = parse_judge_json(raw)

    if swap:
        # Map the verdict back to the original (unswapped) labels.
        winner_map = {"A": "B", "B": "A", "TIE": "TIE"}
        obj["winner"] = winner_map[obj["winner"]]
        obj["scores"] = {"A": obj["scores"]["B"], "B": obj["scores"]["A"]}

    rationale = obj.get("rationale", "")
    return JudgeResult(winner=obj["winner"], scores=obj["scores"], rationale=rationale, order=order)

def run_pairwise_trials(task: str, rubric: Dict[str, str], A: str, B: str,
                        trials: int = 10, seed: int = 123) -> List[JudgeResult]:
    seed_everything(seed)
    results: List[JudgeResult] = []
    for t in range(trials):
        swap = (random.random() < 0.5)  # randomize presentation order each trial
        res = judge_pair(task, rubric, A, B, temperature=0.0, swap=swap)
        results.append(res)
    return results

# ---------- Aggregation: Elo & Bradley–Terry ----------

def elo_update(Ra: float, Rb: float, Sa: float, K: float = 16.0) -> Tuple[float, float]:
    # Expected score for A, then the standard Elo update for both players.
    Ea = 1.0 / (1.0 + 10 ** ((Rb - Ra) / 400.0))
    Ra_new = Ra + K * (Sa - Ea)
    Rb_new = Rb + K * ((1 - Sa) - (1 - Ea))
    return Ra_new, Rb_new

def aggregate_elo(results: List[JudgeResult], R_init: float = 1500.0, K: float = 16.0) -> Tuple[float, float]:
    Ra, Rb = R_init, R_init
    for r in results:
        if r.winner == "A":
            Sa = 1.0
        elif r.winner == "B":
            Sa = 0.0
        else:
            Sa = 0.5  # a tie counts as half a win
        Ra, Rb = elo_update(Ra, Rb, Sa, K=K)
    return Ra, Rb

def fit_btl_from_pairwise(results: List[JudgeResult]) -> Tuple[float, float]:
    # Closed-form BTL fit for two items with +0.5 smoothing (ties are ignored).
    w_AB = sum(1 for r in results if r.winner == "A")
    w_BA = sum(1 for r in results if r.winner == "B")
    n = w_AB + w_BA
    if n == 0:
        return 0.0, 0.0
    num = w_BA + 0.5
    den = w_AB + 0.5
    delta = math.log(num / den)
    beta_A = -0.5 * delta
    beta_B = +0.5 * delta
    return beta_A, beta_B

# ---------- Reporting ----------

@dataclasses.dataclass
class PairwiseReport:
    n: int
    wins_A: int
    wins_B: int
    ties: int
    winrate_B: float
    ci_B: Tuple[float, float]
    elo_A: float
    elo_B: float
    btl_A: float
    btl_B: float

def summarize_results(results: List[JudgeResult]) -> PairwiseReport:
    n = len(results)
    wins_A = sum(1 for r in results if r.winner == "A")
    wins_B = sum(1 for r in results if r.winner == "B")
    ties = n - wins_A - wins_B
    winrate_B = 0.0 if (wins_A + wins_B) == 0 else wins_B / (wins_A + wins_B)
    ci_B = wilson_ci(wins_B, wins_A + wins_B)
    elo_A, elo_B = aggregate_elo(results, R_init=1500.0, K=16.0)
    btl_A, btl_B = fit_btl_from_pairwise(results)
    return PairwiseReport(n, wins_A, wins_B, ties, winrate_B, ci_B, elo_A, elo_B, btl_A, btl_B)

def print_report(report: PairwiseReport, example_rationales: List[str]):
    print("\n=== Pairwise Judge Summary ===")
    print(f"Total trials: {report.n}")
    print(f"Wins A / Wins B / Ties: {report.wins_A} / {report.wins_B} / {report.ties}")
    wr = f"{100*report.winrate_B:.1f}%" if report.wins_A + report.wins_B > 0 else "NA"
    print(f"Win rate (B over A): {wr} (Wilson 95% CI: [{100*report.ci_B[0]:.1f}%, {100*report.ci_B[1]:.1f}%])")
    print(f"Elo ratings: A={report.elo_A:.1f}, B={report.elo_B:.1f}")
    print(f"BTL latent scores: A={report.btl_A:.3f}, B={report.btl_B:.3f}\n")
    if example_rationales:
        print("Example judge rationales:")
        for r in example_rationales[:3]:
            print(f"- {r}")

# ---------- Example tasks & runner ----------

EXAMPLE_TASK = (
"Summarize the paragraph in 1–2 sentences focusing on the *causes*."
)
EXAMPLE_PARAGRAPH = (
"Yesterday, heavy rain flooded several city streets. "
"Drainage systems were clogged due to poor maintenance. "
"As a result, traffic delays lasted for hours."
)
CANDIDATE_A = "Several city streets flooded yesterday, causing long traffic delays."
CANDIDATE_B = "Streets flooded yesterday because drainage systems were poorly maintained and clogged, causing long delays."

def evaluate_one_pair(task: str, context: Optional[str], A: str, B: str,
                      trials: int = 10, seed: int = 123) -> None:
    rubric = default_rubric()
    task_full = f"{task}\n\nCONTEXT:\n{context}" if context else task
    print("\n" + "="*80)
    print(f"Task:\n{task_full}\n")
    print("Candidate A:\n", A, "\n")
    print("Candidate B:\n", B, "\n")
    results = run_pairwise_trials(task_full, rubric, A, B, trials=trials, seed=seed)
    report = summarize_results(results)
    rationales = [r.rationale for r in results if r.rationale][:5]
    print_report(report, rationales)
    print("="*80 + "\n")

if __name__ == "__main__":
    USE_REAL_LLM = True  # flip to False to use the offline mock judge
    evaluate_one_pair(EXAMPLE_TASK, EXAMPLE_PARAGRAPH, CANDIDATE_A, CANDIDATE_B, trials=6, seed=42)
    print("Done.")

References:

  1. Zheng, L. et al. (2023). “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena.” https://arxiv.org/abs/2306.05685
  2. Shi, L. et al. (2024). “Judging the Judges: A Systematic Study of Position Bias in LLM-as-a-Judge.” https://arxiv.org/abs/2406.07791
  3. Gu, J. et al. (2024). “A Survey on LLM-as-a-Judge.” https://arxiv.org/abs/2411.15594
  4. Bradley, R. A., & Terry, M. E. (1952). “Rank Analysis of Incomplete Block Designs: The Method of Paired Comparisons.” Biometrika. https://academic.oup.com/biomet/article-abstract/39/3-4/324/326091
  5. Elo, A. E. (1978). The Rating of Chessplayers, Past and Present. https://archive.org/details/ratingofchesspla00unse
  6. Wilson, E. B. (1927). “Probable Inference, the Law of Succession, and Statistical Inference.” JASA. PDF: https://www.med.mcgill.ca/epidemiology/Hanley/bios601/Proportion/wilson_jasa_1927.pdf
  7. Huang, T.-K., & Lin, C.-J. (2006). “Generalized Bradley-Terry Models and Multi-class Probability Estimates.” https://www.csie.ntu.edu.tw/~cjlin/papers/generalBT.pdf
  8. Li, D. et al. (2024). “From Generation to Judgment: Opportunities and Challenges of LLM-as-a-Judge.” (survey) https://arxiv.org/abs/2411.16594

Sunday, July 20, 2025

Next-Gen AI: Multi-Agent LLMs and Policy Gradient RL (Explained)

Introduction. 

Artificial Intelligence (AI) is moving beyond single-task chatbots and into a future where multiple smart agents work together—and learn from their experiences. This new wave of AI is powered by Multi-Agent Large Language Models (LLMs) and Reinforcement Learning (RL). Let’s break down what this means, and why it matters for everyone.

What Are Multi-Agent LLMs?

If you’ve ever chatted with an AI like ChatGPT or Google Gemini, you’ve experienced a single “agent” at work. But imagine if you had a whole team of AI experts—each with a different specialty—collaborating to answer your questions or solve your problems.

That’s what Multi-Agent LLMs are: several AI “personalities” (like a general doctor, a specialist, and a risk manager) working together. They can ask each other questions, give advice, and debate the best answer—just like a real-world panel of experts.

What Is Reinforcement Learning?

Reinforcement Learning (RL) is how AI learns by doing. The AI agent tries actions, gets feedback (rewards for good decisions, penalties for mistakes), and gradually figures out the smartest way to act. It’s like how we learn new skills—trial and error, over many attempts.
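
In code, "learning from rewards" usually starts by turning a stream of rewards into a single learning signal. Here is a tiny illustration of the discounted return used by the REINFORCE loop in the code below; the helper name and the reward numbers are made up for this example.

def discounted_return(rewards, gamma=0.99):
    # Later rewards count slightly less than immediate ones (gamma < 1)
    G = 0.0
    for r in reversed(rewards):
        G = r + gamma * G
    return G

# e.g. a costly test, a prescription, then a correct diagnosis
print(discounted_return([-2, -2, 10]))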

Why Combine Them?

When you combine the “brainpower” of multiple LLM agents with RL’s ability to learn from experience, you get something powerful:

  • The AI agent learns to use advice from different experts, not just rely on one.

  • Over time, it gets better at making complex decisions—whether it’s diagnosing patients, handling business workflows, or answering tough questions.

  • The teamwork approach makes the system more robust, explainable, and safe.

A Real Example

In a recent AI project, we trained an agent to diagnose patient cases. It didn’t just rely on one answer—instead, it asked three LLM advisors (each playing a different medical role) for opinions, then decided what to do. As it learned from rewards and mistakes, its accuracy went up. That’s the magic of next-gen AI: collaborative, continuously learning, and smarter with every step.

Tutorial.


Code:

import os
import numpy as np
import random
import keras
from keras import layers
from keras.optimizers import Adam
from keras.models import Model
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import requests
import time
import tensorflow as tf

# -------- Groq API Config --------
USE_REAL_LLM = True # Set False for mock/test
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "Use your key")
ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
MODEL = "llama3-70b-8192" # Or "llama3-8b-8192"

N_EPISODES = 10 # Lower for demo, increase for more training
EMBED_DIM = 32
N_ADVISORS = 3
N_ACTIONS = 5
GAMMA = 0.99

# ---- Synthetic Patient Dataset ----
patient_cases = [
    (1, 1, 1, 'flu'),
    (1, 0, 0, 'cold'),
    (0, 1, 0, 'cold'),
    (1, 1, 0, 'flu'),
    (1, 0, 1, 'flu'),
    (0, 1, 1, 'flu'),
    (0, 0, 0, 'cold'),
    (1, 0, 0, 'cold'),
    (0, 0, 1, 'cold'),
]

def sample_case():
    return random.choice(patient_cases)

# ---- Real Groq LLM API Adapter ----
def query_llm_groq(prompt, personality_name):
    if not USE_REAL_LLM:
        # Canned offline responses so the demo runs without network access.
        if personality_name == "Internist":
            return "Stepwise testing is safest; treat if strong evidence only."
        elif personality_name == "Specialist":
            return "Rule out severe cases, do broad diagnostics."
        elif personality_name == "Generalist":
            return "Prioritize patient comfort and minimal intervention."
        else:
            return "No specific advice."
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}"
    }
    system_prompt = f"You are a {personality_name} medical advisor. Return a one-sentence actionable recommendation for the case."
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 50,
        "temperature": 0.3,
        "n": 1
    }
    for attempt in range(3):  # Retry on error
        try:
            response = requests.post(ENDPOINT, headers=headers, json=payload, timeout=20)
            if response.status_code == 200:
                out = response.json()
                return out['choices'][0]['message']['content'].strip()
            else:
                print(f"Groq LLM error code {response.status_code}, retrying...")
                time.sleep(2)
        except Exception as e:
            print(f"Groq Exception: {e}, retrying...")
            time.sleep(2)
    return "[LLM Error or Timeout]"

# --- LLM Advisors (Groq + Llama3, with negotiation) ---
def get_advisors(state, prev_advices=None):
    personalities = ["Internist", "Specialist", "Generalist"]
    advices = []
    for idx, personality in enumerate(personalities):
        prompt = f"Patient symptoms: fever={state[0]}, cough={state[1]}, risk factors={state[2]}."
        if prev_advices:
            prompt += f" Previous advisor opinions: {' | '.join(prev_advices)}"
            prompt += " Revise or comment if needed."
        advice = query_llm_groq(prompt, personality)
        advices.append(advice)
    return advices

# --- Patient Environment ---
class PatientEnv:
    def reset(self):
        fever, cough, risk, diag = sample_case()
        self.state = [fever, cough, risk]
        self.true_diagnosis = diag
        return np.array(self.state, dtype=np.float32), diag

    def step(self, action):
        reward = 0; done = False
        if action == 0: reward = -2  # order test
        elif action == 1:  # diagnose cold
            if self.true_diagnosis == 'cold': reward = 10; done = True
            else: reward = -10; done = True
        elif action == 2:  # diagnose flu
            if self.true_diagnosis == 'flu': reward = 10; done = True
            else: reward = -10; done = True
        elif action == 3: reward = -2  # prescribe
        elif action == 4: reward = 0; done = True  # refer
        else: reward = -5
        return reward, done

# --- Embedding Model ---
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def embed_sentences(sentences):
    arr = embedder.encode(sentences)
    if arr.shape[1] > EMBED_DIM:
        arr = arr[:, :EMBED_DIM]  # truncate embeddings to the demo dimension
    return arr

# --- Keras 3 RL Policy Network with Attention ---
def build_policy_network(state_dim, emb_dim, n_advisors, n_actions):
    state_in = keras.Input(shape=(state_dim,), name="state")
    advisor_emb_in = keras.Input(shape=(n_advisors, emb_dim), name="advisor_emb")
    x = layers.TimeDistributed(layers.Dense(emb_dim, activation='relu'))(advisor_emb_in)
    attn_scores = layers.TimeDistributed(layers.Dense(1))(x)
    attn_scores_flat = layers.Flatten()(attn_scores)
    attn_weights = layers.Activation('softmax', name='attn_weights')(attn_scores_flat)
    attn_weights_exp = layers.Reshape((n_advisors, 1))(attn_weights)
    advisor_context = layers.Dot(axes=1)([attn_weights_exp, x])
    advisor_context = layers.Flatten()(advisor_context)
    concat = layers.Concatenate()([state_in, advisor_context])
    dense = layers.Dense(64, activation='relu')(concat)
    out = layers.Dense(n_actions, activation='softmax')(dense)
    model = keras.Model([state_in, advisor_emb_in], out)
    # Separate model for extracting attention weights over advisors
    attn_model = Model([state_in, advisor_emb_in], attn_weights)
    return model, attn_model

# --- Training Loop: REINFORCE Policy Gradient ---
env = PatientEnv()
policy_net, attn_model = build_policy_network(3, EMBED_DIM, N_ADVISORS, N_ACTIONS)
optimizer = Adam(learning_rate=1e-3)

reward_history = []
for episode in range(N_EPISODES):
    state, diag = env.reset()
    episode_logprobs = []
    episode_rewards = []
    done = False
    step = 0
    while not done:
        # Advisors: two rounds so they can react to each other's opinions (negotiation)
        advices = get_advisors(state)
        advices = get_advisors(state, advices)
        advisor_embs = embed_sentences(advices)
        advisor_embs = advisor_embs[np.newaxis, ...]
        state_batch = state[np.newaxis, ...]
        # Policy step
        probs = policy_net([state_batch, advisor_embs]).numpy()[0]
        probs = probs.astype(np.float64)
        probs = probs / probs.sum()  # guard against float32 rounding for np.random.choice
        action = np.random.choice(N_ACTIONS, p=probs)
        # Log-prob for policy gradient
        logprob = np.log(probs[action] + 1e-8)
        episode_logprobs.append(logprob)
        # Step in env
        reward, done = env.step(action)
        episode_rewards.append(reward)
        step += 1

    # --- Policy gradient update (REINFORCE) ---
    returns = []
    G = 0
    for r in reversed(episode_rewards):
        G = r + GAMMA * G
        returns.insert(0, G)
    returns = np.array(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize

    # Policy loss (single freshly sampled step, for demo)
    with tf.GradientTape() as tape:
        state, _ = env.reset()
        advices = get_advisors(state)
        advices = get_advisors(state, advices)
        advisor_embs = embed_sentences(advices)
        advisor_embs = advisor_embs[np.newaxis, ...]
        state_batch = state[np.newaxis, ...]
        probs = policy_net([state_batch, advisor_embs], training=True)[0]
        loss = -tf.math.log(probs[action] + 1e-8) * returns[0]
    grads = tape.gradient(loss, policy_net.trainable_weights)
    optimizer.apply_gradients(zip(grads, policy_net.trainable_weights))

    reward_history.append(np.sum(episode_rewards))

    # Print logs
    if episode < 3 or episode % 5 == 0:
        action_names = ["Order test", "Diagnose cold", "Diagnose flu", "Prescribe", "Refer"]
        attn_vals = attn_model([state_batch, advisor_embs]).numpy()[0]
        top_advisor = np.argmax(attn_vals)
        print(f"\n--- Episode {episode} ---")
        print(f"Patient: fever={state[0]}, cough={state[1]}, risk={state[2]}, true_diag={diag}")
        for i, a in enumerate(advices):
            print(f"Advisor {i+1}: {a}")
        print(f"Agent chose: {action_names[action]} (Reward: {reward})")
        print(f"Attention: Advisor {top_advisor+1} most influential ({attn_vals[top_advisor]:.2f})")

# --- Plot reward vs. episode ---
plt.plot(reward_history)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.title("Keras 3 RL Agent + Groq Llama3 LLM Advisors: Reward vs. Episode")
plt.show()

Reference:

  1. Yao, S., Zhao, X., et al. "Tree of Thoughts: Deliberate Problem Solving with Large Language Models," arXiv preprint arXiv:2305.10601, 2023.
  2. Guohao Li, Hasan Abed Al Kader Hammoud, Hani Itani, Dmitrii Khizbullin, Bernard Ghanem. "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society," arXiv preprint arXiv:2303.17760, 2023.
  3. Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Beibin Li, Erkang Zhu, Li Jiang, Xiaoyun Zhang, Shaokun Zhang, Jiale Liu, Ahmed Hassan Awadallah, Ryen W White, Doug Burger, Chi Wang. "AutoGen: Enabling next-generation multi-agent LLM applications," arXiv preprint arXiv:2308.08155, 2023.
  4. Sutton, R. S., & Barto, A. G. "Reinforcement Learning: An Introduction," 2nd Edition, MIT Press, 2018.
  5. Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, Anima Anandkumar. "Voyager: An Open-Ended Embodied Agent with Large Language Models," arXiv preprint arXiv:2305.16291, 2023.
  6. Noah Shinn, Federico Cassano, Edward Berman, Ashwin Gopinath, Karthik Narasimhan, Shunyu Yao. "Reflexion: Language Agents with Verbal Reinforcement Learning," arXiv preprint arXiv:2303.11366, 2023.

Saturday, July 12, 2025

AI vs Human Writing: Robust Hybrid LLM-Aided Detection (source code)

Introduction.

Can you really tell whether a text was written by a human or by ChatGPT? In this step-by-step tutorial, discover the secrets of hybrid AI detection: combining advanced statistical analysis with the power of Large Language Models (LLMs) to confidently distinguish human writing from AI-generated content.

You’ll learn:

  • How semantic, structural, and entropy features reveal AI text.

  • Why LLM meta-classification (self-consistency voting) beats single-method detection.

  • How adversarial tricks try to fool detectors, and how to spot them.
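
Before the full implementation, the core decision rule can be sketched in a few lines: squash the statistical evidence through a sigmoid, blend it with the LLM's probability, and compare against a threshold. The helper name hybrid_verdict and the numbers below are illustrative only; the RHLAD code that follows computes the real scores from text features and self-consistency voting.

import math

def hybrid_verdict(stat_score, llm_prob, alpha=0.5, beta=0.5, threshold=0.7):
    # stat_score: raw statistical evidence (higher = more AI-like), squashed to [0, 1]
    stat_prob = 1 / (1 + math.exp(-stat_score))
    combined = alpha * llm_prob + beta * stat_prob
    return ("AI" if combined > threshold else "Human"), combined

print(hybrid_verdict(2.0, 0.9))   # strong agreement -> ('AI', ~0.89)
print(hybrid_verdict(-1.0, 0.4))  # both lean human -> ('Human', ~0.33)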

Working Code.

import os
import groq
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.util import ngrams
import re

nltk.download('punkt')

client = groq.Groq(api_key=os.getenv("GROQ_API_KEY", "Input Your Key"))

############################
# Statistical Feature Extraction
############################

def split_paragraphs(text):
    return [p.strip() for p in text.split('\n') if p.strip()]

def sentences(text):
    return nltk.sent_tokenize(text)

def complex_sentence_ratio(text, threshold=20):
    sents = sentences(text)
    return sum(1 for s in sents if len(s.split()) > threshold) / max(1, len(sents))

def entropy(ngram_list):
    total = sum(ngram_list.values())
    if total == 0:
        return 0.0
    probs = np.array(list(ngram_list.values())) / total
    return -np.sum(probs * np.log2(probs + 1e-12))

def get_entropy(text, n):
    words = nltk.word_tokenize(text)
    if len(words) < n:
        return 0.0
    ngrams_list = list(ngrams(words, n))
    counts = Counter(ngrams_list)
    return entropy(counts)

def semantic_consistency(text, model):
    chunks = split_paragraphs(text)
    if len(chunks) < 2:
        return 0.5  # fallback for short text
    embeddings = model.encode(chunks)
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] for i in range(len(embeddings)-1)]
    return float(np.mean(similarities))

def structural_complexity(text, L_ref=17, w1=0.3, w2=0.6, w3=0.1):
    sents = sentences(text)
    lens = [len(nltk.word_tokenize(s)) for s in sents]
    if not lens:
        return 0.0
    L_avg = np.mean(lens)
    L_var = np.var(lens)
    F_cmplx = complex_sentence_ratio(text)
    S_struc = w1*L_var + w2*F_cmplx - w3*abs(L_avg - L_ref)
    return float(S_struc)

def linguistic_entropy(text):
    H1 = get_entropy(text, 1)
    H2 = get_entropy(text, 2)
    H3 = get_entropy(text, 3)
    return float(np.mean([H1, H2, H3]))

def SSDD_score(text, model, alpha1=2, alpha2=1, alpha3=1):
    # Combine the three signals and squash through a sigmoid into P(AI).
    S_sem = semantic_consistency(text, model)
    S_struc = structural_complexity(text)
    S_entropy = linguistic_entropy(text)
    z = alpha1 * S_sem - alpha2 * S_struc - alpha3 * S_entropy
    stat_prob = 1 / (1 + np.exp(-z))
    return stat_prob, S_sem, S_struc, S_entropy

############################
# LLM Meta-Classification (with Self-Consistency)
############################

LLM_VOTING_PROMPTS = [
"Here is a text sample:\n{text}\n\nDo you think this was written by a human or by an AI assistant (like AI or Llama3)? Please respond with:\n- Answer: [Human/AI]\n- Probability: [0.0 - 1.0]\n- Explanation: [your explanation]",
"Read the following sample and estimate if it was written by a person or a language model like AI/Llama. Reply only as:\nAnswer: [Human/AI]\nProbability: [number]\nExplanation: [reason]\n\nSample:\n{text}",
"Given this passage, tell me if it's most likely AI or human generated. Provide your guess, a probability (0-1), and your main reason.\n\n{text}"
]

def parse_llm_response(output):
    # Try to robustly parse: Answer, Probability, Explanation
    try:
        answer_match = re.search(r'Answer:\s*(AI|Human)', output, re.IGNORECASE)
        prob_match = re.search(r'Probability:\s*([0-9.]+)', output)
        explanation_match = re.search(r'Explanation:(.*)', output, re.DOTALL | re.IGNORECASE)
        llm_pred = "AI" if answer_match and "AI" in answer_match.group(1).upper() else "Human"
        llm_prob = float(prob_match.group(1)) if prob_match else 0.5
        explanation = explanation_match.group(1).strip() if explanation_match else output.strip()
    except Exception:
        llm_pred, llm_prob, explanation = "Unknown", 0.5, output.strip()
    return llm_pred, llm_prob, explanation

def llm_detection_self_consistency(text, model_name="llama3-70b-8192", n_prompts=3):
    llm_probs, llm_preds, explanations = [], [], []
    for i in range(n_prompts):
        prompt = LLM_VOTING_PROMPTS[i % len(LLM_VOTING_PROMPTS)].format(text=text)
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300,
                temperature=0
            )
            output = response.choices[0].message.content
        except Exception as ex:
            output = "Answer: Unknown\nProbability: 0.5\nExplanation: API error: " + str(ex)
        llm_pred, llm_prob, explanation = parse_llm_response(output)
        llm_probs.append(llm_prob)
        llm_preds.append(llm_pred)
        explanations.append(explanation)
    # Majority vote for the label, average probability across prompts
    avg_prob = np.mean(llm_probs)
    maj_pred = "AI" if llm_preds.count("AI") >= n_prompts//2 + 1 else "Human"
    concat_explanation = "\n\n".join([f"Prompt {i+1}: {exp}" for i, exp in enumerate(explanations)])
    return maj_pred, avg_prob, concat_explanation, llm_probs, llm_preds

############################
# Robust Hybrid Ensemble Detection (RHLAD)
############################

def adaptive_threshold(stat_probs, base=0.7):
    # Set threshold to max(base, mean + std/2): more robust on real-world text
    if not stat_probs:
        return base
    return float(max(base, np.mean(stat_probs) + np.std(stat_probs)/2))

def RHLAD_analyze(texts, model, alpha=0.5, beta=0.5, llm_model="llama3-70b-8192"):
    results = []
    stat_probs_all = []
    # Precompute all statistical scores for thresholding
    for text in texts:
        stat_P_AI, _, _, _ = SSDD_score(text, model)
        stat_probs_all.append(stat_P_AI)
    threshold = adaptive_threshold(stat_probs_all, base=0.7)
    for idx, text in enumerate(texts):
        stat_P_AI, S_sem, S_struc, S_entropy = SSDD_score(text, model)
        llm_pred, llm_prob, explanation, llm_probs_v, llm_preds_v = llm_detection_self_consistency(
            text, model_name=llm_model, n_prompts=3
        )
        combined_score = alpha * llm_prob + beta * stat_P_AI
        prediction = "AI" if combined_score > threshold else "Human"
        # Adversarial defense: if the statistical score is extremely high (>0.9)
        # but the LLM disagrees, flag the sample as suspicious.
        adversarial_flag = (stat_P_AI > 0.9 and llm_prob < 0.5)
        results.append({
            'index': idx,
            'stat_P_AI': stat_P_AI,
            'S_sem': S_sem,
            'S_struc': S_struc,
            'S_entropy': S_entropy,
            'llm_prob': llm_prob,
            'llm_probs_voting': llm_probs_v,
            'llm_preds_voting': llm_preds_v,
            'combined_score': combined_score,
            'prediction': prediction,
            'adversarial_flag': adversarial_flag,
            'llm_explanation': explanation,
            'threshold': threshold
        })
    return results

############################
# Sample Usage
############################

if __name__ == '__main__':
    texts = [
        # Human sample
        "When I woke up this morning, the sky was a pale blue and birds sang outside my window. I remembered my childhood days, full of laughter and chaos, and decided to write a letter to my old friend.",
        # AI-generated sample
        "Artificial intelligence, particularly language models like AI, have transformed the way we interact with technology. These models are trained on vast amounts of text data and can generate human-like responses to a wide range of queries.",
        # Paraphrased AI (tries to fool the system)
        "Leveraging massive datasets, today's AI models craft responses that feel increasingly human. Our interactions with technology have been revolutionized, thanks to these powerful language tools."
    ]
    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print("\nRunning Robust Hybrid LLM-Aided Detection (RHLAD) using Groq Llama3...")
    results = RHLAD_analyze(texts, model, alpha=0.5, beta=0.5, llm_model="llama3-70b-8192")

    for res in results:
        print(f"\nSample #{res['index']+1} — Final Prediction: {res['prediction']} (Combined={res['combined_score']:.2f})")
        print(f"  Statistical SSDD Score: {res['stat_P_AI']:.2f} (Semantic={res['S_sem']:.2f}, Structure={res['S_struc']:.2f}, Entropy={res['S_entropy']:.2f})")
        print(f"  LLM Meta Probability (avg voting): {res['llm_prob']:.2f}")
        print(f"  LLM Voting Details: {res['llm_preds_voting']}, Probs={res['llm_probs_voting']}")
        print(f"  Explanation:\n{res['llm_explanation']}")
        print(f"  Adaptive Threshold used: {res['threshold']:.2f}")
        if res['adversarial_flag']:
            print("  [!ADVERSARIAL WARNING!] — Statistical and LLM signals disagree: possible paraphrased AI.")
        print("-" * 70)


References.

  1. Solaiman, I., Brundage, M., Clark, J., et al. "Release Strategies and the Social Impacts of Language Models." arXiv preprint arXiv:1908.09203, 2019.
  2. Bakhtin, A., Deng, Y., Ott, M., et al. "Real or Fake? Learning to Discriminate Machine from Human Generated Text." arXiv preprint arXiv:1906.03351, 2019.
  3. Ippolito, D., Duckworth, D., Callison-Burch, C., & Eck, D. "Automatic Detection of Generated Text is Easiest when Humans are Fooled." Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), 2020, pp. 1808–1822.
  4. Jawahar, G., Sagot, B., & Seddah, D. "Automatic Detection of Machine Generated Text: A Critical Survey." arXiv preprint arXiv:2005.08512, 2020.
  5. Kreps, S., McCain, R. M., & Brundage, M. "All the News That's Fit to Fabricate: AI-Generated Text as a Tool of Media Misinformation." Journal of Experimental Political Science, vol. 10, no. 2, 2023, pp. 233–244.
  6. Mitchell, M., Wu, S., Zaldivar, A., Barnes, P., Vasserman, L., Hutchinson, B., Spitzer, E., Raji, I. D., & Gebru, T. "Model Cards for Model Reporting." Proceedings of the Conference on Fairness, Accountability, and Transparency (FAT '19), 2019, pp. 220–229.