Saturday, July 12, 2025

AI vs. Human Writing: Robust Hybrid LLM-Aided Detection (with Source Code)

Introduction.

Can you really tell whether a text was written by a human or by ChatGPT? In this step-by-step tutorial, discover the secrets of hybrid AI detection: combining statistical analysis with the power of Large Language Models (LLMs) to confidently distinguish human writing from AI-generated content. You'll learn:

  1. How semantic, structural, and entropy features reveal AI text
  2. Why LLM meta-classification (self-consistency voting) beats single-method detection
  3. How adversarial tricks try to fool detectors, and how to spot them
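Before diving into the full script, here is the core decision rule in miniature: a statistical probability (from the SSDD features defined below) and an LLM vote are blended with weights alpha and beta, then compared against a threshold. This is a minimal illustrative sketch using the same default weights (0.5/0.5) and base threshold (0.7) as the full code; hybrid_verdict is a hypothetical helper, not part of the script itself.

def hybrid_verdict(stat_prob, llm_prob, alpha=0.5, beta=0.5, threshold=0.7):
    # Weighted blend of the LLM vote and the statistical SSDD probability.
    combined = alpha * llm_prob + beta * stat_prob
    return "AI" if combined > threshold else "Human"

print(hybrid_verdict(stat_prob=0.8, llm_prob=0.9))  # "AI", since 0.85 > 0.7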

Working Code.

import os
from groq import Groq
import numpy as np
import nltk
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.util import ngrams
import re

nltk.download('punkt')
nltk.download('punkt_tab')  # required by newer NLTK releases for tokenization

# Create the Groq client; it reads your key from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

############################
# Statistical Feature Extraction
############################

def split_paragraphs(text):
    return [p.strip() for p in text.split('\n') if p.strip()]

def sentences(text):
    return nltk.sent_tokenize(text)

def complex_sentence_ratio(text, threshold=20):
    sents = sentences(text)
    return sum(1 for s in sents if len(s.split()) > threshold) / max(1, len(sents))

def entropy(ngram_counts):
    # Shannon entropy (in bits) of an n-gram frequency distribution (a Counter).
    total = sum(ngram_counts.values())
    if total == 0:
        return 0.0
    probs = np.array(list(ngram_counts.values())) / total
    return -np.sum(probs * np.log2(probs + 1e-12))

def get_entropy(text, n):
    words = nltk.word_tokenize(text)
    if len(words) < n:
        return 0.0
    ngrams_list = list(ngrams(words, n))
    counts = Counter(ngrams_list)
    return entropy(counts)
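# Intuition check (illustrative, not part of the pipeline): repetitive text has
# low n-gram entropy because probability mass concentrates on a few n-grams.
# get_entropy("the cat sat on the mat the cat sat on the mat", 2) stays well
# below log2(11), the maximum its 11 bigrams could reach if all were distinct.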

def semantic_consistency(text, model):
    chunks = split_paragraphs(text)
    if len(chunks) < 2:
        return 0.5  # fallback for short text
    embeddings = model.encode(chunks)
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0]
                    for i in range(len(embeddings)-1)]
    return float(np.mean(similarities))

def structural_complexity(text, L_ref=17, w1=0.3, w2=0.6, w3=0.1):
    sents = sentences(text)
    lens = [len(nltk.word_tokenize(s)) for s in sents]
    if not lens:
        return 0.0
    L_avg = np.mean(lens)
    L_var = np.var(lens)
    F_cmplx = complex_sentence_ratio(text)
    S_struc = w1*L_var + w2*F_cmplx - w3*abs(L_avg - L_ref)
    return float(S_struc)

def linguistic_entropy(text):
    H1 = get_entropy(text, 1)
    H2 = get_entropy(text, 2)
    H3 = get_entropy(text, 3)
    return float(np.mean([H1, H2, H3]))

def SSDD_score(text, model, alpha1=2, alpha2=1, alpha3=1):
    S_sem = semantic_consistency(text, model)
    S_struc = structural_complexity(text)
    S_entropy = linguistic_entropy(text)
    # High semantic consistency pushes toward AI; high structural complexity
    # and lexical entropy push toward human. The sigmoid maps the score to (0, 1).
    z = alpha1 * S_sem - alpha2 * S_struc - alpha3 * S_entropy
    stat_prob = 1 / (1 + np.exp(-z))
    return stat_prob, S_sem, S_struc, S_entropy
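# Standalone usage sketch (assumes the same MiniLM model loaded in the main
# block below):
#   model = SentenceTransformer('all-MiniLM-L6-v2')
#   stat_prob, s_sem, s_struc, s_ent = SSDD_score(text, model)
# A stat_prob near 1.0 means the features look AI-like: high cross-paragraph
# semantic consistency combined with low structural complexity and entropy.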

############################
# LLM Meta-Classification (with Self-Consistency)
############################

LLM_VOTING_PROMPTS = [
    "Here is a text sample:\n{text}\n\nDo you think this was written by a human or by an AI assistant (like ChatGPT or Llama 3)? Please respond with:\n- Answer: [Human/AI]\n- Probability: [0.0 - 1.0]\n- Explanation: [your explanation]",
    "Read the following sample and estimate if it was written by a person or a language model like ChatGPT/Llama. Reply only as:\nAnswer: [Human/AI]\nProbability: [number]\nExplanation: [reason]\n\nSample:\n{text}",
    "Given this passage, tell me if it's most likely AI or human generated. Provide your guess, a probability (0-1), and your main reason.\n\n{text}"
]

def parse_llm_response(output):
    # Robustly parse the Answer / Probability / Explanation fields.
    try:
        answer_match = re.search(r'Answer:\s*(AI|Human)', output, re.IGNORECASE)
        prob_match = re.search(r'Probability:\s*([0-9.]+)', output)
        explanation_match = re.search(r'Explanation:(.*)', output, re.DOTALL | re.IGNORECASE)
        llm_pred = "AI" if answer_match and "AI" in answer_match.group(1).upper() else "Human"
        llm_prob = float(prob_match.group(1)) if prob_match else 0.5
        explanation = explanation_match.group(1).strip() if explanation_match else output.strip()
    except Exception:
        llm_pred, llm_prob, explanation = "Unknown", 0.5, output.strip()
    return llm_pred, llm_prob, explanation
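# Parsing example: for the output
#   "Answer: AI\nProbability: 0.85\nExplanation: Uniform sentence rhythm."
# parse_llm_response returns ("AI", 0.85, "Uniform sentence rhythm.").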

def llm_detection_self_consistency(text, model_name="llama3-70b-8192", n_prompts=3):
    llm_probs, llm_preds, explanations = [], [], []
    for i in range(n_prompts):
        prompt = LLM_VOTING_PROMPTS[i % len(LLM_VOTING_PROMPTS)].format(text=text)
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300,
                temperature=0
            )
            output = response.choices[0].message.content
        except Exception as ex:
            output = "Answer: Unknown\nProbability: 0.5\nExplanation: API error: " + str(ex)
        llm_pred, llm_prob, explanation = parse_llm_response(output)
        llm_probs.append(llm_prob)
        llm_preds.append(llm_pred)
        explanations.append(explanation)
    # Majority vote on the label; average the probabilities across prompts.
    avg_prob = np.mean(llm_probs)
    maj_pred = "AI" if llm_preds.count("AI") >= n_prompts//2 + 1 else "Human"
    concat_explanation = "\n\n".join([f"Prompt {i+1}: {exp}" for i, exp in enumerate(explanations)])
    return maj_pred, avg_prob, concat_explanation, llm_probs, llm_preds
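# With n_prompts=3, a label needs at least 3//2 + 1 = 2 votes to win the
# majority, so a single dissenting prompt cannot flip the prediction.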

############################
# Robust Hybrid Ensemble Detection (RHLAD)
############################

def adaptive_threshold(stat_probs, base=0.7):
    # Set the threshold to max(base, mean + std/2): more robust on real-world text.
    if not stat_probs:
        return base
    return float(max(base, np.mean(stat_probs) + np.std(stat_probs)/2))
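# Worked example: for stat_probs = [0.3, 0.5, 0.9], the mean is ~0.57 and the
# (population) std is ~0.25, so mean + std/2 is ~0.69 and the 0.7 base floor
# wins; uniformly high scores would instead push the threshold above the base.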

def RHLAD_analyze(texts, model, alpha=0.5, beta=0.5, llm_model="llama3-70b-8192"):
    results = []
    stat_probs_all = []
    # Precompute all statistical scores for adaptive thresholding.
    for text in texts:
        stat_P_AI, _, _, _ = SSDD_score(text, model)
        stat_probs_all.append(stat_P_AI)
    threshold = adaptive_threshold(stat_probs_all, base=0.7)
    for idx, text in enumerate(texts):
        stat_P_AI, S_sem, S_struc, S_entropy = SSDD_score(text, model)
        llm_pred, llm_prob, explanation, llm_probs_v, llm_preds_v = llm_detection_self_consistency(
            text, model_name=llm_model, n_prompts=3
        )
        combined_score = alpha * llm_prob + beta * stat_P_AI
        prediction = "AI" if combined_score > threshold else "Human"
        # Adversarial defense: if the statistical score is extremely high (>0.9),
        # flag the sample as suspicious even if the LLM disagrees.
        adversarial_flag = (stat_P_AI > 0.9 and llm_prob < 0.5)
        results.append({
            'index': idx,
            'stat_P_AI': stat_P_AI,
            'S_sem': S_sem,
            'S_struc': S_struc,
            'S_entropy': S_entropy,
            'llm_prob': llm_prob,
            'llm_probs_voting': llm_probs_v,
            'llm_preds_voting': llm_preds_v,
            'combined_score': combined_score,
            'prediction': prediction,
            'adversarial_flag': adversarial_flag,
            'llm_explanation': explanation,
            'threshold': threshold
        })
    return results

############################
# Sample Usage
############################

if __name__ == '__main__':
    texts = [
        # Human sample
        "When I woke up this morning, the sky was a pale blue and birds sang outside my window. I remembered my childhood days, full of laughter and chaos, and decided to write a letter to my old friend.",
        # AI-generated sample
        "Artificial intelligence, particularly language models like AI, have transformed the way we interact with technology. These models are trained on vast amounts of text data and can generate human-like responses to a wide range of queries.",
        # Paraphrased AI (tries to fool the system)
        "Leveraging massive datasets, today's AI models craft responses that feel increasingly human. Our interactions with technology have been revolutionized, thanks to these powerful language tools."
    ]
    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print("\nRunning Robust Hybrid LLM-Aided Detection (RHLAD) using Groq Llama3...")
    results = RHLAD_analyze(texts, model, alpha=0.5, beta=0.5, llm_model="llama3-70b-8192")

    for res in results:
        print(f"\nSample #{res['index']+1} — Final Prediction: {res['prediction']} (Combined={res['combined_score']:.2f})")
        print(f"  Statistical SSDD Score: {res['stat_P_AI']:.2f} (Semantic={res['S_sem']:.2f}, Structure={res['S_struc']:.2f}, Entropy={res['S_entropy']:.2f})")
        print(f"  LLM Meta Probability (avg voting): {res['llm_prob']:.2f}")
        print(f"  LLM Voting Details: {res['llm_preds_voting']}, Probs={res['llm_probs_voting']}")
        print(f"  Explanation:\n{res['llm_explanation']}")
        print(f"  Adaptive Threshold used: {res['threshold']:.2f}")
        if res['adversarial_flag']:
            print("  [!ADVERSARIAL WARNING!] — Statistical and LLM signals disagree: possible paraphrased AI.")
        print("-" * 70)


