Video Content Summary
This video explains a cutting-edge AI framework that allows large language models (LLMs) to automatically improve their own prompts. We'll break down how this works using a simple sentiment analysis example, showing how an average prompt can be transformed into a highly specific and effective one without any manual effort.
Discover how this "self-optimizing" loop, powered by a meta-prompt and a feedback system, can achieve better results than prompts written by human experts. We'll also dive into the latest research in this field, exploring how this concept is paving the way for a new era of AI systems that can continuously improve themselves.
Hashtags
#AI #LLM #PromptEngineering #SelfImprovingAI #AIautomation #MachineLearning #DeepLearning #TechExplained #ArtificialIntelligence #GoogleDeepMind
Tutorial:
Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
OPRO-style prompt optimization (sentiment classification) using a Groq-hosted Llama 3.1 model (default: llama-3.1-8b-instant; override with GROQ_MODEL).
Fixes vs previous version:
- RateLimiter ensures a minimum gap between API calls (prevents "too quick" bursts)
- Robust retry with exponential backoff + jitter on 429/5xx
- Friendly 400 error messages with full Groq error text
- Safer defaults for max_tokens and proposals_per_round
- Optional sleeps inside scoring to further smooth request cadence
Usage:
pip install requests
export GROQ_API_KEY=...
python opro_sentiment_groq.py
"""
import os
import time
import json
import re
import random
import requests
from dataclasses import dataclass
from typing import List, Dict, Tuple, Any
# -----------------------------
# Configuration for Groq API
# -----------------------------
@dataclass
class GroqConfig:
    """Connection, pacing and retry settings for the Groq chat-completions API.

    Each default is taken from the named environment variable when this class
    body is executed (i.e. at import time); pass explicit values to override.
    """

    # API key sent as a Bearer token. Defaults to "" (not a placeholder
    # sentence) when GROQ_API_KEY is unset, so an emptiness check on the key
    # fails fast instead of sending a bogus credential to the API.
    api_key: str = os.getenv("GROQ_API_KEY", "")
    # Chat-completions URL.
    endpoint: str = os.getenv("GROQ_ENDPOINT", "https://api.groq.com/openai/v1/chat/completions")
    # Model identifier placed in every request payload.
    model: str = os.getenv("GROQ_MODEL", "llama-3.1-8b-instant")
    # Per-request timeout handed to the HTTP client.
    timeout: int = int(os.getenv("GROQ_TIMEOUT", "60"))

    # Rate/Retry tuning
    min_interval_s: float = float(os.getenv("GROQ_MIN_INTERVAL_S", "0.8"))  # min gap between calls
    max_retries: int = int(os.getenv("GROQ_MAX_RETRIES", "5"))
    backoff_base_s: float = float(os.getenv("GROQ_BACKOFF_BASE_S", "0.8"))
    backoff_jitter_s: float = float(os.getenv("GROQ_BACKOFF_JITTER_S", "0.4"))
# -----------------------------
# Simple rate limiter
# -----------------------------
class RateLimiter:
    """Enforce a minimum time gap between successive ``wait()`` calls.

    Intervals are measured with ``time.monotonic()`` so that wall-clock
    adjustments (NTP steps, manual changes) cannot shorten or stretch the gap.
    """

    def __init__(self, min_interval_s: float = 0.8):
        self.min_interval_s = min_interval_s
        # -inf means "never called": the first wait() never sleeps, whatever
        # the monotonic clock's (arbitrary) starting value is.
        self._last = float("-inf")

    def wait(self):
        """Sleep just long enough to keep ``min_interval_s`` since the last call."""
        remaining = self.min_interval_s - (time.monotonic() - self._last)
        if remaining > 0:
            time.sleep(remaining)
        # Stamp after sleeping so the next gap is measured from the release time.
        self._last = time.monotonic()
# -----------------------------
# Groq Client (OpenAI-compatible)
# -----------------------------
class GroqClient:
    """Small client for Groq's OpenAI-compatible chat-completions endpoint.

    Requests are spaced by a RateLimiter, retried with capped exponential
    backoff on network errors and retryable HTTP statuses, and failures are
    surfaced with the server's own error text where available.
    """

    def __init__(self, cfg: "GroqConfig"):
        """Store the config, validate the key, and open an HTTP session.

        Raises:
            RuntimeError: if ``cfg.api_key`` is empty.
        """
        self.cfg = cfg
        if not self.cfg.api_key:
            raise RuntimeError("GROQ_API_KEY is required.")
        # Ensure endpoint points to /chat/completions.
        # NOTE: this rewrites the endpoint on the config object passed in.
        if not self.cfg.endpoint.endswith("/chat/completions"):
            self.cfg.endpoint = self.cfg.endpoint.rstrip("/") + "/chat/completions"
        self.session = requests.Session()
        self.rate = RateLimiter(self.cfg.min_interval_s)

    def _headers(self):
        """Build the HTTP headers sent with every request."""
        return {
            "Authorization": f"Bearer {self.cfg.api_key}",
            "Content-Type": "application/json",
            "User-Agent": "opro-groq-example/1.0",
        }

    @staticmethod
    def _error_message(response):
        """Best-effort error text for a non-200 response.

        Returns the ``error.message`` field of the JSON body if present, else
        the parsed body itself, else the raw response text when the body is
        not JSON or has an unexpected shape.
        """
        try:
            err = response.json()
            return err.get("error", {}).get("message") or err
        except Exception:
            return response.text

    def chat(self, messages: List[Dict[str, str]], temperature: float = 0.2,
             max_tokens: int = 128, top_p: float = 1.0, stop: List[str] = None) -> str:
        """
        Resilient chat with:
        - min interval spacing
        - retries with exponential backoff on 429/5xx
        - friendly 400 error messages (no blind tracebacks)

        Args:
            messages: Non-empty list of chat messages, sent as-is.
            temperature: Sampling temperature.
            max_tokens: Completion token limit; must be > 0.
            top_p: Nucleus sampling parameter.
            stop: Optional stop sequences; omitted from the payload when falsy.

        Returns:
            The first choice's message content, stripped of surrounding whitespace.

        Raises:
            ValueError: on invalid arguments, or on an HTTP 400 from the API.
            RuntimeError: on network failure or a retryable status once the
                retry budget is spent, on any other non-200 status, or on a
                200 response whose body is not JSON or lacks the expected fields.
        """
        payload = {
            "model": self.cfg.model,
            "messages": messages,
            "temperature": float(temperature),
            "top_p": float(top_p),
            "max_tokens": int(max_tokens),
        }
        if stop:
            payload["stop"] = stop

        # Light sanity checks to avoid 400s
        if not payload["messages"] or not isinstance(payload["messages"], list):
            raise ValueError("messages must be a non-empty list")
        if payload["max_tokens"] <= 0:
            raise ValueError("max_tokens must be > 0")
        if not isinstance(payload["model"], str) or not payload["model"]:
            raise ValueError("model must be a non-empty string")

        attempt = 0
        while True:
            attempt += 1
            self.rate.wait()  # space out calls
            try:
                r = self.session.post(
                    self.cfg.endpoint,
                    headers=self._headers(),
                    json=payload,
                    timeout=self.cfg.timeout,
                )
            except requests.RequestException as e:
                # Network-level error -> backoff unless we've exhausted retries
                if attempt <= self.cfg.max_retries:
                    self._sleep_backoff(attempt)
                    continue
                raise RuntimeError(f"Network error after {attempt} attempts: {e}") from e

            if r.status_code == 200:
                # A JSON decode failure is a ValueError subclass; callers treat
                # ValueError as "400 Bad Request", so convert it explicitly.
                try:
                    data = r.json()
                except ValueError as e:
                    raise RuntimeError(f"Invalid JSON in 200 response: {r.text[:500]}") from e
                try:
                    return data["choices"][0]["message"]["content"].strip()
                except Exception as e:
                    raise RuntimeError(f"Unexpected response schema: {json.dumps(data)[:500]}") from e

            # Friendly handling for 400 (Bad Request) to show Groq error text
            if r.status_code == 400:
                raise ValueError(f"400 Bad Request from Groq: {self._error_message(r)}")

            # Retryable statuses
            if r.status_code in (408, 409, 429, 500, 502, 503, 504):
                if attempt <= self.cfg.max_retries:
                    self._sleep_backoff(attempt, rate_or_server=r.status_code)
                    continue
                raise RuntimeError(
                    f"HTTP {r.status_code} after {attempt} attempts: {self._error_message(r)}"
                )

            # Non-retryable unexpected status
            raise RuntimeError(f"HTTP {r.status_code}: {self._error_message(r)}")

    def _sleep_backoff(self, attempt: int, rate_or_server: int = None):
        """Sleep for an exponentially growing, jittered delay (capped at 10s).

        Args:
            attempt: 1-based attempt number that just failed.
            rate_or_server: HTTP status that triggered the retry; ``None`` means
                the failure was a network-level error.
        """
        base = self.cfg.backoff_base_s * (2 ** (attempt - 1))
        jitter = random.uniform(0, self.cfg.backoff_jitter_s)
        # Cap the backoff to something sane
        sleep_s = min(base + jitter, 10.0)
        if rate_or_server:
            print(f"[retry {attempt}] got {rate_or_server}; sleeping {sleep_s:.2f}s...")
        else:
            print(f"[retry {attempt}] network error; sleeping {sleep_s:.2f}s...")
        time.sleep(sleep_s)
# -----------------------------
# Seed history from the user
# -----------------------------
# Seed instructions paired with previously observed accuracy; each pair is
# expanded into a {"prompt": ..., "score": ...} record.
prompt_history: List[Dict[str, Any]] = [
    {"prompt": seed_text, "score": seed_score}
    for seed_text, seed_score in (
        ("Classify the following text's sentiment. Respond with 'positive', 'negative', or 'neutral'.", 0.67),
        ("Analyze the text for emotional tone. Provide a single-word classification: 'positive', 'negative', or 'neutral'.", 0.83),
        ("Determine the sentiment of the provided text. The possible categories are 'positive', 'negative', and 'neutral'.", 0.83),
        ("What is the sentiment of the text? Your options are positive, negative, or neutral.", 0.67),
        ("Please assign a sentiment label to the text. Choose from 'positive', 'negative', or 'neutral'.", 0.83),
        ("Identify the sentiment. Select one: 'positive', 'negative', 'neutral'.", 0.83),
        ("Strictly classify the sentiment. Output must be only one of: positive, negative, neutral.", 0.83),
        ("Classify the sentiment of the text below. Be specific and use only one of the following words: positive, negative, neutral.", 0.83),
        ("Determine the emotional valence of the text. Respond with positive, negative, or neutral.", 0.67),
        ("Provide the sentiment classification for the text. Use only the terms positive, negative, or neutral.", 0.83),
        ("Categorize the text's sentiment. The only valid outputs are 'positive', 'negative', or 'neutral'.", 1.00),
        ("For the following text, is the sentiment positive, negative, or neutral? Respond with only the chosen word.", 1.00),
        ("Analyze the text for its core sentiment. Respond with 'positive', 'negative', or 'neutral'. No other words.", 1.00),
        ("Sentiment analysis: positive, negative, or neutral?", 0.67),
        ("What is the sentiment of the text? Answer with one word: 'positive', 'negative', or 'neutral'.", 0.83),
        ("Classify the sentiment of the text. Output must be a single word: positive, negative, or neutral.", 1.00),
        ("Perform sentiment classification on the text. Return only the class name: positive, negative, or neutral.", 1.00),
        ("The sentiment of the text is... (positive/negative/neutral).", 0.83),
        ("Determine if the text has a positive, negative, or neutral sentiment.", 0.67),
        ("Analyze the text and classify its sentiment as 'positive', 'negative', or 'neutral'. Respond with only the sentiment label.", 1.00),
    )
]
# -----------------------------
# Validation data (the evaluator)
# -----------------------------
# Labelled examples used to score every candidate instruction. Written as a
# text -> gold-label mapping (insertion-ordered) and flattened to a list of
# (text, label) pairs.
validation_data: List[Tuple[str, str]] = list(
    {
        "I love this product, it's amazing!": "positive",
        "The customer service was terrible.": "negative",
        "The movie was okay, not great.": "neutral",
        "This is the best day ever!": "positive",
        "I am so frustrated with the service.": "negative",
        "The delivery was on time.": "neutral",
    }.items()
)
VALID_LABELS = {"positive", "negative", "neutral"}


def normalize_label(text: str) -> str:
    """Reduce raw model output to a sentiment label where possible.

    Resolution order: the first whitespace/punctuation-delimited token that is
    exactly a valid label; otherwise the first matching stem among "posit",
    "negat", "neutr" found anywhere in the text; otherwise the stripped,
    lower-cased text itself (which may not be a valid label).
    """
    cleaned = text.strip().lower()
    pieces = re.split(r"[\s\.\,\!\:\;\-\_\/\|\(\)\[\]\{\}]+", cleaned)
    exact = next((piece for piece in pieces if piece in VALID_LABELS), None)
    if exact is not None:
        return exact

    # Stem fallback catches quoted or inflected forms, e.g. "'positive'".
    stem_to_label = (
        ("posit", "positive"),
        ("negat", "negative"),
        ("neutr", "neutral"),
    )
    for stem, label in stem_to_label:
        if stem in cleaned:
            return label
    return cleaned  # may be invalid; caller will handle
def classify_once(llm: GroqClient, instruction: str, text: str) -> str:
    """Classify one text under the given instruction and return the normalized label.

    Sends a fixed system message plus a user message made of the candidate
    instruction and the text, then passes the reply through normalize_label.
    """
    system_msg = (
        "You are a strict sentiment classifier. "
        "You must output exactly one word: positive, negative, or neutral. "
        "No punctuation, no extra words."
    )
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": f"{instruction}\n\nText: {text}\nSentiment:"},
    ]
    reply = llm.chat(
        messages=conversation,
        temperature=0.0,  # deterministic for scoring
        max_tokens=4,
        top_p=1.0,
        stop=["\n"],  # small guard to keep it to one word
    )
    return normalize_label(reply)
def score_prompt(llm: GroqClient, instruction: str, sleep_between_calls: float = 0.25) -> float:
    """Return the instruction's accuracy over validation_data.

    Each example is classified once; when sleep_between_calls is positive the
    function sleeps that long after every classification to smooth the request
    cadence.
    """
    pause = sleep_between_calls > 0
    hits = 0
    for sample, expected in validation_data:
        if classify_once(llm, instruction, sample) == expected:
            hits += 1
        # Gentle pacing to avoid bursty scoring calls
        if pause:
            time.sleep(sleep_between_calls)
    return hits / len(validation_data)
# -----------------------------
# OPRO meta-prompt & proposal
# -----------------------------
def build_meta_prompt(task_desc: str,
                      history: List[Dict[str, Any]],
                      num_proposals: int) -> str:
    """Assemble the optimizer prompt: rules, scored history, then the task.

    History entries are listed highest score first; entries with equal scores
    keep their original relative order. Each entry is rendered as
    "<score with 2 decimals> → <prompt>".
    """
    intro = (
        "You are optimizing an instruction for a sentiment classification task.\n"
        "Goal: maximize accuracy on a small validation set.\n\n"
        "You will see previous candidate instructions with their scores (0.00–1.00).\n"
        "Propose NEW instructions that are likely to achieve even higher accuracy.\n"
        "Constraints for each proposed instruction:\n"
        " • Demand a single-word output: positive, negative, or neutral.\n"
        " • Discourage extra text or punctuation.\n"
        " • Single line only (no numbering, quotes, or code blocks).\n"
        "Return exactly {k} new instructions, each on its own line, NOTHING ELSE.\n"
    ).format(k=num_proposals)

    ranked = sorted(history, key=lambda entry: entry["score"], reverse=True)
    scoreboard = ["Previous (score → instruction):"]
    scoreboard.extend(f"{entry['score']:.2f} → {entry['prompt']}" for entry in ranked)

    closing = (
        f"\nTask description:\n{task_desc}\n\n"
        f"Now propose {num_proposals} new instructions as specified."
    )
    return "".join([intro, "\n".join(scoreboard), closing])
def parse_proposals(raw: str) -> List[str]:
    """Extract candidate instructions from the optimizer's raw reply.

    One candidate per non-empty line, after:
      - skipping markdown code-fence lines (``` with optional language tag),
        wherever they occur and regardless of surrounding whitespace;
      - removing a leading list marker ("1.", "2)", "-", "*", "•");
      - dropping lines of 300+ characters;
      - dropping exact repeats within the same reply (first occurrence wins).

    Args:
        raw: Text returned by the optimizer model.

    Returns:
        Cleaned candidate instructions in their original order.
    """
    cands: List[str] = []
    already = set()
    for ln in raw.splitlines():
        ln = ln.strip()
        # A fence line is markup the model added despite instructions, never
        # an instruction itself.
        if not ln or ln.startswith("```"):
            continue
        ln = re.sub(r"^(\d+[\.\)]\s*|[-*•]\s*)", "", ln).strip()
        if not ln or len(ln) >= 300 or ln in already:
            continue
        already.add(ln)
        cands.append(ln)
    return cands
def propose_instructions(optimizer: GroqClient,
                         history: List[Dict[str, Any]],
                         num_proposals: int = 6) -> List[str]:
    """Ask the optimizer model for new candidate instructions.

    Builds the meta-prompt from the scored history, sends it with a fixed
    system message, and returns the reply parsed by parse_proposals.
    """
    task_summary = (
        "Given any English text, classify its sentiment as exactly one of: "
        "positive, negative, or neutral."
    )
    dialogue = [
        {"role": "system", "content": "You are an expert prompt engineer and optimizer."},
        {"role": "user", "content": build_meta_prompt(task_summary, history, num_proposals)},
    ]
    completion = optimizer.chat(
        messages=dialogue,
        temperature=0.7,  # exploration
        max_tokens=256,  # safe upper bound
        top_p=1.0,
    )
    return parse_proposals(completion)
# -----------------------------
# prompt optimization loop
# -----------------------------
def run_opro(optimizer: "GroqClient",
             evaluator: "GroqClient",
             seed_history: List[Dict[str, Any]],
             rounds: int = 2,
             proposals_per_round: int = 6,
             keep_top_k: int = 16) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Run the propose -> score -> keep-top-k optimization loop.

    Args:
        optimizer: Client used to generate new candidate instructions.
        evaluator: Client used to score each candidate.
        seed_history: Starting {"prompt", "score"} records; not mutated.
        rounds: Maximum number of propose/score rounds.
        proposals_per_round: Number of candidates requested per round.
        keep_top_k: History size retained after each round (highest scores).

    Returns:
        ``(history, summary)`` where ``summary`` holds ``"best_prompt"`` (str)
        and ``"best_score"`` (float) of the top-scoring history entry.

    Raises:
        ValueError: if the final history is empty (``max()`` of an empty list).
    """
    history = list(seed_history)
    seen = set(h["prompt"] for h in history)
    for r in range(1, rounds + 1):
        print(f"\n=== OPRO Round {r} ===")
        # Propose
        proposals = propose_instructions(optimizer, history, num_proposals=proposals_per_round)
        # Dedup against everything already scored AND within this batch
        # (dict.fromkeys keeps first occurrences in order), so a repeated
        # proposal is not scored -- and billed -- twice.
        unique_props = [p for p in dict.fromkeys(proposals) if p not in seen]
        if not unique_props:
            print("No novel proposals received; try increasing temperature or num_proposals.")
            break
        # Score
        new_items = []
        for instr in unique_props:
            try:
                acc = score_prompt(evaluator, instr, sleep_between_calls=0.3)
            except ValueError as ve:
                # Likely a 400 with helpful message; print & skip this candidate
                print(f"[skip: 400] {ve}")
                continue
            except Exception as e:
                print(f"[skip: error] {e}")
                continue
            new_items.append({"prompt": instr, "score": acc})
            print(f"[{acc:.2f}] {instr}")
            # Only successfully scored prompts are marked seen, so a candidate
            # that failed to score may be retried if proposed again.
            seen.add(instr)
        # Update history and keep top-k
        history.extend(new_items)
        history = sorted(history, key=lambda x: x["score"], reverse=True)[:keep_top_k]
        # Gentle pause between rounds to avoid bursts (nothing follows the last one)
        if r < rounds:
            time.sleep(1.2)
    best = max(history, key=lambda x: x["score"])
    return history, {"best_prompt": best["prompt"], "best_score": best["score"]}
# -----------------------------
# Main
# -----------------------------
def main():
    """Re-score the seed prompts, run the optimization loop, and print results.

    A single client serves as both optimizer and evaluator. If scoring a seed
    fails, its original score is kept (0.0 when it has none) and a warning is
    printed.
    """
    client = GroqClient(GroqConfig())
    optimizer = evaluator = client

    # Re-score seed prompts so all scores are from THIS evaluator & dataset
    rescored_history = []
    print("Scoring seed prompts on validation set...")
    for seed in prompt_history:
        try:
            accuracy = score_prompt(evaluator, seed["prompt"], sleep_between_calls=0.25)
        except ValueError as ve:
            # A 400 typically means malformed instruction; keep original score but warn
            print(f"[warn seed 400] {ve}")
            accuracy = seed.get("score", 0.0)
        except Exception as e:
            print(f"[warn seed error] {e}")
            accuracy = seed.get("score", 0.0)
        rescored_history.append({"prompt": seed["prompt"], "score": accuracy})
        print(f"[{accuracy:.2f}] {seed['prompt']}")
        # Small pause between seeds
        time.sleep(0.2)

    # Run
    updated_history, summary = run_opro(
        optimizer=optimizer,
        evaluator=evaluator,
        seed_history=rescored_history,
        rounds=2,  # tune as needed
        proposals_per_round=6,  # lower = fewer/friendlier requests
        keep_top_k=16,
    )

    print("\n=== Leaderboard (top prompts) ===")
    ranked = sorted(updated_history, key=lambda entry: entry["score"], reverse=True)
    for entry in ranked:
        print(f"{entry['score']:.2f} :: {entry['prompt']}")

    print("\n=== Best discovered instruction ===")
    print(f"Score: {summary['best_score']:.2f}")
    print(f"Prompt: {summary['best_prompt']}")


if __name__ == "__main__":
    main()
References:
- Yang, C., Wang, X., Lu, Y., Liu, H., Le, Q. V., Zhou, D., & Chen, X. (2023, September). Large language models as optimizers. In The Twelfth International Conference on Learning Representations.
- Zhang, Tuo, Jinyue Yuan, and Salman Avestimehr. "Revisiting opro: The limitations of small-scale llms as optimizers." arXiv preprint arXiv:2405.10276 (2024).
- Guo, Q., Wang, R., Guo, J., Li, B., Song, K., Tan, X., ... & Yang, Y. (2025). EvoPrompt: Connecting LLMs with Evolutionary Algorithms Yields Powerful Prompt Optimizers. arXiv preprint arXiv:2309.08532.
- Tao, Zhengwei, Ting-En Lin, Xiancai Chen, Hangyu Li, Yuchuan Wu, Yongbin Li, Zhi Jin, Fei Huang, Dacheng Tao, and Jingren Zhou. "A survey on self-evolution of large language models." arXiv preprint arXiv:2404.14387 (2024).
No comments:
Post a Comment