Exploring AI-ML-NLP: LLMS OPTIMIZE THEMSELVES (Beyond Manual Prompt Engineering)

Video Content Summary

This video explains a cutting-edge AI framework that allows large language models (LLMs) to automatically improve their own prompts. We'll break down how this works using a simple sentiment analysis example, showing how an average prompt can be transformed into a highly specific and effective one without any manual effort.
Discover how this "self-optimizing" loop, powered by a meta-prompt and a feedback system, can achieve better results than prompts written by human experts. We'll also dive into the latest research in this field, exploring how this concept is paving the way for a new era of AI systems that can continuously improve themselves.
Hashtags

#AI #LLM #PromptEngineering #SelfImprovingAI #AIautomation #MachineLearning #DeepLearning #TechExplained #ArtificialIntelligence #GoogleDeepMind
Tutorial:

Code:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
OPRO-style prompt optimization (sentiment classification) using a Groq-hosted Llama 3.1 model
(default: llama-3.1-8b-instant; override with the GROQ_MODEL environment variable).

Fixes vs previous version:
- RateLimiter ensures a minimum gap between API calls (prevents "too quick" bursts)
- Robust retry with exponential backoff + jitter on 429/5xx
- Friendly 400 error messages with full Groq error text
- Safer defaults for max_tokens and proposals_per_round
- Optional sleeps inside scoring to further smooth request cadence

Usage:
  pip install requests
  export GROQ_API_KEY=...
  python opro_sentiment_groq.py
"""

import os
import time
import json
import re
import random
import requests
from dataclasses import dataclass
from typing import List, Dict, Tuple, Any

# -----------------------------
# Configuration for Groq API
# -----------------------------
@dataclass
class GroqConfig:
    """Connection, pacing and retry settings for the Groq chat-completions API.

    Every default is read from an environment variable when the class body is
    executed (i.e. at import time), so export the variables before importing
    this module, or pass explicit values to the constructor.
    """

    # Bearer token. Defaults to an empty string (not placeholder text) so that
    # a missing key is falsy and the client's "GROQ_API_KEY is required" guard
    # fires instead of a bogus token being sent to the API.
    api_key: str = os.getenv("GROQ_API_KEY", "")
    # Full URL of the chat-completions endpoint.
    endpoint: str = os.getenv("GROQ_ENDPOINT", "https://api.groq.com/openai/v1/chat/completions")
    # Model identifier sent in every request payload.
    model: str = os.getenv("GROQ_MODEL", "llama-3.1-8b-instant")
    # Per-request timeout passed to the HTTP client, in seconds.
    timeout: int = int(os.getenv("GROQ_TIMEOUT", "60"))

    # Rate/Retry tuning
    min_interval_s: float = float(os.getenv("GROQ_MIN_INTERVAL_S", "0.8"))  # min gap between calls
    # Number of retries allowed after the first attempt.
    max_retries: int = int(os.getenv("GROQ_MAX_RETRIES", "5"))
    # First backoff delay; doubled on every further retry.
    backoff_base_s: float = float(os.getenv("GROQ_BACKOFF_BASE_S", "0.8"))
    # Upper bound of the uniform random jitter added to each backoff delay.
    backoff_jitter_s: float = float(os.getenv("GROQ_BACKOFF_JITTER_S", "0.4"))

# -----------------------------
# Simple rate limiter
# -----------------------------
class RateLimiter:
    """Enforce a minimum time gap between consecutive calls to ``wait()``.

    The limiter is single-threaded by design: it keeps one timestamp and
    sleeps the caller when ``wait()`` is invoked too soon after the last one.

    Args:
        min_interval_s: Minimum number of seconds between two ``wait()`` returns.
        clock: Zero-argument callable returning the current time in seconds.
            Defaults to ``time.monotonic`` so wall-clock adjustments (NTP, DST,
            manual changes) can neither skip nor stretch the enforced gap.
        sleep: Callable used to block for a number of seconds.
    """

    def __init__(self, min_interval_s: float = 0.8, clock=time.monotonic, sleep=time.sleep):
        self.min_interval_s = min_interval_s
        self._clock = clock
        self._sleep = sleep
        # None means "no call yet": the first wait() must never block,
        # whatever origin the clock happens to use.
        self._last = None

    def wait(self):
        """Block until at least ``min_interval_s`` has passed since the last call."""
        if self._last is not None:
            gap = self._clock() - self._last
            if gap < self.min_interval_s:
                self._sleep(self.min_interval_s - gap)
        # Re-read the clock after sleeping so the next gap is measured from
        # the moment this call actually returns.
        self._last = self._clock()

# -----------------------------
# Groq Client (OpenAI-compatible)
# -----------------------------
class GroqClient:
    """Small resilient client for Groq's OpenAI-compatible chat-completions API.

    Calls are spaced by a ``RateLimiter``, retried with exponential backoff on
    network errors and retryable HTTP statuses, and failures are reported with
    the error text returned by the server.
    """

    # HTTP statuses that are worth retrying.
    _RETRYABLE_STATUSES = (408, 409, 429, 500, 502, 503, 504)
    # Cap (seconds) for the computed exponential backoff.
    _MAX_BACKOFF_S = 10.0
    # Cap (seconds) on how long a server-supplied Retry-After value is honored.
    _MAX_RETRY_AFTER_S = 60.0

    def __init__(self, cfg: "GroqConfig"):
        """Validate the configuration and open an HTTP session.

        Raises:
            RuntimeError: if the configured API key is empty or whitespace.
        """
        self.cfg = cfg
        if not self.cfg.api_key or not self.cfg.api_key.strip():
            raise RuntimeError("GROQ_API_KEY is required.")
        # Ensure endpoint points to /chat/completions
        # (note: this normalizes the endpoint on the passed-in config object).
        if not self.cfg.endpoint.endswith("/chat/completions"):
            self.cfg.endpoint = self.cfg.endpoint.rstrip("/") + "/chat/completions"
        self.session = requests.Session()
        self.rate = RateLimiter(self.cfg.min_interval_s)

    def _headers(self):
        """Return the HTTP headers sent with every request."""
        return {
            "Authorization": f"Bearer {self.cfg.api_key}",
            "Content-Type": "application/json",
            "User-Agent": "opro-groq-example/1.0",
        }

    def chat(self, messages: List[Dict[str, str]], temperature: float = 0.2,
             max_tokens: int = 128, top_p: float = 1.0, stop: List[str] = None) -> str:
        """Send one chat-completion request and return the reply text.

        Args:
            messages: Non-empty list of ``{"role": ..., "content": ...}`` dicts.
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens to generate; must be > 0.
            top_p: Nucleus-sampling parameter.
            stop: Optional list of stop sequences; omitted from the payload
                when empty or ``None``.

        Returns:
            The stripped ``content`` of the first choice.

        Raises:
            ValueError: on invalid arguments, or when the server answers 400.
            RuntimeError: on network failure or a retryable status once the
                retries are exhausted, on any other non-200 status, or when a
                200 response is not valid JSON / lacks the expected fields.
        """
        payload = {
            "model": self.cfg.model,
            "messages": messages,
            "temperature": float(temperature),
            "top_p": float(top_p),
            "max_tokens": int(max_tokens),
        }
        if stop:
            payload["stop"] = stop

        # Light sanity checks to avoid 400s
        if not payload["messages"] or not isinstance(payload["messages"], list):
            raise ValueError("messages must be a non-empty list")
        if payload["max_tokens"] <= 0:
            raise ValueError("max_tokens must be > 0")
        if not isinstance(payload["model"], str) or not payload["model"]:
            raise ValueError("model must be a non-empty string")

        attempt = 0
        while True:
            attempt += 1
            self.rate.wait()  # space out calls
            try:
                r = self.session.post(
                    self.cfg.endpoint,
                    headers=self._headers(),
                    json=payload,
                    timeout=self.cfg.timeout,
                )
            except requests.RequestException as e:
                # Network-level error -> backoff unless we've exhausted retries
                if attempt <= self.cfg.max_retries:
                    self._sleep_backoff(attempt)
                    continue
                raise RuntimeError(f"Network error after {attempt} attempts: {e}") from e

            if r.status_code == 200:
                return self._extract_content(r)

            # Friendly handling for 400 (Bad Request) to show Groq error text
            if r.status_code == 400:
                raise ValueError(f"400 Bad Request from Groq: {self._error_message(r)}")

            if r.status_code in self._RETRYABLE_STATUSES:
                if attempt <= self.cfg.max_retries:
                    self._sleep_backoff(
                        attempt,
                        rate_or_server=r.status_code,
                        retry_after=self._retry_after_s(r),
                    )
                    continue
                raise RuntimeError(
                    f"HTTP {r.status_code} after {attempt} attempts: {self._error_message(r)}"
                )

            # Non-retryable unexpected status
            raise RuntimeError(f"HTTP {r.status_code}: {self._error_message(r)}")

    @staticmethod
    def _extract_content(response) -> str:
        """Return the stripped reply text of a 200 response.

        Both failure modes raise ``RuntimeError`` so that callers never mistake
        a malformed success body for a 400 (which is reported as ``ValueError``).
        """
        try:
            data = response.json()
        except ValueError as e:  # JSON decode errors are ValueError subclasses
            raise RuntimeError(f"Invalid JSON in 200 response: {response.text[:500]}") from e
        try:
            return data["choices"][0]["message"]["content"].strip()
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            raise RuntimeError(f"Unexpected response schema: {json.dumps(data)[:500]}") from e

    @staticmethod
    def _error_message(response):
        """Return ``error.message`` from a JSON error body.

        Falls back to the whole decoded body when the message is missing, and
        to the raw response text when the body is not JSON or not shaped like
        ``{"error": {...}}``.
        """
        try:
            err = response.json()
            return err.get("error", {}).get("message") or err
        except (ValueError, AttributeError, TypeError):
            return response.text

    @staticmethod
    def _retry_after_s(response):
        """Return a positive numeric ``Retry-After`` header in seconds, else ``None``.

        HTTP-date values and malformed headers are ignored (``None``).
        """
        raw = getattr(response, "headers", {}).get("Retry-After")
        try:
            seconds = float(raw)
        except (TypeError, ValueError):
            return None
        return seconds if seconds > 0 else None

    def _backoff_delay(self, attempt: int, retry_after: float = None) -> float:
        """Compute the delay before retry number ``attempt`` (1-based).

        The delay is ``backoff_base_s * 2**(attempt-1)`` plus uniform jitter,
        capped at ``_MAX_BACKOFF_S``. When the server supplied a longer
        ``retry_after`` it takes precedence, capped at ``_MAX_RETRY_AFTER_S``,
        so we do not hammer an endpoint that asked us to wait.
        """
        base = self.cfg.backoff_base_s * (2 ** (attempt - 1))
        jitter = random.uniform(0, self.cfg.backoff_jitter_s)
        delay = min(base + jitter, self._MAX_BACKOFF_S)
        if retry_after is not None:
            delay = max(delay, min(retry_after, self._MAX_RETRY_AFTER_S))
        return delay

    def _sleep_backoff(self, attempt: int, rate_or_server: int = None, retry_after: float = None):
        """Log and sleep before the next retry.

        Args:
            attempt: 1-based number of the attempt that just failed.
            rate_or_server: HTTP status that triggered the retry, or ``None``
                for a network-level error (only affects the log line).
            retry_after: Optional server-requested wait in seconds.
        """
        sleep_s = self._backoff_delay(attempt, retry_after)
        if rate_or_server:
            print(f"[retry {attempt}] got {rate_or_server}; sleeping {sleep_s:.2f}s...")
        else:
            print(f"[retry {attempt}] network error; sleeping {sleep_s:.2f}s...")
        time.sleep(sleep_s)

# -----------------------------
# Seed history from the user
# -----------------------------
# Seed instructions for the optimizer. Each entry is {"prompt": <instruction
# text>, "score": <accuracy in [0, 1]>}. The scores listed here are only
# starting values: main() re-scores every seed on validation_data and falls
# back to the value below only when that re-scoring call fails.
prompt_history: List[Dict[str, Any]] = [
    {"prompt": "Classify the following text's sentiment. Respond with 'positive', 'negative', or 'neutral'.", "score": 0.67},
    {"prompt": "Analyze the text for emotional tone. Provide a single-word classification: 'positive', 'negative', or 'neutral'.", "score": 0.83},
    {"prompt": "Determine the sentiment of the provided text. The possible categories are 'positive', 'negative', and 'neutral'.", "score": 0.83},
    {"prompt": "What is the sentiment of the text? Your options are positive, negative, or neutral.", "score": 0.67},
    {"prompt": "Please assign a sentiment label to the text. Choose from 'positive', 'negative', or 'neutral'.", "score": 0.83},
    {"prompt": "Identify the sentiment. Select one: 'positive', 'negative', 'neutral'.", "score": 0.83},
    {"prompt": "Strictly classify the sentiment. Output must be only one of: positive, negative, neutral.", "score": 0.83},
    {"prompt": "Classify the sentiment of the text below. Be specific and use only one of the following words: positive, negative, neutral.", "score": 0.83},
    {"prompt": "Determine the emotional valence of the text. Respond with positive, negative, or neutral.", "score": 0.67},
    {"prompt": "Provide the sentiment classification for the text. Use only the terms positive, negative, or neutral.", "score": 0.83},
    {"prompt": "Categorize the text's sentiment. The only valid outputs are 'positive', 'negative', or 'neutral'.", "score": 1.00},
    {"prompt": "For the following text, is the sentiment positive, negative, or neutral? Respond with only the chosen word.", "score": 1.00},
    {"prompt": "Analyze the text for its core sentiment. Respond with 'positive', 'negative', or 'neutral'. No other words.", "score": 1.00},
    {"prompt": "Sentiment analysis: positive, negative, or neutral?", "score": 0.67},
    {"prompt": "What is the sentiment of the text? Answer with one word: 'positive', 'negative', or 'neutral'.", "score": 0.83},
    {"prompt": "Classify the sentiment of the text. Output must be a single word: positive, negative, or neutral.", "score": 1.00},
    {"prompt": "Perform sentiment classification on the text. Return only the class name: positive, negative, or neutral.", "score": 1.00},
    {"prompt": "The sentiment of the text is... (positive/negative/neutral).", "score": 0.83},
    {"prompt": "Determine if the text has a positive, negative, or neutral sentiment.", "score": 0.67},
    {"prompt": "Analyze the text and classify its sentiment as 'positive', 'negative', or 'neutral'. Respond with only the sentiment label.", "score": 1.00},
]

# -----------------------------
# Validation data (the evaluator)
# -----------------------------
# (text, gold_label) pairs used by score_prompt() to measure accuracy.
# There are two examples per label, six in total, so every score is a
# multiple of 1/6 (0.17, 0.33, ... 1.00).
validation_data: List[Tuple[str, str]] = [
    ("I love this product, it's amazing!", "positive"),
    ("The customer service was terrible.", "negative"),
    ("The movie was okay, not great.", "neutral"),
    ("This is the best day ever!", "positive"),
    ("I am so frustrated with the service.", "negative"),
    ("The delivery was on time.", "neutral"),
]

VALID_LABELS = {"positive", "negative", "neutral"}

def normalize_label(text: str) -> str:
    """Reduce raw model output to a sentiment label where possible.

    Resolution order:
      1. the first whitespace/punctuation-delimited token that is exactly a
         valid label;
      2. otherwise the first of the stems "posit", "negat", "neutr" (checked
         in that order) found anywhere in the text;
      3. otherwise the stripped, lower-cased text itself, which may not be a
         valid label and simply will not match any gold label.
    """
    lowered = text.strip().lower()

    pieces = re.split(r"[\s\.\,\!\:\;\-\_\/\|\(\)\[\]\{\}]+", lowered)
    exact = next((piece for piece in pieces if piece in VALID_LABELS), None)
    if exact is not None:
        return exact

    # Tolerate quoted, truncated or inflected answers such as "'positive'".
    for stem, label in (("posit", "positive"), ("negat", "negative"), ("neutr", "neutral")):
        if stem in lowered:
            return label

    return lowered

def classify_once(llm: GroqClient, instruction: str, text: str) -> str:
    """Classify one text under the given instruction and return the normalized label.

    The request is made at temperature 0 with a 4-token budget and a newline
    stop sequence, so the reply is expected to be a single word.
    """
    system_msg = (
        "You are a strict sentiment classifier. "
        "You must output exactly one word: positive, negative, or neutral. "
        "No punctuation, no extra words."
    )
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": f"{instruction}\n\nText: {text}\nSentiment:"},
    ]
    reply = llm.chat(
        messages=conversation,
        temperature=0.0,  # deterministic for scoring
        max_tokens=4,
        top_p=1.0,
        stop=["\n"],      # small guard to keep it to one word
    )
    return normalize_label(reply)

def score_prompt(llm: GroqClient, instruction: str, sleep_between_calls: float = 0.25) -> float:
    """Return the fraction of validation examples the instruction classifies correctly.

    A positive ``sleep_between_calls`` adds a pause (in seconds) after every
    classification call to keep the request cadence smooth.
    """
    pause = sleep_between_calls > 0
    hits = 0
    for sample, expected in validation_data:
        # A bool adds as 0 or 1.
        hits += classify_once(llm, instruction, sample) == expected
        if pause:
            time.sleep(sleep_between_calls)
    return hits / len(validation_data)

# -----------------------------
# OPRO meta-prompt & proposal
# -----------------------------
def build_meta_prompt(task_desc: str,
                      history: List[Dict[str, Any]],
                      num_proposals: int) -> str:
    """Assemble the optimizer's meta-prompt.

    Layout: fixed rules and output constraints, then the scored history
    (highest score first, ties kept in their incoming order), then the task
    description and the final request for ``num_proposals`` instructions.
    """
    rules = (
        "You are optimizing an instruction for a sentiment classification task.\n"
        "Goal: maximize accuracy on a small validation set.\n\n"
        "You will see previous candidate instructions with their scores (0.00–1.00).\n"
        "Propose NEW instructions that are likely to achieve even higher accuracy.\n"
        "Constraints for each proposed instruction:\n"
        "  • Demand a single-word output: positive, negative, or neutral.\n"
        "  • Discourage extra text or punctuation.\n"
        "  • Single line only (no numbering, quotes, or code blocks).\n"
        f"Return exactly {num_proposals} new instructions, each on its own line, NOTHING ELSE.\n"
    )

    ranked = sorted(history, key=lambda entry: entry["score"], reverse=True)
    scoreboard = "\n".join(
        ["Previous (score → instruction):"]
        + [f"{entry['score']:.2f} → {entry['prompt']}" for entry in ranked]
    )

    closing = (
        f"\nTask description:\n{task_desc}\n\n"
        f"Now propose {num_proposals} new instructions as specified."
    )
    return rules + scoreboard + closing

def parse_proposals(raw: str) -> List[str]:
    """Turn the optimizer's raw reply into a clean list of candidate instructions.

    One candidate is taken per non-empty line after:
      * dropping Markdown code-fence lines (anything starting with three
        backticks), wherever they appear in the reply;
      * removing a leading list marker (``1.``, ``2)``, ``-``, ``*``, ``•``);
      * removing one pair of wrapping quotes/backticks, but only when that
        quote character does not also occur inside the text (so an instruction
        that merely begins and ends with a quoted label is left intact);
      * discarding lines of 300 characters or more;
      * discarding repeats, so the same instruction is never scored twice.

    Args:
        raw: Text returned by the optimizer model (``None`` is treated as empty).

    Returns:
        Unique candidate instructions in their order of first appearance.
    """
    cands: List[str] = []
    taken = set()
    for ln in (raw or "").splitlines():
        ln = ln.strip()
        # Remove code fences if the model disobeys
        if not ln or ln.startswith("```"):
            continue
        ln = re.sub(r"^(\d+[\.\)]\s*|[\-\*\u2022]\s*)", "", ln).strip()
        if len(ln) >= 2 and ln[0] == ln[-1] and ln[0] in "\"'`" and ln[0] not in ln[1:-1]:
            ln = ln[1:-1].strip()
        if ln and len(ln) < 300 and ln not in taken:
            taken.add(ln)
            cands.append(ln)
    return cands

def propose_instructions(optimizer: GroqClient,
                         history: List[Dict[str, Any]],
                         num_proposals: int = 6) -> List[str]:
    """Ask the optimizer model for new candidate instructions.

    Builds the meta-prompt from the scored history, sends it at a higher
    temperature to encourage exploration, and returns the parsed candidates.
    """
    meta_prompt = build_meta_prompt(
        "Given any English text, classify its sentiment as exactly one of: "
        "positive, negative, or neutral.",
        history,
        num_proposals,
    )
    conversation = [
        {"role": "system", "content": "You are an expert prompt engineer and optimizer."},
        {"role": "user", "content": meta_prompt},
    ]
    reply = optimizer.chat(
        messages=conversation,
        temperature=0.7,  # exploration
        max_tokens=256,   # safe upper bound
        top_p=1.0,
    )
    return parse_proposals(reply)

# -----------------------------
# prompt optimization loop
# -----------------------------
def run_opro(optimizer: GroqClient,
             evaluator: GroqClient,
             seed_history: List[Dict[str, Any]],
             rounds: int = 2,
             proposals_per_round: int = 6,
             keep_top_k: int = 16) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Run the propose -> score -> keep-best optimization loop.

    Args:
        optimizer: Client used to generate new candidate instructions.
        evaluator: Client used to score candidates on the validation set.
        seed_history: Initial ``{"prompt", "score"}`` entries; not mutated.
        rounds: Maximum number of optimization rounds.
        proposals_per_round: Number of candidates requested per round.
        keep_top_k: Number of best-scoring entries kept after each round.

    Returns:
        ``(history, summary)`` where ``history`` is the surviving list of
        entries and ``summary`` is ``{"best_prompt": str, "best_score": float}``.
        If nothing was ever scored (empty seed history and no successful
        candidate) the summary is ``{"best_prompt": "", "best_score": 0.0}``.
    """
    history = list(seed_history)
    seen = {h["prompt"] for h in history}

    for r in range(1, rounds + 1):
        print(f"\n=== OPRO Round {r} ===")
        # Propose
        proposals = propose_instructions(optimizer, history, num_proposals=proposals_per_round)

        # Dedup against everything scored so far AND within this round, so a
        # repeated proposal is not scored (and stored) twice. `queued` is
        # round-local on purpose: a candidate whose scoring fails is not added
        # to `seen` and may therefore be proposed again in a later round.
        unique_props = []
        queued = set()
        for p in proposals:
            if p in seen or p in queued:
                continue
            queued.add(p)
            unique_props.append(p)
        if not unique_props:
            print("No novel proposals received; try increasing temperature or num_proposals.")
            break

        # Score
        new_items = []
        for instr in unique_props:
            try:
                acc = score_prompt(evaluator, instr, sleep_between_calls=0.3)
            except ValueError as ve:
                # Likely a 400 with helpful message; print & skip this candidate
                print(f"[skip: 400] {ve}")
                continue
            except Exception as e:
                print(f"[skip: error] {e}")
                continue

            new_items.append({"prompt": instr, "score": acc})
            print(f"[{acc:.2f}] {instr}")
            seen.add(instr)

        # Update history and keep top-k
        history.extend(new_items)
        history = sorted(history, key=lambda x: x["score"], reverse=True)[:keep_top_k]

        # Gentle pause between rounds to avoid bursts (pointless after the last one)
        if r < rounds:
            time.sleep(1.2)

    if not history:
        # Nothing to rank; max() on an empty list would raise.
        return history, {"best_prompt": "", "best_score": 0.0}

    best = max(history, key=lambda x: x["score"])
    return history, {"best_prompt": best["prompt"], "best_score": best["score"]}

# -----------------------------
# Main
# -----------------------------
def main():
    """Re-score the seed prompts, run the optimization loop, and print the results.

    One client plays both roles (optimizer and evaluator).
    """
    shared_client = GroqClient(GroqConfig())
    optimizer = evaluator = shared_client

    # Seed scores are recomputed so every score in the history comes from
    # this evaluator and this validation set.
    print("Scoring seed prompts on validation set...")
    rescored_history = []
    for seed in prompt_history:
        try:
            acc = score_prompt(evaluator, seed["prompt"], sleep_between_calls=0.25)
        except ValueError as ve:
            # A 400 typically means malformed instruction; keep original score but warn
            print(f"[warn seed 400] {ve}")
            acc = seed.get("score", 0.0)
        except Exception as e:
            print(f"[warn seed error] {e}")
            acc = seed.get("score", 0.0)
        rescored_history.append({"prompt": seed["prompt"], "score": acc})
        print(f"[{acc:.2f}] {seed['prompt']}")
        time.sleep(0.2)  # small pause between seeds

    updated_history, summary = run_opro(
        optimizer=optimizer,
        evaluator=evaluator,
        seed_history=rescored_history,
        rounds=2,               # tune as needed
        proposals_per_round=6,  # lower = fewer/friendlier requests
        keep_top_k=16,
    )

    print("\n=== Leaderboard (top prompts) ===")
    ranked = sorted(updated_history, key=lambda entry: entry["score"], reverse=True)
    for entry in ranked:
        print(f"{entry['score']:.2f} :: {entry['prompt']}")

    print("\n=== Best discovered instruction ===")
    print(f"Score:  {summary['best_score']:.2f}")
    print(f"Prompt: {summary['best_prompt']}")

if __name__ == "__main__":
    main()
Reference:

Yang, C., Wang, X., Lu, Y., Liu, H., Le, Q. V., Zhou, D., & Chen, X. (2023, September). Large language models as optimizers. In The Twelfth International Conference on Learning Representations.
Zhang, Tuo, Jinyue Yuan, and Salman Avestimehr. "Revisiting opro: The limitations of small-scale llms as optimizers." arXiv preprint arXiv:2405.10276 (2024).
Guo, Q., Wang, R., Guo, J., Li, B., Song, K., Tan, X., ... & Yang, Y. (2025). EvoPrompt: Connecting LLMs with Evolutionary Algorithms Yields Powerful Prompt Optimizers. arXiv preprint arXiv:2309.08532.
Tao, Zhengwei, Ting-En Lin, Xiancai Chen, Hangyu Li, Yuchuan Wu, Yongbin Li, Zhi Jin, Fei Huang, Dacheng Tao, and Jingren Zhou. "A survey on self-evolution of large language models." arXiv preprint arXiv:2404.14387 (2024).
Exploring AI-ML-NLP

Sunday, September 21, 2025

LLMS OPTIMIZE THEMSELVES (Beyond Manual Prompt Engineering)

Video Content Summary

Hashtags

Tutorial:

Code:

Reference:

No comments:

Post a Comment

Blog Archive