Prompt Optimization
The Pattern
Outer loop (human feedback):
└── Inner loop (variant testing):
    └── Pipeline under test (the prompt being optimized)

Complete Implementation
from hypergraph import Graph, node, route, END, AsyncRunner
from anthropic import Anthropic
import json
# Module-level Anthropic client shared by `generate` and `generate_variants`.
# NOTE(review): constructed with no arguments — presumably credentials come
# from the environment; confirm against the Anthropic SDK docs.
client = Anthropic()
# ═══════════════════════════════════════════════════════════════
# THE PIPELINE BEING OPTIMIZED
# ═══════════════════════════════════════════════════════════════
@node(output_name="response")
def generate(query: str, system_prompt: str) -> str:
    """Answer ``query`` with the candidate ``system_prompt`` applied.

    This is the pipeline under test: a single Claude call whose system
    prompt is the thing being optimized.

    Args:
        query: The user question sent as the only message.
        system_prompt: The prompt variant to evaluate.

    Returns:
        The text of the first content block in the model's reply.
    """
    user_turn = {"role": "user", "content": query}
    reply = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=1024,
        system=system_prompt,
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text
# Single-node graph wrapping `generate`; `test_variants` runs it once per
# (variant, test case) pair.
pipeline = Graph([generate], name="pipeline")
# ═══════════════════════════════════════════════════════════════
# VARIANT GENERATION
# ═══════════════════════════════════════════════════════════════
def _parse_variant_list(raw: str) -> list[str]:
    """Turn the model's reply into a list of prompt strings.

    Tolerates a reply wrapped in a Markdown code fence (``` or ```json),
    which models sometimes emit even when asked for bare JSON.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or is not a JSON array of strings.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line, then everything from the closing fence on.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0]
    parsed = json.loads(text)
    if not isinstance(parsed, list) or not all(isinstance(p, str) for p in parsed):
        raise ValueError(
            "Expected a JSON array of strings from the variant generator, "
            f"got: {type(parsed).__name__}"
        )
    return parsed


@node(output_name="variants")
def generate_variants(
    base_prompt: str,
    feedback: str = "",
    num_variants: int = 3,
) -> list[str]:
    """Ask Claude for rewritten versions of ``base_prompt``.

    Args:
        base_prompt: The system prompt to vary.
        feedback: Optional feedback text; when non-empty it is appended to
            the instruction so the model can incorporate it.
        num_variants: How many variations to request. The count is only
            requested in the instruction, not enforced on the reply.

    Returns:
        ``[base_prompt]`` followed by the generated variants, so the original
        is always scored alongside the new candidates.

    Raises:
        ValueError: If the model's reply cannot be parsed as a JSON array of
            strings.
    """
    instruction = f"""Generate {num_variants} variations of this system prompt.
Each variation should be meaningfully different while preserving the core intent.
Base prompt:
{base_prompt}
"""
    if feedback:
        instruction += f"""
Previous feedback to incorporate:
{feedback}
"""
    instruction += """
Return a JSON array of strings, each being a complete system prompt.
No explanation, just the JSON array."""
    message = client.messages.create(
        model="claude-opus-4-5-20251101",
        max_tokens=2048,
        messages=[{"role": "user", "content": instruction}],
    )
    variants = _parse_variant_list(message.content[0].text)
    return [base_prompt] + variants  # Include original for comparison
# ═══════════════════════════════════════════════════════════════
# EVALUATION
# ═══════════════════════════════════════════════════════════════
@node(output_name="test_results")
async def test_variants(
    variants: list[str],
    test_cases: list[dict],
) -> list[dict]:
    """Score every prompt variant against every test case.

    Each (variant, test case) pair is run through ``pipeline`` sequentially
    and the response is scored with ``evaluate_response``.

    Args:
        variants: Candidate system prompts.
        test_cases: Dicts with a required ``"query"`` key and optional
            ``"expected_keywords"`` and ``"criteria"`` keys.

    Returns:
        One dict per variant with ``variant_index``, a truncated ``prompt``
        preview, the ``full_prompt``, ``avg_score`` and the per-test
        ``scores``; sorted by ``avg_score``, best first.
    """
    runner = AsyncRunner()
    results = []
    for i, variant in enumerate(variants):
        scores = []
        for test in test_cases:
            # Run the pipeline with this variant
            result = await runner.run(pipeline, {
                "query": test["query"],
                "system_prompt": variant,
            })
            # Score the response. The keyword must be `expected_keywords`
            # to match evaluate_response's signature; `expected=` raised
            # TypeError on every call.
            score = evaluate_response(
                response=result["response"],
                expected_keywords=test.get("expected_keywords", []),
                criteria=test.get("criteria", {}),
            )
            scores.append(score)
        results.append({
            "variant_index": i,
            # Preview only; the untruncated text is kept in "full_prompt".
            "prompt": (variant[:100] + "...") if len(variant) > 100 else variant,
            "full_prompt": variant,
            # With no test cases there is nothing to average; score 0.0
            # rather than dividing by zero.
            "avg_score": sum(scores) / len(scores) if scores else 0.0,
            "scores": scores,
        })
    return sorted(results, key=lambda x: x["avg_score"], reverse=True)
def evaluate_response(response: str, expected_keywords: list, criteria: dict) -> float:
    """Score ``response`` on a 0-1 scale from three independent checks.

    * Up to 0.5 for the fraction of ``expected_keywords`` found in the
      response (case-insensitive substring match).
    * 0.25 if ``criteria["min_length"]`` is present and the response is at
      least that many characters long.
    * 0.25 if ``criteria["must_include"]`` is present and every listed
      string occurs in the response (case-sensitive).

    A check whose input is absent contributes nothing.
    """
    total = 0.0

    # Keyword coverage
    if expected_keywords:
        haystack = response.lower()
        hits = [kw for kw in expected_keywords if kw.lower() in haystack]
        total += 0.5 * (len(hits) / len(expected_keywords))

    # Length criteria
    if "min_length" in criteria and len(response) >= criteria["min_length"]:
        total += 0.25

    # Format criteria
    if "must_include" in criteria:
        missing = [s for s in criteria["must_include"] if s not in response]
        if not missing:
            total += 0.25

    return min(total, 1.0)
@node(output_name="best_variant")
def select_best(test_results: list[dict]) -> dict:
    """Return the top-ranked entry of ``test_results``.

    Relies on the list already being ordered best-first (``test_variants``
    sorts by ``avg_score`` descending), so this simply takes the head.
    """
    top_entry = test_results[0]
    return top_entry
# ═══════════════════════════════════════════════════════════════
# OPTIMIZATION LOOP (INNER)
# ═══════════════════════════════════════════════════════════════
@node(output_name="iteration")
def track_iteration(iteration: int = 0) -> int:
    """Advance the inner-loop counter by one.

    ``iteration`` defaults to 0, so a call without a value returns 1.
    """
    next_iteration = iteration + 1
    return next_iteration
@route(targets=["generate_variants", END])
def optimization_gate(
    best_variant: dict,
    iteration: int,
    target_score: float = 0.9,
    max_iterations: int = 5,
) -> str:
    """Route the inner loop: stop, or go round again.

    Stops (returns ``END``) when the best variant's ``avg_score`` has reached
    ``target_score``, or otherwise when ``iteration`` has reached
    ``max_iterations``. In every other case it returns
    ``"generate_variants"`` to start another round. A one-line status
    message is printed for each outcome.
    """
    # The score check comes first, so a run that hits the target on its final
    # allowed iteration is reported as a success rather than as a timeout.
    if best_variant["avg_score"] >= target_score:
        print(f"✓ Target score reached: {best_variant['avg_score']:.2f}")
        decision = END
    elif iteration >= max_iterations:
        print(f"✓ Max iterations reached. Best score: {best_variant['avg_score']:.2f}")
        decision = END
    else:
        print(f"→ Iteration {iteration}: score={best_variant['avg_score']:.2f}, continuing...")
        decision = "generate_variants"
    return decision
# Inner optimization graph, built from the five nodes defined above.
# `optimization_gate` is the router: it sends control back to
# "generate_variants" or to END.
# NOTE(review): presumably hypergraph wires nodes by matching each node's
# output_name to downstream parameter names — confirm against the library docs.
optimization_loop = Graph([
generate_variants,
test_variants,
select_best,
track_iteration,
optimization_gate,
], name="optimization")
# ═══════════════════════════════════════════════════════════════
# HUMAN-IN-THE-LOOP (OUTER)
# ═══════════════════════════════════════════════════════════════
@node(output_name="feedback")
def get_human_feedback(best_variant: dict, test_results: list[dict]) -> str:
    """Print a results summary and read the reviewer's feedback from stdin.

    Shows the top three entries of ``test_results`` (score plus prompt
    preview), then the full text of ``best_variant``, and finally prompts for
    input. This console version is a stand-in; a production system might use
    a web UI or an API call instead.

    Returns:
        The entered text with surrounding whitespace stripped.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print("OPTIMIZATION RESULTS")
    print(heavy_rule)

    top_three = test_results[:3]  # Top 3
    for i, result in enumerate(top_three):
        print(f"\n#{i+1} (score: {result['avg_score']:.2f})")
        print(f" {result['prompt']}")

    print("\n" + light_rule)
    print(f"Best prompt (score: {best_variant['avg_score']:.2f}):")
    print(best_variant["full_prompt"])
    print(light_rule)

    return input("\nFeedback (or 'done' to finish): ").strip()
@route(targets=["optimization", END])
def human_gate(feedback: str) -> str:
    """Route the outer loop based on the reviewer's feedback.

    Returns ``END`` when the feedback (case-insensitively) is "done", "quit",
    "exit" or empty; any other text returns ``"optimization"`` to start
    another optimization round.
    """
    stop_words = ("done", "quit", "exit", "")
    wants_to_stop = feedback.lower() in stop_words
    return END if wants_to_stop else "optimization"
# Outer human-in-the-loop graph: the inner optimization graph is nested as a
# single node via as_node(), followed by the feedback prompt. `human_gate`
# routes back to "optimization" or to END.
human_loop = Graph([
optimization_loop.as_node(), # Inner loop as a node
get_human_feedback,
human_gate,
], name="human_optimization")
# ═══════════════════════════════════════════════════════════════
# RUN THE FULL SYSTEM
# ═══════════════════════════════════════════════════════════════
async def main():
    """Run the full human-in-the-loop optimization and print the winner.

    Builds three sample test cases, runs ``human_loop`` with a starting
    prompt and loop limits, then prints the best variant's full prompt and
    its average score.
    """
    # Test cases for evaluation
    test_cases = [
        {
            "query": "Explain quantum computing to a beginner",
            "expected_keywords": ["qubit", "superposition", "classical"],
            "criteria": {"min_length": 200},
        },
        {
            "query": "What is machine learning?",
            "expected_keywords": ["data", "algorithm", "pattern"],
            "criteria": {"min_length": 150},
        },
        {
            "query": "How does encryption work?",
            "expected_keywords": ["key", "secure", "decrypt"],
            "criteria": {"min_length": 150},
        },
    ]

    # NOTE(review): evaluate_response awards at most 0.5 (keywords) + 0.25
    # (min_length) + 0.25 (must_include). None of the cases above define
    # "must_include", so the best possible average is 0.75 and the 0.85
    # target below cannot be reached; the inner loop will always run until
    # max_iterations. Lower the target or add "must_include" criteria if
    # early termination is wanted.
    inputs = {
        "base_prompt": "You are a helpful assistant that explains technical concepts.",
        "test_cases": test_cases,
        "target_score": 0.85,
        "max_iterations": 3,
    }

    runner = AsyncRunner()
    result = await runner.run(human_loop, inputs)

    banner = "=" * 60
    print("\n" + banner)
    print("FINAL OPTIMIZED PROMPT:")
    print(banner)
    print(result["best_variant"]["full_prompt"])
    print(f"\nFinal score: {result['best_variant']['avg_score']:.2f}")
# asyncio.run(main())  # requires `import asyncio`

Key Patterns
1. Multiple Nesting Levels
2. Automated Testing
3. Human-in-the-Loop
4. Early Termination
Variations
A/B Testing
LLM-as-Judge
What's Next?
Last updated