occ-stack / benchmarks /benchmark_code.py

Upload benchmarks/benchmark_code.py

ad2b648 verified 22 days ago

13.9 kB

	"""
	Benchmark 1: Code Compute Allocation (simulated)
	Compares fixed-budget, verifier-guided, and OCC allocation strategies.
	NOTE: a GRPO baseline is named in this summary but is not implemented in this file.
	"""
	import json
	import random
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Dict, List, Optional

	import numpy as np

	import sys
	sys.path.insert(0, str(Path(__file__).parent.parent))
	from oracle.oracle import ImpactOracle
	from ledger.ledger import CreditLedger
	from broker.broker import ResourceBroker, Decision


	@dataclass
	class CodeProblem:
	    """One synthetic coding task used by the simulated benchmark."""

	    # Identifier reported back in per-problem result rows.
	    task_id: str
	    # 0 = easy, 1 = hard; interpolates the agent's easy/hard pass rates.
	    difficulty: float
	    # Scales how much the agent's hidden-test accuracy drops below its
	    # public-test accuracy.
	    hidden_test_difficulty: float
	    # Generated alongside the other fields; not read by the simulated
	    # agent in this file.
	    public_test_difficulty: float


	class SimulatedCodeAgent:
	    """Simulated code generation agent with quality/cost tradeoffs.

	    Each call to :meth:`solve` draws two samples from the module-level
	    ``random`` generator: one for the public tests and one for the hidden
	    tests. Running counters (``attempts``, ``verifier_calls``,
	    ``tokens_used``) accumulate over the lifetime of the instance.
	    """

	    def __init__(
	        self,
	        agent_id: str,
	        pass_rate_easy: float = 0.9,
	        pass_rate_hard: float = 0.3,
	        hidden_test_falloff: float = 0.15,
	        cost_per_attempt: float = 200.0,
	        cost_per_verifier: float = 50.0,
	    ):
	        """Configure the agent.

	        Args:
	            agent_id: Identifier used for ledger and broker lookups.
	            pass_rate_easy: Public-test pass probability at difficulty 0.
	            pass_rate_hard: Public-test pass probability at difficulty 1.
	            hidden_test_falloff: Accuracy subtracted for hidden tests,
	                scaled by the problem's ``hidden_test_difficulty``.
	            cost_per_attempt: Compute charged for every attempt.
	            cost_per_verifier: Extra compute charged per verifier call.
	        """
	        self.agent_id = agent_id
	        self.pass_rate_easy = pass_rate_easy
	        self.pass_rate_hard = pass_rate_hard
	        self.hidden_test_falloff = hidden_test_falloff
	        self.cost_per_attempt = cost_per_attempt
	        self.cost_per_verifier = cost_per_verifier
	        self.attempts = 0
	        self.verifier_calls = 0
	        self.tokens_used = 0

	    def solve(
	        self,
	        problem: CodeProblem,
	        use_verifier: bool = False,
	        use_occ: bool = False,
	        broker: Optional[ResourceBroker] = None,
	        ledger: Optional[CreditLedger] = None,
	    ) -> Dict:
	        """Simulate one attempt at ``problem``.

	        Args:
	            problem: Task to attempt; ``difficulty`` and
	                ``hidden_test_difficulty`` are read.
	            use_verifier: When true, a verifier call is charged if the
	                public tests passed.
	            use_occ: When true and both ``broker`` and ``ledger`` are
	                supplied, the broker is asked whether the call is allowed.
	            broker: Resource broker consulted for the ``"model_call"``
	                capability.
	            ledger: Credit ledger queried for the agent's balance.

	        Returns:
	            Dict with ``public_pass``, ``hidden_pass``, ``compute_cost``,
	            ``tokens`` and ``blocked``. ``tokens`` always equals
	            ``compute_cost``. A denied request reports both passes as
	            ``False`` and ``blocked`` as ``True``.
	        """
	        self.attempts += 1
	        self.tokens_used += self.cost_per_attempt
	        compute_cost = self.cost_per_attempt

	        # Base accuracy: linear interpolation between easy and hard rates.
	        base_acc = (
	            self.pass_rate_easy * (1 - problem.difficulty)
	            + self.pass_rate_hard * problem.difficulty
	        )
	        public_pass = random.random() < base_acc

	        # Hidden tests are harder; clamp so the threshold never goes negative.
	        hidden_acc = base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty
	        hidden_pass = random.random() < max(0.0, hidden_acc)

	        if use_verifier and public_pass:
	            self.verifier_calls += 1
	            self.tokens_used += self.cost_per_verifier
	            compute_cost += self.cost_per_verifier

	        # Identity checks rather than truthiness: a ledger or broker that
	        # defines __len__/__bool__ and happens to be empty must still gate.
	        if use_occ and broker is not None and ledger is not None:
	            # NOTE(review): the broker is consulted after the attempt has
	            # already been charged, so a denied call still costs compute.
	            # Confirm this is the intended accounting.
	            balance = ledger.balance(self.agent_id, "model_call", "global")
	            dec = broker.request("model_call", self.agent_id, balance)
	            if dec.decision == Decision.DENY:
	                return {
	                    "public_pass": False,
	                    "hidden_pass": False,
	                    "compute_cost": compute_cost,
	                    # Report what was actually charged, matching the
	                    # unblocked path and self.tokens_used.
	                    "tokens": compute_cost,
	                    "blocked": True,
	                }

	        return {
	            "public_pass": public_pass,
	            "hidden_pass": hidden_pass,
	            "compute_cost": compute_cost,
	            "tokens": compute_cost,
	            "blocked": False,
	        }


	class CodeBenchmark:
	    """Benchmark code compute allocation strategies.

	    A fixed set of synthetic problems is generated once per instance and
	    then replayed under each strategy. All sampling goes through the
	    module-level ``random`` generator seeded in ``__init__``, so strategy
	    results depend on the order in which strategies are run.
	    """

	    def __init__(self, n_problems: int = 50, seed: int = 42):
	        """Seed the global RNGs, build the oracle and generate problems.

	        Args:
	            n_problems: Number of synthetic problems to generate.
	            seed: Seed applied to both ``random`` and ``numpy.random``.
	        """
	        self.n_problems = n_problems
	        self.seed = seed
	        random.seed(seed)
	        np.random.seed(seed)
	        self.oracle = ImpactOracle(
	            code_weights={
	                "correctness": 1.0,
	                "pass_at_k": 0.3,
	                "regression": -0.5,
	                "compute_penalty": 0.001,
	            }
	        )
	        self.problems = self._generate_problems()

	    @staticmethod
	    def _mean(total: float, count: int) -> float:
	        """Return ``total / count``, or 0.0 when ``count`` is zero."""
	        return total / count if count else 0.0

	    def _generate_problems(self) -> List[CodeProblem]:
	        """Create ``n_problems`` tasks with uniformly random difficulties."""
	        return [
	            CodeProblem(
	                task_id=f"task_{i}",
	                difficulty=random.random(),
	                hidden_test_difficulty=random.random(),
	                public_test_difficulty=random.random(),
	            )
	            for i in range(self.n_problems)
	        ]

	    def run_fixed_budget(self, agent: SimulatedCodeAgent, max_attempts: int = 1) -> Dict:
	        """Baseline: fixed compute per problem.

	        Every problem receives exactly ``max_attempts`` attempts (at least
	        one) with no verifier and no early stopping. A problem counts as
	        passed when any of its attempts passed.

	        Args:
	            agent: Agent used for every attempt.
	            max_attempts: Attempts spent on each problem; values below 1
	                are treated as 1.

	        Returns:
	            Summary dict with ``pass_at_1``, ``hidden_pass``, compute totals
	            and the attempts / verifier calls made during this run.
	        """
	        # Snapshot the counters so a reused agent does not inflate the report.
	        attempts_before = agent.attempts
	        verifier_before = agent.verifier_calls
	        n_samples = max(1, max_attempts)

	        results = []
	        total_compute = 0

	        for problem in self.problems:
	            public_pass = False
	            hidden_pass = False
	            cost = 0

	            for _ in range(n_samples):
	                r = agent.solve(problem, use_verifier=False)
	                cost += r["compute_cost"]
	                public_pass = public_pass or r["public_pass"]
	                hidden_pass = hidden_pass or r["hidden_pass"]

	            total_compute += cost
	            results.append({
	                "task_id": problem.task_id,
	                "public_pass": public_pass,
	                "hidden_pass": hidden_pass,
	                "compute_cost": cost,
	            })

	        n = len(results)
	        return {
	            "strategy": "fixed_budget",
	            "pass_at_1": self._mean(sum(1 for r in results if r["public_pass"]), n),
	            "hidden_pass": self._mean(sum(1 for r in results if r["hidden_pass"]), n),
	            "total_compute": total_compute,
	            "mean_compute": self._mean(total_compute, n),
	            "n_attempts": agent.attempts - attempts_before,
	            "verifier_calls": agent.verifier_calls - verifier_before,
	        }

	    def run_verifier_guided(self, agent: SimulatedCodeAgent, max_attempts: int = 3) -> Dict:
	        """Verifier-guided: retry on public test failure.

	        Each problem is retried until the public tests pass or
	        ``max_attempts`` is reached. The hidden-test outcome recorded is the
	        one from the last attempt made.

	        Args:
	            agent: Agent used for every attempt (verifier enabled).
	            max_attempts: Upper bound on attempts per problem.

	        Returns:
	            Summary dict. The hidden-test pass rate is published as
	            ``hidden_pass`` (the key the other strategies use) and also
	            under the older ``pass_at_k`` key.
	        """
	        attempts_before = agent.attempts
	        verifier_before = agent.verifier_calls

	        results = []
	        total_compute = 0

	        for problem in self.problems:
	            passed = False
	            hidden_passed = False
	            attempts = 0
	            cost = 0

	            while attempts < max_attempts and not passed:
	                attempts += 1
	                r = agent.solve(problem, use_verifier=True)
	                cost += r["compute_cost"]
	                passed = r["public_pass"]
	                hidden_passed = r["hidden_pass"]

	            total_compute += cost
	            results.append({
	                "task_id": problem.task_id,
	                "public_pass": passed,
	                "hidden_pass": hidden_passed,
	                "attempts": attempts,
	                "compute_cost": cost,
	            })

	        n = len(results)
	        hidden_rate = self._mean(sum(1 for r in results if r["hidden_pass"]), n)
	        return {
	            "strategy": "verifier_guided",
	            "pass_at_1": self._mean(sum(1 for r in results if r["public_pass"]), n),
	            # Same value under both keys: consumers that print "hidden_pass"
	            # would otherwise report 0 for this strategy.
	            "hidden_pass": hidden_rate,
	            "pass_at_k": hidden_rate,
	            "total_compute": total_compute,
	            "mean_compute": self._mean(total_compute, n),
	            "mean_attempts": self._mean(sum(r["attempts"] for r in results), n),
	            "n_attempts": agent.attempts - attempts_before,
	            "verifier_calls": agent.verifier_calls - verifier_before,
	        }

	    def run_occ_allocation(self, agents: List[SimulatedCodeAgent], max_attempts: int = 3) -> Dict:
	        """OCC: try cheapest agent first, escalate on failure.

	        Agents are ranked by cost per unit of expected quality (ascending)
	        and tried in that order, at most ``max_attempts`` of them per
	        problem. Each attempt that actually ran is scored by the oracle and
	        the agent's ledger credit is increased or decreased accordingly.

	        Args:
	            agents: Candidate agents; each is seeded with initial credit.
	            max_attempts: Maximum number of agents tried per problem.

	        Returns:
	            Summary dict with ``pass_at_1``, ``hidden_pass``, compute totals,
	            ``mean_agents`` and the attempts / verifier calls made by the
	            agents during this run.
	        """
	        ledger = CreditLedger(decay_lambda=0.002)
	        broker = ResourceBroker()

	        # Snapshot the counters so agents reused from another strategy do
	        # not inflate this run's totals.
	        attempts_before = sum(a.attempts for a in agents)
	        verifier_before = sum(a.verifier_calls for a in agents)

	        # Seed agents with credits proportional to their expected quality.
	        for agent in agents:
	            expected_quality = (agent.pass_rate_easy + agent.pass_rate_hard) / 2
	            ledger.earn(
	                agent_id=agent.agent_id,
	                task_id="seed",
	                action_id="seed",
	                amount=expected_quality * 20,
	                oracle_score=0.0,
	                compute_cost=0.0,
	                reason="initial_quality_estimate",
	                capability_scope="model_call",
	            )

	        # Cost per unit of expected quality, cheapest first. The key depends
	        # only on fixed agent parameters, so rank once for all problems.
	        ranked = sorted(
	            agents,
	            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
	        )
	        candidates = ranked[:max(0, max_attempts)]

	        results = []
	        total_compute = 0

	        for problem in self.problems:
	            solved = False
	            hidden_passed = False
	            cost = 0
	            used_agents = []

	            for agent in candidates:
	                r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger)
	                cost += r["compute_cost"]
	                used_agents.append(agent.agent_id)

	                if r["blocked"]:
	                    # The broker denied the call: there is no outcome to
	                    # score, so leave the ledger alone and escalate.
	                    continue

	                solved = r["public_pass"]
	                hidden_passed = r["hidden_pass"]

	                # Credit update. "compute_cost" is the cumulative spend on
	                # this problem so far, across every agent tried.
	                oracle_res = self.oracle.score(
	                    mode="code",
	                    action={"attempt": len(used_agents)},
	                    context={"difficulty": problem.difficulty},
	                    result={
	                        "correctness": 1.0 if solved else 0.0,
	                        "pass_at_k": 1.0 if hidden_passed else 0.0,
	                        "compute_cost": cost,
	                        "public_pass": solved,
	                        "hidden_tests_pass": hidden_passed,
	                    },
	                    agent_id=agent.agent_id,
	                )

	                if oracle_res.raw_score > 0:
	                    ledger.earn(
	                        agent_id=agent.agent_id,
	                        task_id=problem.task_id,
	                        action_id="solve",
	                        amount=oracle_res.raw_score * 5,
	                        oracle_score=oracle_res.raw_score,
	                        compute_cost=cost,
	                        reason="successful_solve",
	                        capability_scope="model_call",
	                    )
	                else:
	                    ledger.spend(
	                        agent_id=agent.agent_id,
	                        task_id=problem.task_id,
	                        action_id="solve",
	                        amount=1.0,
	                        capability_scope="model_call",
	                        reason="failed_solve",
	                    )

	                # Stop escalating once the public tests pass or the hidden
	                # tests pass (the result cannot improve further).
	                if solved or hidden_passed:
	                    break

	            total_compute += cost
	            results.append({
	                "task_id": problem.task_id,
	                "public_pass": solved,
	                "hidden_pass": hidden_passed,
	                "compute_cost": cost,
	                "agents_used": used_agents,
	            })

	        n = len(results)
	        return {
	            "strategy": "occ_allocation",
	            "pass_at_1": self._mean(sum(1 for r in results if r["public_pass"]), n),
	            "hidden_pass": self._mean(sum(1 for r in results if r["hidden_pass"]), n),
	            "total_compute": total_compute,
	            "mean_compute": self._mean(total_compute, n),
	            "mean_agents": self._mean(sum(len(r["agents_used"]) for r in results), n),
	            "n_attempts": sum(a.attempts for a in agents) - attempts_before,
	            "verifier_calls": sum(a.verifier_calls for a in agents) - verifier_before,
	        }

	    def run_all(self) -> Dict[str, Dict]:
	        """Run all strategies and compare.

	        Key design: baseline uses expensive agent (simulating always-GPT-4),
	        while OCC tries cheap first and escalates only on failure.
	        This creates strong compute savings at iso-accuracy.

	        Returns:
	            Mapping of strategy label to its summary dict. When the baseline
	            spent any compute, the OCC entry also carries
	            ``compute_savings`` and ``accuracy_delta`` relative to it.
	        """
	        expensive_kwargs = dict(
	            pass_rate_easy=0.95,
	            pass_rate_hard=0.65,
	            cost_per_attempt=350,
	            hidden_test_falloff=0.10,
	        )
	        cheap_agent = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15, cost_per_attempt=60, hidden_test_falloff=0.20)
	        medium_agent = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35, cost_per_attempt=150, hidden_test_falloff=0.15)
	        expensive_agent = SimulatedCodeAgent("expensive", **expensive_kwargs)

	        # Baseline: always use the best (expensive) agent - simulates
	        # always-GPT-4. It gets its own instance so its counters are not
	        # shared with the OCC tier below.
	        baseline = self.run_fixed_budget(
	            SimulatedCodeAgent("baseline", **expensive_kwargs),
	            max_attempts=1,
	        )

	        # Verifier-guided: expensive agent with retries.
	        verifier = self.run_verifier_guided(
	            SimulatedCodeAgent("verifier", **expensive_kwargs),
	            max_attempts=3,
	        )

	        # OCC: tiered escalation cheap -> medium -> expensive.
	        occ = self.run_occ_allocation([cheap_agent, medium_agent, expensive_agent], max_attempts=3)

	        results = {
	            "baseline_fixed": baseline,
	            "verifier_guided": verifier,
	            "occ_allocation": occ,
	        }

	        # Compute savings relative to the baseline.
	        baseline_compute = baseline["total_compute"]
	        if baseline_compute > 0:
	            occ_compute = occ["total_compute"]
	            occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute)
	            occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"]

	        return results


	def main():
	    """Run the benchmark, print a summary and save the results as JSON.

	    Builds a 50-problem benchmark with seed 42, prints each strategy's
	    metrics to stdout and writes the full results mapping to
	    ``/app/occ/reports/benchmark_code_results.json``.
	    """
	    bench = CodeBenchmark(n_problems=50, seed=42)
	    results = bench.run_all()

	    print("\n" + "=" * 60)
	    print("CODE COMPUTE ALLOCATION BENCHMARK")
	    print("=" * 60)
	    for label, res in results.items():
	        # The verifier-guided strategy may publish its hidden-test rate
	        # under "pass_at_k"; fall back to it so the row is not shown as 0.
	        hidden_rate = res.get("hidden_pass", res.get("pass_at_k", 0))

	        print(f"\n{label}")
	        print(f" pass@1: {res.get('pass_at_1', 0):.3f}")
	        print(f" hidden_pass: {hidden_rate:.3f}")
	        print(f" total_compute: {res['total_compute']:.0f}")
	        print(f" mean_compute: {res['mean_compute']:.0f}")
	        if "compute_savings" in res:
	            print(f" compute_savings: {res['compute_savings']:.1%}")
	        if "accuracy_delta" in res:
	            print(f" accuracy_delta: {res['accuracy_delta']:+.3f}")

	    # NOTE(review): the output directory is an absolute, hard-coded path
	    # while the message below prints a relative one - confirm the intended
	    # location before changing either.
	    Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
	    with open("/app/occ/reports/benchmark_code_results.json", "w") as f:
	        json.dump(results, f, indent=2, default=str)
	    print("\nSaved to reports/benchmark_code_results.json")


	if __name__ == "__main__":
	    main()