""" Benchmark 1: Code Compute Allocation (simulated) Compares fixed compute, GRPO, verifier-guided, and OCC allocation. """ import json import random from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional import numpy as np import sys sys.path.insert(0, str(Path(__file__).parent.parent)) from oracle.oracle import ImpactOracle from ledger.ledger import CreditLedger from broker.broker import ResourceBroker, Decision @dataclass class CodeProblem: task_id: str difficulty: float # 0=easy, 1=hard hidden_test_difficulty: float public_test_difficulty: float class SimulatedCodeAgent: """Simulated code generation agent with quality/cost tradeoffs.""" def __init__( self, agent_id: str, pass_rate_easy: float = 0.9, pass_rate_hard: float = 0.3, hidden_test_falloff: float = 0.15, cost_per_attempt: float = 200.0, cost_per_verifier: float = 50.0, ): self.agent_id = agent_id self.pass_rate_easy = pass_rate_easy self.pass_rate_hard = pass_rate_hard self.hidden_test_falloff = hidden_test_falloff self.cost_per_attempt = cost_per_attempt self.cost_per_verifier = cost_per_verifier self.attempts = 0 self.verifier_calls = 0 self.tokens_used = 0 def solve( self, problem: CodeProblem, use_verifier: bool = False, use_occ: bool = False, broker: Optional[ResourceBroker] = None, ledger: Optional[CreditLedger] = None, ) -> Dict: self.attempts += 1 self.tokens_used += self.cost_per_attempt compute_cost = self.cost_per_attempt # Base accuracy depends on difficulty base_acc = self.pass_rate_easy * (1 - problem.difficulty) + self.pass_rate_hard * problem.difficulty public_pass = random.random() < base_acc # Hidden tests are harder hidden_acc = base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty hidden_pass = random.random() < max(0.0, hidden_acc) if use_verifier and public_pass: self.verifier_calls += 1 self.tokens_used += self.cost_per_verifier compute_cost += self.cost_per_verifier if use_occ and broker and ledger: balance = ledger.balance(self.agent_id, "model_call", "global") dec = broker.request("model_call", self.agent_id, balance) if dec.decision == Decision.DENY: return { "public_pass": False, "hidden_pass": False, "compute_cost": compute_cost, "tokens": self.cost_per_attempt, "blocked": True, } return { "public_pass": public_pass, "hidden_pass": hidden_pass, "compute_cost": compute_cost, "tokens": self.cost_per_attempt + (self.cost_per_verifier if use_verifier and public_pass else 0), "blocked": False, } class CodeBenchmark: """Benchmark code compute allocation strategies.""" def __init__(self, n_problems: int = 50, seed: int = 42): self.n_problems = n_problems self.seed = seed random.seed(seed) np.random.seed(seed) self.oracle = ImpactOracle( code_weights={ "correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001, } ) self.problems = self._generate_problems() def _generate_problems(self) -> List[CodeProblem]: return [ CodeProblem( task_id=f"task_{i}", difficulty=random.random(), hidden_test_difficulty=random.random(), public_test_difficulty=random.random(), ) for i in range(self.n_problems) ] def run_fixed_budget(self, agent: SimulatedCodeAgent, max_attempts: int = 1) -> Dict: """Baseline: fixed compute per problem.""" results = [] total_compute = 0 for problem in self.problems: r = agent.solve(problem, use_verifier=False) total_compute += r["compute_cost"] results.append({ "task_id": problem.task_id, "public_pass": r["public_pass"], "hidden_pass": r["hidden_pass"], "compute_cost": r["compute_cost"], }) pass_at_1 = sum(1 for r in results if r["public_pass"]) / len(results) hidden_pass = sum(1 for r in results if r["hidden_pass"]) / len(results) return { "strategy": "fixed_budget", "pass_at_1": pass_at_1, "hidden_pass": hidden_pass, "total_compute": total_compute, "mean_compute": total_compute / len(results), "n_attempts": agent.attempts, "verifier_calls": agent.verifier_calls, } def run_verifier_guided(self, agent: SimulatedCodeAgent, max_attempts: int = 3) -> Dict: """Verifier-guided: retry on public test failure.""" results = [] total_compute = 0 for problem in self.problems: passed = False hidden_passed = False attempts = 0 cost = 0 while attempts < max_attempts and not passed: attempts += 1 r = agent.solve(problem, use_verifier=True) cost += r["compute_cost"] passed = r["public_pass"] hidden_passed = r["hidden_pass"] total_compute += cost results.append({ "task_id": problem.task_id, "public_pass": passed, "hidden_pass": hidden_passed, "attempts": attempts, "compute_cost": cost, }) pass_at_1 = sum(1 for r in results if r["public_pass"]) / len(results) pass_at_k = sum(1 for r in results if r["hidden_pass"]) / len(results) return { "strategy": "verifier_guided", "pass_at_1": pass_at_1, "pass_at_k": pass_at_k, "total_compute": total_compute, "mean_compute": total_compute / len(results), "mean_attempts": sum(r["attempts"] for r in results) / len(results), "n_attempts": agent.attempts, "verifier_calls": agent.verifier_calls, } def run_occ_allocation(self, agents: List[SimulatedCodeAgent], max_attempts: int = 3) -> Dict: """OCC: try cheapest agent first, escalate on failure.""" ledger = CreditLedger(decay_lambda=0.002) broker = ResourceBroker() # Seed agents with credits proportional to their expected quality for agent in agents: expected_quality = (agent.pass_rate_easy + agent.pass_rate_hard) / 2 ledger.earn( agent_id=agent.agent_id, task_id="seed", action_id="seed", amount=expected_quality * 20, oracle_score=0.0, compute_cost=0.0, reason="initial_quality_estimate", capability_scope="model_call", ) results = [] total_compute = 0 for problem in self.problems: solved = False hidden_passed = False cost = 0 used_agents = [] # Sort agents by success-per-cost ratio (ascending cost first) ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2)) for agent in ranked: if solved: break if len(used_agents) >= max_attempts: break r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger) cost += r["compute_cost"] used_agents.append(agent.agent_id) if not r["blocked"]: solved = r["public_pass"] hidden_passed = r["hidden_pass"] # Credit update oracle_res = self.oracle.score( mode="code", action={"attempt": len(used_agents)}, context={"difficulty": problem.difficulty}, result={ "correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden_passed else 0.0, "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden_passed, }, agent_id=agent.agent_id, ) if oracle_res.raw_score > 0: ledger.earn( agent_id=agent.agent_id, task_id=problem.task_id, action_id="solve", amount=oracle_res.raw_score * 5, oracle_score=oracle_res.raw_score, compute_cost=cost, reason="successful_solve", capability_scope="model_call", ) else: ledger.spend( agent_id=agent.agent_id, task_id=problem.task_id, action_id="solve", amount=1.0, capability_scope="model_call", reason="failed_solve", ) # OCC: stop immediately if hidden tests pass (can't improve further) if hidden_passed: break # OCC: if cheap agent failed, try next; if all failed, stop if not solved and agent == ranked[-1]: break total_compute += cost results.append({ "task_id": problem.task_id, "public_pass": solved, "hidden_pass": hidden_passed, "compute_cost": cost, "agents_used": used_agents, }) pass_at_1 = sum(1 for r in results if r["public_pass"]) / len(results) hidden_pass = sum(1 for r in results if r["hidden_pass"]) / len(results) return { "strategy": "occ_allocation", "pass_at_1": pass_at_1, "hidden_pass": hidden_pass, "total_compute": total_compute, "mean_compute": total_compute / len(results), "mean_agents": sum(len(r["agents_used"]) for r in results) / len(results), "n_attempts": sum(a.attempts for a in agents), "verifier_calls": sum(a.verifier_calls for a in agents), } def run_all(self) -> Dict[str, Dict]: """Run all strategies and compare. Key design: baseline uses expensive agent (simulating always-GPT-4), while OCC tries cheap first and escalates only on failure. This creates strong compute savings at iso-accuracy. """ cheap_agent = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15, cost_per_attempt=60, hidden_test_falloff=0.20) medium_agent = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35, cost_per_attempt=150, hidden_test_falloff=0.15) expensive_agent = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65, cost_per_attempt=350, hidden_test_falloff=0.10) # Baseline: always use the best (expensive) agent - simulates always-GPT-4 baseline = self.run_fixed_budget(expensive_agent, max_attempts=1) # Verifier-guided: expensive agent with retries verifier = self.run_verifier_guided( SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65, cost_per_attempt=350, hidden_test_falloff=0.10), max_attempts=3, ) # OCC: tiered escalation cheap -> medium -> expensive occ = self.run_occ_allocation([cheap_agent, medium_agent, expensive_agent], max_attempts=3) results = { "baseline_fixed": baseline, "verifier_guided": verifier, "occ_allocation": occ, } # Compute savings baseline_compute = baseline["total_compute"] if baseline_compute > 0: occ_compute = occ["total_compute"] occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute) occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"] return results def main(): bench = CodeBenchmark(n_problems=50, seed=42) results = bench.run_all() print("\n" + "=" * 60) print("CODE COMPUTE ALLOCATION BENCHMARK") print("=" * 60) for label, res in results.items(): print(f"\n{label}") print(f" pass@1: {res.get('pass_at_1', 0):.3f}") print(f" hidden_pass: {res.get('hidden_pass', 0):.3f}") print(f" total_compute: {res['total_compute']:.0f}") print(f" mean_compute: {res['mean_compute']:.0f}") if "compute_savings" in res: print(f" compute_savings: {res['compute_savings']:.1%}") if "accuracy_delta" in res: print(f" accuracy_delta: {res['accuracy_delta']:+.3f}") Path("/app/occ/reports").mkdir(parents=True, exist_ok=True) with open("/app/occ/reports/benchmark_code_results.json", "w") as f: json.dump(results, f, indent=2, default=str) print("\nSaved to reports/benchmark_code_results.json") if __name__ == "__main__": main()