| """ |
| Benchmark 1: Code Compute Allocation (simulated) |
| Compares fixed-compute, verifier-guided, and OCC allocation strategies (GRPO is named as a comparison target but no GRPO strategy is implemented in this file). |
| """ |
| import json |
| import random |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Dict, List, Optional |
|
|
| import numpy as np |
|
|
| import sys |
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
| from oracle.oracle import ImpactOracle |
| from ledger.ledger import CreditLedger |
| from broker.broker import ResourceBroker, Decision |
|
|
|
|
@dataclass
class CodeProblem:
    """A single synthetic coding task used by the simulated benchmark.

    Positional construction order is ``task_id``, ``difficulty``,
    ``hidden_test_difficulty``, ``public_test_difficulty``.
    """

    # Identifier for the task; the benchmark generates labels like "task_0".
    task_id: str
    # Weight used to blend an agent's easy and hard pass rates.
    difficulty: float
    # Multiplier on an agent's hidden-test accuracy falloff.
    hidden_test_difficulty: float
    # Generated alongside the other fields; not read by the visible code.
    public_test_difficulty: float
|
|
|
|
class SimulatedCodeAgent:
    """Simulated code generation agent with quality/cost tradeoffs.

    Outcomes are drawn from the module-level ``random`` generator, so results
    are reproducible only when the caller seeds it. The counters
    ``attempts``, ``verifier_calls`` and ``tokens_used`` accumulate over the
    lifetime of the instance, across every call to :meth:`solve`.
    """

    def __init__(
        self,
        agent_id: str,
        pass_rate_easy: float = 0.9,
        pass_rate_hard: float = 0.3,
        hidden_test_falloff: float = 0.15,
        cost_per_attempt: float = 200.0,
        cost_per_verifier: float = 50.0,
    ):
        """Configure the agent's accuracy profile and per-call costs.

        Args:
            agent_id: Identifier passed to the ledger and broker.
            pass_rate_easy: Public-test pass probability at difficulty 0.
            pass_rate_hard: Public-test pass probability at difficulty 1.
            hidden_test_falloff: Accuracy lost on hidden tests per unit of
                ``hidden_test_difficulty``.
            cost_per_attempt: Compute charged for every call to ``solve``.
            cost_per_verifier: Extra compute charged when the verifier runs.
        """
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        self.cost_per_verifier = cost_per_verifier
        self.attempts = 0
        self.verifier_calls = 0
        self.tokens_used = 0

    def solve(
        self,
        problem: "CodeProblem",
        use_verifier: bool = False,
        use_occ: bool = False,
        broker: "Optional[ResourceBroker]" = None,
        ledger: "Optional[CreditLedger]" = None,
    ) -> Dict:
        """Simulate one attempt at ``problem`` and report its outcome and cost.

        Args:
            problem: Task to attempt; only ``difficulty`` and
                ``hidden_test_difficulty`` are read.
            use_verifier: Charge ``cost_per_verifier`` when the public tests
                pass.
            use_occ: Ask ``broker`` for permission, using the agent's ledger
                balance. Ignored unless both ``broker`` and ``ledger`` are
                supplied.
            broker: Resource broker consulted when ``use_occ`` is set.
            ledger: Credit ledger queried for the agent's balance.

        Returns:
            Dict with keys ``public_pass``, ``hidden_pass``, ``compute_cost``,
            ``tokens`` and ``blocked``. A denied request reports both pass
            flags as ``False`` and ``blocked`` as ``True``.
        """
        self.attempts += 1
        self.tokens_used += self.cost_per_attempt
        compute_cost = self.cost_per_attempt

        # Public accuracy interpolates linearly between the easy and hard rates.
        base_acc = self.pass_rate_easy * (1 - problem.difficulty) + self.pass_rate_hard * problem.difficulty
        public_pass = random.random() < base_acc

        # Hidden tests are drawn independently, at a reduced accuracy floored at 0.
        hidden_acc = base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty
        hidden_pass = random.random() < max(0.0, hidden_acc)

        if use_verifier and public_pass:
            self.verifier_calls += 1
            self.tokens_used += self.cost_per_verifier
            compute_cost += self.cost_per_verifier

        # Compare against None explicitly: a broker or ledger object that
        # defines __len__/__bool__ could be falsy and silently skip the gate.
        if use_occ and broker is not None and ledger is not None:
            # NOTE(review): the broker is consulted after the attempt has been
            # charged and sampled, so a denied call still costs compute and
            # still advances the RNG. Confirm this ordering is intended.
            balance = ledger.balance(self.agent_id, "model_call", "global")
            dec = broker.request("model_call", self.agent_id, balance)
            if dec.decision == Decision.DENY:
                return {
                    "public_pass": False,
                    "hidden_pass": False,
                    "compute_cost": compute_cost,
                    # Same figure as compute_cost, matching the allowed path.
                    "tokens": compute_cost,
                    "blocked": True,
                }

        return {
            "public_pass": public_pass,
            "hidden_pass": hidden_pass,
            "compute_cost": compute_cost,
            "tokens": compute_cost,
            "blocked": False,
        }
|
|
|
|
class CodeBenchmark:
    """Benchmark code compute allocation strategies.

    Builds a fixed set of synthetic problems and runs three strategies over
    it: a fixed single-attempt baseline, verifier-guided retries, and OCC
    allocation that escalates across agents.
    """

    def __init__(self, n_problems: int = 50, seed: int = 42):
        """Seed the global RNGs, build the oracle, and generate the problems.

        Args:
            n_problems: Number of synthetic problems to generate.
            seed: Seed applied to both ``random`` and ``numpy.random``.
        """
        self.n_problems = n_problems
        self.seed = seed
        # The module-level generators are seeded (not a private instance)
        # because problem generation below draws from ``random`` directly.
        random.seed(seed)
        np.random.seed(seed)
        self.oracle = ImpactOracle(
            code_weights={
                "correctness": 1.0,
                "pass_at_k": 0.3,
                "regression": -0.5,
                "compute_penalty": 0.001,
            }
        )
        self.problems = self._generate_problems()

    @staticmethod
    def _mean(total: float, count: int) -> float:
        """Return ``total / count``, or 0.0 when ``count`` is zero."""
        return total / count if count else 0.0

    @classmethod
    def _pass_rate(cls, results: List[Dict], key: str) -> float:
        """Return the fraction of ``results`` whose ``key`` entry is truthy."""
        return cls._mean(sum(1 for r in results if r[key]), len(results))

    def _generate_problems(self) -> List["CodeProblem"]:
        """Create ``n_problems`` tasks with uniformly random difficulties."""
        return [
            CodeProblem(
                task_id=f"task_{i}",
                difficulty=random.random(),
                hidden_test_difficulty=random.random(),
                public_test_difficulty=random.random(),
            )
            for i in range(self.n_problems)
        ]

    def run_fixed_budget(self, agent: "SimulatedCodeAgent", max_attempts: int = 1) -> Dict:
        """Baseline: fixed compute per problem.

        Each problem gets exactly one attempt without the verifier.

        Args:
            agent: Agent used for every problem.
            max_attempts: Currently unused; exactly one attempt is made.

        Returns:
            Summary dict with pass rates, compute totals, and the attempts
            and verifier calls made during this run only.
        """
        # Snapshot the agent's lifetime counters so a reused agent does not
        # leak counts from earlier runs into this summary.
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0

        for problem in self.problems:
            r = agent.solve(problem, use_verifier=False)
            total_compute += r["compute_cost"]
            results.append({
                "task_id": problem.task_id,
                "public_pass": r["public_pass"],
                "hidden_pass": r["hidden_pass"],
                "compute_cost": r["compute_cost"],
            })

        return {
            "strategy": "fixed_budget",
            "pass_at_1": self._pass_rate(results, "public_pass"),
            "hidden_pass": self._pass_rate(results, "hidden_pass"),
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, len(results)),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_verifier_guided(self, agent: "SimulatedCodeAgent", max_attempts: int = 3) -> Dict:
        """Verifier-guided: retry on public test failure.

        Each problem is retried until the public tests pass or
        ``max_attempts`` is reached. The hidden result recorded is that of
        the last attempt made.

        Args:
            agent: Agent used for every attempt.
            max_attempts: Maximum attempts per problem.

        Returns:
            Summary dict. ``pass_at_1`` is the fraction of problems whose
            public tests passed within the attempt budget. The hidden-test
            rate is reported under both ``pass_at_k`` and ``hidden_pass``.
        """
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0

        for problem in self.problems:
            passed = False
            hidden_passed = False
            attempts = 0
            cost = 0

            while attempts < max_attempts and not passed:
                attempts += 1
                r = agent.solve(problem, use_verifier=True)
                cost += r["compute_cost"]
                passed = r["public_pass"]
                hidden_passed = r["hidden_pass"]

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": passed,
                "hidden_pass": hidden_passed,
                "attempts": attempts,
                "compute_cost": cost,
            })

        n_results = len(results)
        hidden_rate = self._pass_rate(results, "hidden_pass")
        return {
            "strategy": "verifier_guided",
            "pass_at_1": self._pass_rate(results, "public_pass"),
            "pass_at_k": hidden_rate,
            # Same value under the key the other strategies use.
            "hidden_pass": hidden_rate,
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, n_results),
            "mean_attempts": self._mean(sum(r["attempts"] for r in results), n_results),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_occ_allocation(self, agents: List["SimulatedCodeAgent"], max_attempts: int = 3) -> Dict:
        """OCC: try cheapest agent first, escalate on failure.

        Agents are ordered by ``cost_per_attempt`` divided by their mean pass
        rate (floored at 0.1). Each is tried in turn until the public tests
        pass, the hidden tests pass, or ``max_attempts`` agents were used.
        After every attempt the oracle scores the outcome and the agent's
        ledger credit is increased or decreased accordingly.

        Args:
            agents: Candidate agents; may already carry counts from earlier
                runs.
            max_attempts: Maximum number of agents tried per problem.

        Returns:
            Summary dict with pass rates, compute totals, mean agents used,
            and the attempts and verifier calls made during this run only.
        """
        ledger = CreditLedger(decay_lambda=0.002)
        broker = ResourceBroker()

        attempts_before = sum(a.attempts for a in agents)
        verifier_before = sum(a.verifier_calls for a in agents)

        # Seed each agent's credit from its expected quality.
        for agent in agents:
            expected_quality = (agent.pass_rate_easy + agent.pass_rate_hard) / 2
            ledger.earn(
                agent_id=agent.agent_id,
                task_id="seed",
                action_id="seed",
                amount=expected_quality * 20,
                oracle_score=0.0,
                compute_cost=0.0,
                reason="initial_quality_estimate",
                capability_scope="model_call",
            )

        # The ranking depends only on static agent attributes, so it is
        # computed once rather than for every problem.
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
        )

        results = []
        total_compute = 0

        for problem in self.problems:
            solved = False
            hidden_passed = False
            cost = 0
            used_agents = []

            for agent in ranked:
                if solved or len(used_agents) >= max_attempts:
                    break

                r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger)
                cost += r["compute_cost"]
                used_agents.append(agent.agent_id)

                if not r["blocked"]:
                    solved = r["public_pass"]
                    hidden_passed = r["hidden_pass"]

                # NOTE(review): the oracle and ledger receive the cumulative
                # cost for this problem, not this agent's own cost, and a
                # denied attempt is scored too. Confirm both are intended.
                oracle_res = self.oracle.score(
                    mode="code",
                    action={"attempt": len(used_agents)},
                    context={"difficulty": problem.difficulty},
                    result={
                        "correctness": 1.0 if solved else 0.0,
                        "pass_at_k": 1.0 if hidden_passed else 0.0,
                        "compute_cost": cost,
                        "public_pass": solved,
                        "hidden_tests_pass": hidden_passed,
                    },
                    agent_id=agent.agent_id,
                )

                if oracle_res.raw_score > 0:
                    ledger.earn(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=oracle_res.raw_score * 5,
                        oracle_score=oracle_res.raw_score,
                        compute_cost=cost,
                        reason="successful_solve",
                        capability_scope="model_call",
                    )
                else:
                    ledger.spend(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=1.0,
                        capability_scope="model_call",
                        reason="failed_solve",
                    )

                # NOTE(review): escalation also stops on a hidden-test pass,
                # even when the public tests failed. Confirm hidden results
                # are meant to influence allocation.
                if hidden_passed:
                    break

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": solved,
                "hidden_pass": hidden_passed,
                "compute_cost": cost,
                "agents_used": used_agents,
            })

        n_results = len(results)
        return {
            "strategy": "occ_allocation",
            "pass_at_1": self._pass_rate(results, "public_pass"),
            "hidden_pass": self._pass_rate(results, "hidden_pass"),
            "total_compute": total_compute,
            "mean_compute": self._mean(total_compute, n_results),
            "mean_agents": self._mean(sum(len(r["agents_used"]) for r in results), n_results),
            "n_attempts": sum(a.attempts for a in agents) - attempts_before,
            "verifier_calls": sum(a.verifier_calls for a in agents) - verifier_before,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run all strategies and compare.

        Key design: baseline uses expensive agent (simulating always-GPT-4),
        while OCC tries cheap first and escalates only on failure.
        This creates strong compute savings at iso-accuracy.

        Returns:
            Dict keyed by ``baseline_fixed``, ``verifier_guided`` and
            ``occ_allocation``. When the baseline spent any compute, the OCC
            entry also carries ``compute_savings`` and ``accuracy_delta``
            relative to the baseline.
        """
        cheap_agent = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15, cost_per_attempt=60, hidden_test_falloff=0.20)
        medium_agent = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35, cost_per_attempt=150, hidden_test_falloff=0.15)
        expensive_agent = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65, cost_per_attempt=350, hidden_test_falloff=0.10)

        # Baseline: always use the expensive agent, one attempt per problem.
        baseline = self.run_fixed_budget(expensive_agent, max_attempts=1)

        # Verifier-guided: a separate agent with the expensive profile.
        verifier = self.run_verifier_guided(
            SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65, cost_per_attempt=350, hidden_test_falloff=0.10),
            max_attempts=3,
        )

        # OCC reuses expensive_agent from the baseline run.
        occ = self.run_occ_allocation([cheap_agent, medium_agent, expensive_agent], max_attempts=3)

        results = {
            "baseline_fixed": baseline,
            "verifier_guided": verifier,
            "occ_allocation": occ,
        }

        # Relative metrics are only defined when the baseline spent compute.
        baseline_compute = baseline["total_compute"]
        if baseline_compute > 0:
            occ_compute = occ["total_compute"]
            occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute)
            occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"]

        return results
|
|
|
|
def main():
    """Run the benchmark, print a per-strategy summary, and save a JSON report.

    The report is written to ``/app/occ/reports/benchmark_code_results.json``;
    the directory is created if it does not exist.
    """
    bench = CodeBenchmark(n_problems=50, seed=42)
    results = bench.run_all()

    print("\n" + "=" * 60)
    print("CODE COMPUTE ALLOCATION BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        # Fall back to "pass_at_k" for strategies that report their
        # hidden-test rate under that key instead of "hidden_pass".
        hidden_rate = res.get("hidden_pass", res.get("pass_at_k", 0))
        print(f"\n{label}")
        print(f" pass@1: {res.get('pass_at_1', 0):.3f}")
        print(f" hidden_pass: {hidden_rate:.3f}")
        print(f" total_compute: {res['total_compute']:.0f}")
        print(f" mean_compute: {res['mean_compute']:.0f}")
        if "compute_savings" in res:
            print(f" compute_savings: {res['compute_savings']:.1%}")
        if "accuracy_delta" in res:
            print(f" accuracy_delta: {res['accuracy_delta']:+.3f}")

    report_dir = Path("/app/occ/reports")
    report_path = report_dir / "benchmark_code_results.json"
    report_dir.mkdir(parents=True, exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(results, f, indent=2, default=str)
    # Print the path actually written, not a relative approximation of it.
    print(f"\nSaved to {report_path}")


if __name__ == "__main__":
    main()
|
|