# File: occ-stack/benchmarks/benchmark_code.py
# Last commit: "Upload benchmarks/benchmark_code.py" by narcolepticchicken (ad2b648, verified)
"""
Benchmark 1: Code Compute Allocation (simulated)
Compares fixed-budget, verifier-guided, and OCC allocation strategies
(a GRPO baseline is not implemented in this file).
"""
import json
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from oracle.oracle import ImpactOracle
from ledger.ledger import CreditLedger
from broker.broker import ResourceBroker, Decision
@dataclass
class CodeProblem:
    """One simulated coding task handed to an agent.

    Fields are positional in this order: ``task_id``, ``difficulty``,
    ``hidden_test_difficulty``, ``public_test_difficulty``.
    """

    # Identifier used to label per-problem results.
    task_id: str

    # Overall task difficulty on a 0..1 scale: 0 = easy, 1 = hard.
    difficulty: float

    # Scales how much harder the hidden tests are than the public ones.
    hidden_test_difficulty: float

    # Difficulty of the public tests (stored with the problem; not read by
    # the agent simulation visible in this file).
    public_test_difficulty: float
class SimulatedCodeAgent:
    """Simulated code generation agent with quality/cost tradeoffs.

    Each call to :meth:`solve` draws two random outcomes (public tests and
    hidden tests) from the module-level ``random`` generator and charges a
    fixed cost per attempt. The instance keeps running totals in
    ``attempts``, ``verifier_calls`` and ``tokens_used``; these accumulate
    across every call for the lifetime of the object.
    """

    def __init__(
        self,
        agent_id: str,
        pass_rate_easy: float = 0.9,
        pass_rate_hard: float = 0.3,
        hidden_test_falloff: float = 0.15,
        cost_per_attempt: float = 200.0,
        cost_per_verifier: float = 50.0,
    ):
        """Configure the agent's simulated accuracy and cost profile.

        Args:
            agent_id: Identifier passed to the ledger and broker.
            pass_rate_easy: Public-test pass probability at difficulty 0.
            pass_rate_hard: Public-test pass probability at difficulty 1.
            hidden_test_falloff: Accuracy lost on hidden tests, scaled by
                the problem's ``hidden_test_difficulty``.
            cost_per_attempt: Cost charged for every ``solve`` call.
            cost_per_verifier: Extra cost charged when the verifier runs.
        """
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        self.cost_per_verifier = cost_per_verifier
        # Lifetime counters; never reset by this class.
        self.attempts = 0
        self.verifier_calls = 0
        self.tokens_used = 0

    def solve(
        self,
        problem: "CodeProblem",
        use_verifier: bool = False,
        use_occ: bool = False,
        broker: Optional["ResourceBroker"] = None,
        ledger: Optional["CreditLedger"] = None,
    ) -> Dict:
        """Simulate one attempt at ``problem`` and report outcome and cost.

        Args:
            problem: Object exposing ``difficulty`` and
                ``hidden_test_difficulty``.
            use_verifier: When true and the public tests pass, charge
                ``cost_per_verifier`` on top of the attempt cost. The
                verifier does not alter the pass probabilities.
            use_occ: When true and both ``broker`` and ``ledger`` are
                supplied, ask the broker whether this model call is allowed.
            broker: Resource broker consulted for the ``"model_call"``
                capability.
            ledger: Credit ledger used to look up this agent's balance.

        Returns:
            A dict with keys ``public_pass``, ``hidden_pass``,
            ``compute_cost``, ``tokens`` and ``blocked``. When the broker
            denies the request, both pass flags are ``False`` and
            ``blocked`` is ``True``; the attempt cost is still reported.
        """
        self.attempts += 1
        self.tokens_used += self.cost_per_attempt
        compute_cost = self.cost_per_attempt

        # Linear interpolation between the easy and hard pass rates.
        base_acc = (
            self.pass_rate_easy * (1 - problem.difficulty)
            + self.pass_rate_hard * problem.difficulty
        )
        public_pass = random.random() < base_acc

        # Hidden tests are harder; the draw is independent of the public one.
        hidden_acc = base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty
        hidden_pass = random.random() < max(0.0, hidden_acc)

        if use_verifier and public_pass:
            self.verifier_calls += 1
            self.tokens_used += self.cost_per_verifier
            compute_cost += self.cost_per_verifier

        # Compare against None explicitly: a broker or ledger object that
        # happens to be falsy (e.g. defines __len__ and is empty) must still
        # be consulted rather than silently bypassing the gate.
        if use_occ and broker is not None and ledger is not None:
            balance = ledger.balance(self.agent_id, "model_call", "global")
            dec = broker.request("model_call", self.agent_id, balance)
            if dec.decision == Decision.DENY:
                return {
                    "public_pass": False,
                    "hidden_pass": False,
                    "compute_cost": compute_cost,
                    # Same amount that was added to tokens_used above.
                    "tokens": compute_cost,
                    "blocked": True,
                }

        return {
            "public_pass": public_pass,
            "hidden_pass": hidden_pass,
            "compute_cost": compute_cost,
            "tokens": compute_cost,
            "blocked": False,
        }
class CodeBenchmark:
    """Benchmark code compute allocation strategies.

    Generates a fixed set of simulated problems and runs three strategies
    over them: a single-attempt baseline, verifier-guided retries, and OCC
    tiered escalation across several agents. All randomness comes from the
    module-level ``random`` generator, which the constructor seeds once; the
    strategies therefore share one random stream and their results depend on
    the order in which they are run.
    """

    def __init__(self, n_problems: int = 50, seed: int = 42):
        """Seed the global RNGs, build the oracle, and generate problems.

        Args:
            n_problems: Number of simulated problems to generate.
            seed: Seed applied to both ``random`` and ``numpy.random``.
        """
        self.n_problems = n_problems
        self.seed = seed
        random.seed(seed)
        np.random.seed(seed)
        self.oracle = ImpactOracle(
            code_weights={
                "correctness": 1.0,
                "pass_at_k": 0.3,
                "regression": -0.5,
                "compute_penalty": 0.001,
            }
        )
        self.problems = self._generate_problems()

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        """Return ``numerator / denominator``, or 0.0 for a zero denominator."""
        return numerator / denominator if denominator else 0.0

    def _generate_problems(self) -> List["CodeProblem"]:
        """Create ``n_problems`` problems with uniformly random difficulties."""
        return [
            CodeProblem(
                task_id=f"task_{i}",
                difficulty=random.random(),
                hidden_test_difficulty=random.random(),
                public_test_difficulty=random.random(),
            )
            for i in range(self.n_problems)
        ]

    def run_fixed_budget(self, agent: "SimulatedCodeAgent", max_attempts: int = 1) -> Dict:
        """Baseline: fixed compute per problem.

        Every problem gets exactly one ``agent.solve`` call without the
        verifier.

        Args:
            agent: Agent used for every problem.
            max_attempts: Accepted for signature parity with the other
                strategies; currently not used (always one attempt).

        Returns:
            Summary dict with pass rates, compute totals, and the number of
            attempts / verifier calls made during this run.
        """
        # Snapshot the agent's lifetime counters so the summary reports only
        # the work done by this run, even if the agent was used before.
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0
        for problem in self.problems:
            r = agent.solve(problem, use_verifier=False)
            total_compute += r["compute_cost"]
            results.append({
                "task_id": problem.task_id,
                "public_pass": r["public_pass"],
                "hidden_pass": r["hidden_pass"],
                "compute_cost": r["compute_cost"],
            })

        n = len(results)
        return {
            "strategy": "fixed_budget",
            "pass_at_1": self._ratio(sum(1 for r in results if r["public_pass"]), n),
            "hidden_pass": self._ratio(sum(1 for r in results if r["hidden_pass"]), n),
            "total_compute": total_compute,
            "mean_compute": self._ratio(total_compute, n),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_verifier_guided(self, agent: "SimulatedCodeAgent", max_attempts: int = 3) -> Dict:
        """Verifier-guided: retry on public test failure.

        For each problem the agent is called with the verifier enabled until
        the public tests pass or ``max_attempts`` is reached. The hidden-test
        outcome recorded is the one from the final attempt.

        Args:
            agent: Agent used for every attempt.
            max_attempts: Maximum attempts per problem.

        Returns:
            Summary dict. ``pass_at_1`` is the fraction of problems whose
            public tests passed within the attempt budget. ``hidden_pass``
            and ``pass_at_k`` both hold the hidden-test pass fraction
            (``pass_at_k`` is kept for backward compatibility).
        """
        attempts_before = agent.attempts
        verifier_before = agent.verifier_calls

        results = []
        total_compute = 0
        for problem in self.problems:
            passed = False
            hidden_passed = False
            attempts = 0
            cost = 0
            while attempts < max_attempts and not passed:
                attempts += 1
                r = agent.solve(problem, use_verifier=True)
                cost += r["compute_cost"]
                passed = r["public_pass"]
                hidden_passed = r["hidden_pass"]
            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": passed,
                "hidden_pass": hidden_passed,
                "attempts": attempts,
                "compute_cost": cost,
            })

        n = len(results)
        hidden_rate = self._ratio(sum(1 for r in results if r["hidden_pass"]), n)
        return {
            "strategy": "verifier_guided",
            "pass_at_1": self._ratio(sum(1 for r in results if r["public_pass"]), n),
            # Same key the other strategies use, so reports can compare them.
            "hidden_pass": hidden_rate,
            "pass_at_k": hidden_rate,
            "total_compute": total_compute,
            "mean_compute": self._ratio(total_compute, n),
            "mean_attempts": self._ratio(sum(r["attempts"] for r in results), n),
            "n_attempts": agent.attempts - attempts_before,
            "verifier_calls": agent.verifier_calls - verifier_before,
        }

    def run_occ_allocation(self, agents: List["SimulatedCodeAgent"], max_attempts: int = 3) -> Dict:
        """OCC: try cheapest agent first, escalate on failure.

        Agents are ranked by cost per unit of expected quality (lowest
        first). For each problem, agents are tried in that order until the
        public tests pass, the hidden tests pass, ``max_attempts`` agents
        have been used, or the ranking is exhausted. After every attempt the
        oracle scores the outcome and the agent's ledger balance is credited
        or debited.

        Args:
            agents: Candidate agents; each is seeded with initial credit.
            max_attempts: Maximum number of agents tried per problem.

        Returns:
            Summary dict with pass rates, compute totals, mean agents used
            per problem, and attempts / verifier calls made during this run.
        """
        ledger = CreditLedger(decay_lambda=0.002)
        broker = ResourceBroker()

        # Agents may have been used by an earlier strategy; report only the
        # attempts made here.
        attempts_before = sum(a.attempts for a in agents)
        verifier_before = sum(a.verifier_calls for a in agents)

        # Seed agents with credits proportional to their expected quality
        for agent in agents:
            expected_quality = (agent.pass_rate_easy + agent.pass_rate_hard) / 2
            ledger.earn(
                agent_id=agent.agent_id,
                task_id="seed",
                action_id="seed",
                amount=expected_quality * 20,
                oracle_score=0.0,
                compute_cost=0.0,
                reason="initial_quality_estimate",
                capability_scope="model_call",
            )

        # Rank by cost per unit of expected quality, cheapest first. The
        # ranking depends only on static agent attributes, so compute it
        # once instead of once per problem.
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
        )

        results = []
        total_compute = 0
        for problem in self.problems:
            solved = False
            hidden_passed = False
            cost = 0
            used_agents = []
            for agent in ranked:
                if solved:
                    break
                if len(used_agents) >= max_attempts:
                    break
                r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger)
                cost += r["compute_cost"]
                used_agents.append(agent.agent_id)
                if not r["blocked"]:
                    solved = r["public_pass"]
                    hidden_passed = r["hidden_pass"]
                # Credit update
                # NOTE(review): this also runs for blocked attempts, scoring
                # the previous (failed) state against the blocked agent --
                # confirm that debiting a denied agent is intended.
                oracle_res = self.oracle.score(
                    mode="code",
                    action={"attempt": len(used_agents)},
                    context={"difficulty": problem.difficulty},
                    result={
                        "correctness": 1.0 if solved else 0.0,
                        "pass_at_k": 1.0 if hidden_passed else 0.0,
                        # Cumulative cost for this problem so far.
                        "compute_cost": cost,
                        "public_pass": solved,
                        "hidden_tests_pass": hidden_passed,
                    },
                    agent_id=agent.agent_id,
                )
                if oracle_res.raw_score > 0:
                    ledger.earn(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=oracle_res.raw_score * 5,
                        oracle_score=oracle_res.raw_score,
                        compute_cost=cost,
                        reason="successful_solve",
                        capability_scope="model_call",
                    )
                else:
                    ledger.spend(
                        agent_id=agent.agent_id,
                        task_id=problem.task_id,
                        action_id="solve",
                        amount=1.0,
                        capability_scope="model_call",
                        reason="failed_solve",
                    )
                # OCC: stop immediately if hidden tests pass (can't improve further)
                # NOTE(review): this uses the hidden-test outcome as a
                # stopping signal, which the other strategies do not.
                if hidden_passed:
                    break
                # OCC: if cheap agent failed, try next; if all failed, stop
                if not solved and agent == ranked[-1]:
                    break
            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": solved,
                "hidden_pass": hidden_passed,
                "compute_cost": cost,
                "agents_used": used_agents,
            })

        n = len(results)
        return {
            "strategy": "occ_allocation",
            "pass_at_1": self._ratio(sum(1 for r in results if r["public_pass"]), n),
            "hidden_pass": self._ratio(sum(1 for r in results if r["hidden_pass"]), n),
            "total_compute": total_compute,
            "mean_compute": self._ratio(total_compute, n),
            "mean_agents": self._ratio(sum(len(r["agents_used"]) for r in results), n),
            "n_attempts": sum(a.attempts for a in agents) - attempts_before,
            "verifier_calls": sum(a.verifier_calls for a in agents) - verifier_before,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run all strategies and compare.

        Key design: baseline uses expensive agent (simulating always-GPT-4),
        while OCC tries cheap first and escalates only on failure.
        This creates strong compute savings at iso-accuracy.

        Returns:
            Mapping of strategy label to its summary dict. The OCC entry
            additionally gets ``compute_savings`` and ``accuracy_delta``
            relative to the baseline when the baseline used any compute.
        """
        cheap_agent = SimulatedCodeAgent(
            "cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
            cost_per_attempt=60, hidden_test_falloff=0.20,
        )
        medium_agent = SimulatedCodeAgent(
            "medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
            cost_per_attempt=150, hidden_test_falloff=0.15,
        )
        expensive_agent = SimulatedCodeAgent(
            "expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
            cost_per_attempt=350, hidden_test_falloff=0.10,
        )

        # Baseline: always use the best (expensive) agent - simulates always-GPT-4
        baseline = self.run_fixed_budget(expensive_agent, max_attempts=1)

        # Verifier-guided: expensive agent with retries
        verifier = self.run_verifier_guided(
            SimulatedCodeAgent(
                "verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
                cost_per_attempt=350, hidden_test_falloff=0.10,
            ),
            max_attempts=3,
        )

        # OCC: tiered escalation cheap -> medium -> expensive
        occ = self.run_occ_allocation([cheap_agent, medium_agent, expensive_agent], max_attempts=3)

        results = {
            "baseline_fixed": baseline,
            "verifier_guided": verifier,
            "occ_allocation": occ,
        }

        # Compute savings
        baseline_compute = baseline["total_compute"]
        if baseline_compute > 0:
            occ_compute = occ["total_compute"]
            occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute)
            occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"]
        return results
def main(report_dir: str = "/app/occ/reports"):
    """Run the benchmark, print a summary, and save the results as JSON.

    Args:
        report_dir: Directory that receives ``benchmark_code_results.json``.
            Created (including parents) if it does not exist.
    """
    bench = CodeBenchmark(n_problems=50, seed=42)
    results = bench.run_all()

    print("\n" + "=" * 60)
    print("CODE COMPUTE ALLOCATION BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        # Some strategy summaries expose the hidden-test rate only under
        # "pass_at_k"; fall back to it so the report does not show 0.
        hidden_rate = res.get("hidden_pass", res.get("pass_at_k", 0))
        print(f"\n{label}")
        print(f" pass@1: {res.get('pass_at_1', 0):.3f}")
        print(f" hidden_pass: {hidden_rate:.3f}")
        print(f" total_compute: {res['total_compute']:.0f}")
        print(f" mean_compute: {res['mean_compute']:.0f}")
        if "compute_savings" in res:
            print(f" compute_savings: {res['compute_savings']:.1%}")
        if "accuracy_delta" in res:
            print(f" accuracy_delta: {res['accuracy_delta']:+.3f}")

    out_dir = Path(report_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "benchmark_code_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    # Report the path actually written rather than a hard-coded relative one.
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()