# File size: 13,890 Bytes

"""
Benchmark 1: Code Compute Allocation (simulated)
Compares fixed-budget, verifier-guided, and OCC allocation strategies.
Note: a GRPO strategy is not implemented in this module.
"""
import json
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np

import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from oracle.oracle import ImpactOracle
from ledger.ledger import CreditLedger
from broker.broker import ResourceBroker, Decision


@dataclass
class CodeProblem:
    """A single simulated coding task.

    ``CodeBenchmark._generate_problems`` fills every numeric field with a value
    drawn from ``random.random()``.
    """

    # Identifier of the task (the generator uses "task_<index>").
    task_id: str
    # Interpolates between an agent's easy and hard pass rates in
    # ``SimulatedCodeAgent.solve``.
    difficulty: float  # 0=easy, 1=hard
    # Scales how much an agent's ``hidden_test_falloff`` lowers its chance of
    # passing the hidden tests.
    hidden_test_difficulty: float
    # Generated for every problem but not read anywhere in this file.
    public_test_difficulty: float


class SimulatedCodeAgent:
    """Simulated code generation agent with quality/cost tradeoffs.

    Outcomes are sampled from the module-level ``random`` generator, so runs
    are reproducible only if the caller seeds it.

    Attributes:
        agent_id: Identifier used when talking to the ledger and broker.
        pass_rate_easy: Public-test pass probability at difficulty 0.
        pass_rate_hard: Public-test pass probability at difficulty 1.
        hidden_test_falloff: Maximum drop in pass probability on hidden tests.
        cost_per_attempt: Compute charged for every attempt that is executed.
        cost_per_verifier: Extra compute charged when the verifier runs.
        attempts: Number of attempts actually executed so far.
        verifier_calls: Number of verifier invocations so far.
        tokens_used: Total compute charged so far.
    """

    def __init__(
        self,
        agent_id: str,
        pass_rate_easy: float = 0.9,
        pass_rate_hard: float = 0.3,
        hidden_test_falloff: float = 0.15,
        cost_per_attempt: float = 200.0,
        cost_per_verifier: float = 50.0,
    ):
        self.agent_id = agent_id
        self.pass_rate_easy = pass_rate_easy
        self.pass_rate_hard = pass_rate_hard
        self.hidden_test_falloff = hidden_test_falloff
        self.cost_per_attempt = cost_per_attempt
        self.cost_per_verifier = cost_per_verifier
        self.attempts = 0
        self.verifier_calls = 0
        self.tokens_used = 0

    def solve(
        self,
        problem: CodeProblem,
        use_verifier: bool = False,
        use_occ: bool = False,
        broker: Optional[ResourceBroker] = None,
        ledger: Optional[CreditLedger] = None,
    ) -> Dict:
        """Simulate one attempt at ``problem`` and report its outcome.

        Args:
            problem: Task to attempt; ``difficulty`` and
                ``hidden_test_difficulty`` are read.
            use_verifier: When true, a verifier call is charged for every
                attempt that passes the public tests.
            use_occ: When true (and both ``broker`` and ``ledger`` are given),
                the broker is asked for permission before the attempt runs.
            broker: Resource broker consulted for the ``"model_call"`` request.
            ledger: Credit ledger providing the agent's current balance.

        Returns:
            A dict with ``public_pass``, ``hidden_pass``, ``compute_cost``,
            ``tokens`` and ``blocked``. A request denied by the broker returns
            ``blocked=True`` with both pass flags false and zero cost.
        """
        # Ask for permission *before* doing any work: a denied model call must
        # not be counted as an attempt, consume random draws, or be billed.
        # ``is not None`` (rather than truthiness) keeps the gate active even
        # for a ledger/broker object that happens to evaluate as falsy.
        if use_occ and broker is not None and ledger is not None:
            balance = ledger.balance(self.agent_id, "model_call", "global")
            dec = broker.request("model_call", self.agent_id, balance)
            if dec.decision == Decision.DENY:
                return {
                    "public_pass": False,
                    "hidden_pass": False,
                    "compute_cost": 0.0,
                    "tokens": 0.0,
                    "blocked": True,
                }

        self.attempts += 1
        self.tokens_used += self.cost_per_attempt
        compute_cost = self.cost_per_attempt

        # Base accuracy interpolates linearly between the easy and hard rates.
        base_acc = self.pass_rate_easy * (1 - problem.difficulty) + self.pass_rate_hard * problem.difficulty
        public_pass = random.random() < base_acc

        # Hidden tests are harder; the result is sampled independently of the
        # public-test draw, with the probability clamped at zero.
        hidden_acc = base_acc - self.hidden_test_falloff * problem.hidden_test_difficulty
        hidden_pass = random.random() < max(0.0, hidden_acc)

        # The verifier is only charged for attempts that pass the public tests.
        if use_verifier and public_pass:
            self.verifier_calls += 1
            self.tokens_used += self.cost_per_verifier
            compute_cost += self.cost_per_verifier

        return {
            "public_pass": public_pass,
            "hidden_pass": hidden_pass,
            "compute_cost": compute_cost,
            "tokens": compute_cost,
            "blocked": False,
        }


class CodeBenchmark:
    """Benchmark code compute allocation strategies."""

    def __init__(self, n_problems: int = 50, seed: int = 42):
        """Seed the RNGs, build the scoring oracle and generate the problems.

        Args:
            n_problems: Number of simulated problems to generate.
            seed: Seed applied to both the global ``random`` module and
                NumPy's global RNG.
        """
        self.n_problems, self.seed = n_problems, seed

        # Seeding must happen first: problem generation below draws from the
        # global ``random`` stream.
        for reseed in (random.seed, np.random.seed):
            reseed(seed)

        scoring_weights = {
            "correctness": 1.0,
            "pass_at_k": 0.3,
            "regression": -0.5,
            "compute_penalty": 0.001,
        }
        self.oracle = ImpactOracle(code_weights=scoring_weights)
        self.problems = self._generate_problems()

    def _generate_problems(self) -> List[CodeProblem]:
        """Build ``self.n_problems`` problems with randomly drawn difficulties.

        For each problem three values are taken from the global ``random``
        stream, in the order: difficulty, hidden-test difficulty, public-test
        difficulty.
        """
        problems: List[CodeProblem] = []
        for index in range(self.n_problems):
            # Draw order is significant for reproducibility under a fixed seed.
            overall = random.random()
            hidden = random.random()
            public = random.random()
            problems.append(
                CodeProblem(
                    task_id=f"task_{index}",
                    difficulty=overall,
                    hidden_test_difficulty=hidden,
                    public_test_difficulty=public,
                )
            )
        return problems

    def run_fixed_budget(self, agent: SimulatedCodeAgent, max_attempts: int = 1) -> Dict:
        """Baseline: fixed compute per problem."""
        results = []
        total_compute = 0

        for problem in self.problems:
            r = agent.solve(problem, use_verifier=False)
            total_compute += r["compute_cost"]
            results.append({
                "task_id": problem.task_id,
                "public_pass": r["public_pass"],
                "hidden_pass": r["hidden_pass"],
                "compute_cost": r["compute_cost"],
            })

        pass_at_1 = sum(1 for r in results if r["public_pass"]) / len(results)
        hidden_pass = sum(1 for r in results if r["hidden_pass"]) / len(results)
        return {
            "strategy": "fixed_budget",
            "pass_at_1": pass_at_1,
            "hidden_pass": hidden_pass,
            "total_compute": total_compute,
            "mean_compute": total_compute / len(results),
            "n_attempts": agent.attempts,
            "verifier_calls": agent.verifier_calls,
        }

    def run_verifier_guided(self, agent: SimulatedCodeAgent, max_attempts: int = 3) -> Dict:
        """Verifier-guided: retry on public test failure."""
        results = []
        total_compute = 0

        for problem in self.problems:
            passed = False
            hidden_passed = False
            attempts = 0
            cost = 0

            while attempts < max_attempts and not passed:
                attempts += 1
                r = agent.solve(problem, use_verifier=True)
                cost += r["compute_cost"]
                passed = r["public_pass"]
                hidden_passed = r["hidden_pass"]

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": passed,
                "hidden_pass": hidden_passed,
                "attempts": attempts,
                "compute_cost": cost,
            })

        pass_at_1 = sum(1 for r in results if r["public_pass"]) / len(results)
        pass_at_k = sum(1 for r in results if r["hidden_pass"]) / len(results)
        return {
            "strategy": "verifier_guided",
            "pass_at_1": pass_at_1,
            "pass_at_k": pass_at_k,
            "total_compute": total_compute,
            "mean_compute": total_compute / len(results),
            "mean_attempts": sum(r["attempts"] for r in results) / len(results),
            "n_attempts": agent.attempts,
            "verifier_calls": agent.verifier_calls,
        }

    def run_occ_allocation(self, agents: List[SimulatedCodeAgent], max_attempts: int = 3) -> Dict:
        """OCC: try the most cost-efficient agent first, escalate on failure.

        Agents are ranked once by ``cost_per_attempt`` divided by their mean
        configured pass rate (lowest first). For each problem at most
        ``max_attempts`` agents are tried in that order; escalation stops as
        soon as an attempt passes the public or the hidden tests. Every
        non-blocked attempt is scored by the oracle and the agent's credit is
        updated in a fresh ledger created for this run.

        Args:
            agents: Candidate agents; each is seeded with starting credit
                proportional to its mean configured pass rate.
            max_attempts: Maximum number of agents tried per problem.

        Returns:
            Summary dict with ``strategy``, ``pass_at_1``, ``hidden_pass``,
            ``total_compute``, ``mean_compute``, ``mean_agents`` and the
            ``n_attempts`` / ``verifier_calls`` performed during this run. All
            rates and means are 0.0 when there are no problems.
        """
        ledger = CreditLedger(decay_lambda=0.002)
        broker = ResourceBroker()

        # Snapshot lifetime counters so agents that were already used by
        # another strategy do not inflate this run's totals.
        attempts_before = sum(a.attempts for a in agents)
        verifier_before = sum(a.verifier_calls for a in agents)

        # Seed agents with credits proportional to their expected quality
        for agent in agents:
            expected_quality = (agent.pass_rate_easy + agent.pass_rate_hard) / 2
            ledger.earn(
                agent_id=agent.agent_id,
                task_id="seed",
                action_id="seed",
                amount=expected_quality * 20,
                oracle_score=0.0,
                compute_cost=0.0,
                reason="initial_quality_estimate",
                capability_scope="model_call",
            )

        # The ranking depends only on static agent configuration, so compute it
        # once: lowest cost per unit of expected quality first. The quality
        # term is floored at 0.1 to avoid dividing by (near) zero.
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
        )
        candidates = ranked[:max(0, max_attempts)]

        results = []
        total_compute = 0

        for problem in self.problems:
            solved = False
            hidden_passed = False
            cost = 0
            used_agents = []

            for agent in candidates:
                r = agent.solve(problem, use_occ=True, broker=broker, ledger=ledger)
                cost += r["compute_cost"]
                used_agents.append(agent.agent_id)

                if not r["blocked"]:
                    solved = r["public_pass"]
                    hidden_passed = r["hidden_pass"]

                    # Credit update.
                    # NOTE(review): ``cost`` is the cumulative compute spent on
                    # this problem so far, including earlier agents' attempts,
                    # not just this agent's own attempt - confirm that is the
                    # intended input for the oracle and the ledger.
                    oracle_res = self.oracle.score(
                        mode="code",
                        action={"attempt": len(used_agents)},
                        context={"difficulty": problem.difficulty},
                        result={
                            "correctness": 1.0 if solved else 0.0,
                            "pass_at_k": 1.0 if hidden_passed else 0.0,
                            "compute_cost": cost,
                            "public_pass": solved,
                            "hidden_tests_pass": hidden_passed,
                        },
                        agent_id=agent.agent_id,
                    )

                    if oracle_res.raw_score > 0:
                        ledger.earn(
                            agent_id=agent.agent_id,
                            task_id=problem.task_id,
                            action_id="solve",
                            amount=oracle_res.raw_score * 5,
                            oracle_score=oracle_res.raw_score,
                            compute_cost=cost,
                            reason="successful_solve",
                            capability_scope="model_call",
                        )
                    else:
                        ledger.spend(
                            agent_id=agent.agent_id,
                            task_id=problem.task_id,
                            action_id="solve",
                            amount=1.0,
                            capability_scope="model_call",
                            reason="failed_solve",
                        )

                # Stop escalating once the public tests pass or the hidden
                # tests pass (nothing left to improve). After a blocked attempt
                # both flags still hold the previous attempt's values.
                if solved or hidden_passed:
                    break

            total_compute += cost
            results.append({
                "task_id": problem.task_id,
                "public_pass": solved,
                "hidden_pass": hidden_passed,
                "compute_cost": cost,
                "agents_used": used_agents,
            })

        n_results = len(results)
        pass_at_1 = sum(1 for r in results if r["public_pass"]) / n_results if n_results else 0.0
        hidden_rate = sum(1 for r in results if r["hidden_pass"]) / n_results if n_results else 0.0
        return {
            "strategy": "occ_allocation",
            "pass_at_1": pass_at_1,
            "hidden_pass": hidden_rate,
            "total_compute": total_compute,
            "mean_compute": total_compute / n_results if n_results else 0.0,
            "mean_agents": sum(len(r["agents_used"]) for r in results) / n_results if n_results else 0.0,
            "n_attempts": sum(a.attempts for a in agents) - attempts_before,
            "verifier_calls": sum(a.verifier_calls for a in agents) - verifier_before,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run all strategies and compare.

        Key design: the baseline uses the expensive agent (simulating
        always-GPT-4), while OCC tries cheaper agents first and escalates only
        on failure, which is where compute savings at similar accuracy are
        expected to come from.

        Every strategy gets its own agent instances, so per-agent counters
        (attempts, verifier calls, tokens) never leak from one run to another.

        Returns:
            Mapping from ``"baseline_fixed"``, ``"verifier_guided"`` and
            ``"occ_allocation"`` to each strategy's summary dict. When the
            baseline spent any compute, the OCC summary additionally carries
            ``compute_savings`` and ``accuracy_delta`` relative to the baseline.
        """
        def make_expensive(agent_id: str) -> SimulatedCodeAgent:
            # One place for the top-tier configuration shared by all strategies.
            return SimulatedCodeAgent(
                agent_id,
                pass_rate_easy=0.95,
                pass_rate_hard=0.65,
                cost_per_attempt=350,
                hidden_test_falloff=0.10,
            )

        cheap_agent = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15, cost_per_attempt=60, hidden_test_falloff=0.20)
        medium_agent = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35, cost_per_attempt=150, hidden_test_falloff=0.15)

        # Baseline: always use the best (expensive) agent - simulates always-GPT-4
        baseline = self.run_fixed_budget(make_expensive("expensive"), max_attempts=1)

        # Verifier-guided: expensive agent with retries
        verifier = self.run_verifier_guided(make_expensive("verifier"), max_attempts=3)

        # OCC: tiered escalation cheap -> medium -> expensive, using a fresh
        # expensive agent rather than the one the baseline already exercised.
        occ = self.run_occ_allocation(
            [cheap_agent, medium_agent, make_expensive("expensive")],
            max_attempts=3,
        )

        results = {
            "baseline_fixed": baseline,
            "verifier_guided": verifier,
            "occ_allocation": occ,
        }

        # Compute savings (skipped when the baseline spent nothing, to avoid
        # dividing by zero)
        baseline_compute = baseline["total_compute"]
        if baseline_compute > 0:
            occ_compute = occ["total_compute"]
            occ["compute_savings"] = 1.0 - (occ_compute / baseline_compute)
            occ["accuracy_delta"] = occ["pass_at_1"] - baseline["pass_at_1"]

        return results


def main(output_dir: str = "/app/occ/reports"):
    """Run every strategy, print a summary and save the results as JSON.

    Args:
        output_dir: Directory that receives ``benchmark_code_results.json``;
            it is created if missing. Defaults to the location this script
            has always written to.
    """
    bench = CodeBenchmark(n_problems=50, seed=42)
    results = bench.run_all()

    print("\n" + "=" * 60)
    print("CODE COMPUTE ALLOCATION BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        # The verifier-guided summary may expose its hidden-test pass rate only
        # under "pass_at_k"; fall back to it instead of printing 0.
        hidden_rate = res.get("hidden_pass", res.get("pass_at_k", 0))
        print(f"\n{label}")
        print(f"  pass@1: {res.get('pass_at_1', 0):.3f}")
        print(f"  hidden_pass: {hidden_rate:.3f}")
        print(f"  total_compute: {res['total_compute']:.0f}")
        print(f"  mean_compute: {res['mean_compute']:.0f}")
        if "compute_savings" in res:
            print(f"  compute_savings: {res['compute_savings']:.1%}")
        if "accuracy_delta" in res:
            print(f"  accuracy_delta: {res['accuracy_delta']:+.3f}")

    report_dir = Path(output_dir)
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / "benchmark_code_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    # Report the real destination rather than a fixed relative-looking path.
    print(f"\nSaved to {report_path}")


if __name__ == "__main__":
    main()