""" Unified evaluation runner: all ablations + anti-gaming tests. Runs simulated benchmarks under 10 ablation conditions and 6 anti-gaming attacks. """ import json import random import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Tuple import numpy as np # Ensure imports work sys.path.insert(0, str(Path(__file__).parent)) from oracle.oracle import ImpactOracle from ledger.ledger import CreditLedger from broker.broker import ResourceBroker, Decision from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent from benchmarks.benchmark_retrieval_qa import ( QABenchmark, SimulatedAgent, create_qa_dataset, ) from benchmarks.benchmark_debate_v2 import ( DebateBenchmark, FactualAgent, OverconfidentAgent, UncertainAgent, SycophantAgent, ) @dataclass class AblConfig: name: str description: str oracle_weights: Dict[str, Any] broker_thresholds: Dict[str, float] decay_lambda: float gaming_penalty: float compute_penalty_rate: float anti_gaming_on: bool ABLATIONS = [ AblConfig("default", "Full OCC stack", {}, {}, 0.02, 2.0, 0.0001, True), AblConfig("no_decay", "No credit decay (lambda=0)", {}, {}, 0.0, 2.0, 0.0001, True), AblConfig("fast_decay", "Aggressive decay (lambda=0.1)", {}, {}, 0.1, 2.0, 0.0001, True), AblConfig("no_gaming_penalty", "No gaming penalties", {}, {}, 0.02, 0.0, 0.0001, True), AblConfig("high_gaming_penalty", "Severe gaming penalties (5.0)", {}, {}, 0.02, 5.0, 0.0001, True), AblConfig("lenient_broker", "Lenient broker (thresholds x0.5)", {}, {"low": 0.25, "medium": 1.0, "high": 2.5}, 0.02, 2.0, 0.0001, True), AblConfig("strict_broker", "Strict broker (thresholds x2.0)", {}, {"low": 1.0, "medium": 4.0, "high": 10.0}, 0.02, 2.0, 0.0001, True), AblConfig("high_compute_cost", "High compute penalty (x10)", {}, {}, 0.02, 2.0, 0.001, True), AblConfig("low_compute_cost", "Low compute penalty (x0.1)", {}, {}, 0.02, 2.0, 0.00001, True), AblConfig("anti_gaming_off", "Disable all anti-gaming detectors", {}, {}, 0.02, 2.0, 0.0001, False), ] def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict: random.seed(seed) np.random.seed(seed) oracle = ImpactOracle( code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001}, compute_penalty_rate=config.compute_penalty_rate, gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0, ) ledger = CreditLedger(decay_lambda=config.decay_lambda) broker = ResourceBroker(thresholds=config.broker_thresholds) bench = CodeBenchmark(n_problems=n_problems, seed=seed) cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60) medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150) expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350) # Seed ledger for a in [cheap, medium, expensive]: q = (a.pass_rate_easy + a.pass_rate_hard) / 2 ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call") # Override benchmark's oracle/ledger/broker results = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3) # (the benchmark internally uses its own instances; we use the standalone below) # Actually the benchmark creates its own objects. Let's run standalone: return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed) def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed): random.seed(seed) np.random.seed(seed) bench = CodeBenchmark(n_problems=n_problems, seed=seed) agents = [cheap, medium, expensive] for a in agents: q = (a.pass_rate_easy + a.pass_rate_hard) / 2 ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call") total_compute = 0 results = [] for problem in bench.problems: solved = False cost = 0 used = [] ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2)) for agent in ranked: if solved or len(used) >= 3: break a.attempts += 1 r = agent.solve(problem) cost += r["compute_cost"] total_compute += r["compute_cost"] used.append(agent.agent_id) solved = r["public_pass"] hidden = r["hidden_pass"] oracle_res = oracle.score( "code", {"attempt": len(used)}, {}, {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0, "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden}, agent_id=agent.agent_id, ) if oracle_res.raw_score > 0: ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5, oracle_res.raw_score, cost, "pass", "model_call") else: ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail") if hidden: break results.append({"solved": solved, "cost": cost, "agents": used}) acc = sum(1 for r in results if r["solved"]) / len(results) return { "accuracy": acc, "total_compute": total_compute, "mean_compute": total_compute / len(results), "mean_agents": sum(len(r["agents"]) for r in results) / len(results), } def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict: random.seed(seed) np.random.seed(seed) oracle = ImpactOracle( compute_penalty_rate=config.compute_penalty_rate, gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0, ) ledger = CreditLedger(decay_lambda=config.decay_lambda) broker = ResourceBroker(thresholds=config.broker_thresholds) data = create_qa_dataset(seed=seed) bench = QABenchmark(data, oracle, ledger, broker, seed=seed) agent = SimulatedAgent("qa_agent", oracle, ledger, broker, 0.85) agent.budget = 50000 agent.strategy = "adaptive" results = bench.run_occ(agent) return { "accuracy": results["accuracy"], "total_compute": results["total_compute"], "mean_compute": results["mean_compute"], "precision": results.get("precision", 0), "recall": results.get("recall", 0), } def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict: random.seed(seed) np.random.seed(seed) oracle = ImpactOracle( compute_penalty_rate=config.compute_penalty_rate, gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0, ) ledger = CreditLedger(decay_lambda=config.decay_lambda) broker = ResourceBroker(thresholds=config.broker_thresholds) bench = DebateBenchmark( oracle=oracle, ledger=ledger, broker=broker, factual_accuracy=0.9, adversarial_ratio=0.0, n_agents=3, seed=seed ) agents = [FactualAgent(f"f{i}", 0.9) for i in range(3)] results = bench.run_debate(agents) return { "accuracy": results["accuracy"], "consensus_reached": results["consensus_reached"], "total_compute": results["total_compute"], "mean_compute": results["mean_compute"], } def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict: """Test anti-gaming: create a gaming agent that always passes public but fails hidden.""" random.seed(seed) np.random.seed(seed) oracle = ImpactOracle(gaming_penalty=2.0) ledger = CreditLedger() broker = ResourceBroker() bench = CodeBenchmark(n_problems=n_problems, seed=seed) # Normal agent normal = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150) # Gaming agent: high public pass, high hidden fail gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100) ledger.earn("normal", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call") ledger.earn("gamer", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call") results = {"normal": [], "gamer": []} for problem in bench.problems: for agent in [normal, gamer]: r = agent.solve(problem) oracle_res = oracle.score( "code", {}, {}, {"correctness": 1.0 if r["public_pass"] else 0.0, "pass_at_k": 1.0 if r["hidden_pass"] else 0.0, "compute_cost": r["compute_cost"], "public_pass": r["public_pass"], "hidden_tests_pass": r["hidden_pass"]}, agent_id=agent.agent_id, ) results[agent.agent_id].append({ "public_pass": r["public_pass"], "hidden_pass": r["hidden_pass"], "raw_score": oracle_res.raw_score, "penalized": "gaming_hidden_tests" in oracle_res.failure_tags, }) def summarize(who): rs = results[who] return { "public_acc": sum(1 for r in rs if r["public_pass"]) / len(rs), "hidden_acc": sum(1 for r in rs if r["hidden_pass"]) / len(rs), "mean_raw": sum(r["raw_score"] for r in rs) / len(rs), "penalized_rate": sum(1 for r in rs if r["penalized"]) / len(rs), } return {"normal": summarize("normal"), "gamer": summarize("gamer")} def run_anti_gaming_collusion(seed: int = 42) -> Dict: """Test that credit transfers are always blocked.""" ledger = CreditLedger() ledger.earn("alice", "seed", "seed", 10, 0.0, 0.0, "initial") ledger.earn("bob", "seed", "seed", 1, 0.0, 0.0, "initial") ok = ledger.transfer("alice", "bob", 5.0, "global") alice_bal = ledger.balance("alice") bob_bal = ledger.balance("bob") collusion = ledger.detect_collusion(window=10) return { "transfer_allowed": ok, "alice_balance": alice_bal, "bob_balance": bob_bal, "collusion_detected": bool(collusion), "transfer_blocked": not ok, } def run_anti_gaming_abstention(seed: int = 42) -> Dict: """Test over-abstention penalty.""" oracle = ImpactOracle() # Agent abstains on everything results = [] for i in range(10): res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"}, {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50}) results.append(res.reward_value) return {"mean_reward": sum(results) / len(results), "expected_negative": sum(results) < 0} def run_anti_gaming_spam(seed: int = 42) -> Dict: """Test spam detection: high compute, low score.""" oracle = ImpactOracle() # High compute but wrong answer res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"}, {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000}) return {"reward": res.reward_value, "tagged": bool(res.failure_tags), "tags": res.failure_tags} def run_all() -> Dict: print("=" * 60) print("OCC UNIFIED EVALUATION RUNNER") print("=" * 60) all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}} # Ablations for abl in ABLATIONS: print(f"\n--- ABLATION: {abl.name} ---") print(f" {abl.description}") code_res = run_ablation_code(abl, seed=42, n_problems=50) qa_res = run_ablation_qa(abl, seed=42) debate_res = run_ablation_debate(abl, seed=42) print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}") print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}") print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}") all_results["ablations"][abl.name] = { "config": abl.__dict__, "code": code_res, "qa": qa_res, "debate": debate_res, } # Anti-gaming print("\n--- ANTI-GAMING TESTS ---") all_results["anti_gaming"]["hidden_test_gaming"] = run_anti_gaming_code(seed=42) all_results["anti_gaming"]["collusion"] = run_anti_gaming_collusion(seed=42) all_results["anti_gaming"]["abstention"] = run_anti_gaming_abstention(seed=42) all_results["anti_gaming"]["spam"] = run_anti_gaming_spam(seed=42) for test_name, res in all_results["anti_gaming"].items(): print(f"\n {test_name}: {json.dumps(res, indent=2, default=str)}") # Save out = Path("/app/occ/reports") out.mkdir(parents=True, exist_ok=True) with open(out / "eval_runner_results.json", "w") as f: json.dump(all_results, f, indent=2, default=str) print(f"\nSaved to {out / 'eval_runner_results.json'}") return all_results if __name__ == "__main__": run_all()