| """ |
| Unified evaluation runner: all ablations + anti-gaming tests. |
| Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests. |
| """ |
| import json |
| import random |
| import sys |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Any, Dict, List, Tuple |
|
|
| import numpy as np |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent)) |
| from oracle.oracle import ImpactOracle |
| from ledger.ledger import CreditLedger |
| from broker.broker import ResourceBroker, Decision |
| from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent |
| from benchmarks.benchmark_retrieval_qa import ( |
| QABenchmark, |
| SimulatedAgent, |
| create_qa_dataset, |
| ) |
| from benchmarks.benchmark_debate_v2 import ( |
| DebateBenchmark, |
| FactualAgent, |
| OverconfidentAgent, |
| UncertainAgent, |
| SycophantAgent, |
| ) |
|
|
|
|
@dataclass
class AblConfig:
    """Settings for one ablation condition of the evaluation runner.

    Fields are declared in the order used by positional construction in
    the ``ABLATIONS`` table, so the order must not change.
    """

    # Short identifier; used as the key under which results are stored.
    name: str
    # Human-readable summary printed alongside the name.
    description: str
    # Oracle weight overrides.
    # NOTE(review): no runner function visible in this module reads this
    # field - confirm whether it is consumed elsewhere.
    oracle_weights: Dict[str, Any]
    # Forwarded as ResourceBroker(thresholds=...).
    broker_thresholds: Dict[str, float]
    # Forwarded as CreditLedger(decay_lambda=...).
    decay_lambda: float
    # Forwarded as ImpactOracle(gaming_penalty=...) when anti_gaming_on is True.
    gaming_penalty: float
    # Forwarded as ImpactOracle(compute_penalty_rate=...).
    compute_penalty_rate: float
    # When False the runners pass a gaming penalty of 0.0 to the oracle.
    anti_gaming_on: bool
|
|
|
|
# Ablation grid. Each entry after "default" changes exactly one setting
# relative to the "default" entry.
ABLATIONS = [
    AblConfig(
        name="default",
        description="Full OCC stack",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="no_decay",
        description="No credit decay (lambda=0)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.0,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="fast_decay",
        description="Aggressive decay (lambda=0.1)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.1,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="no_gaming_penalty",
        description="No gaming penalties",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=0.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="high_gaming_penalty",
        description="Severe gaming penalties (5.0)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=5.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="lenient_broker",
        description="Lenient broker (thresholds x0.5)",
        oracle_weights={},
        broker_thresholds={"low": 0.25, "medium": 1.0, "high": 2.5},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="strict_broker",
        description="Strict broker (thresholds x2.0)",
        oracle_weights={},
        broker_thresholds={"low": 1.0, "medium": 4.0, "high": 10.0},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="high_compute_cost",
        description="High compute penalty (x10)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="low_compute_cost",
        description="Low compute penalty (x0.1)",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.00001,
        anti_gaming_on=True,
    ),
    AblConfig(
        name="anti_gaming_off",
        description="Disable all anti-gaming detectors",
        oracle_weights={},
        broker_thresholds={},
        decay_lambda=0.02,
        gaming_penalty=2.0,
        compute_penalty_rate=0.0001,
        anti_gaming_on=False,
    ),
]
|
|
|
|
def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
    """Run the code benchmark under one ablation configuration.

    Builds the oracle, ledger and broker from ``config``, creates the three
    simulated code agents and delegates the allocation loop to
    ``_run_occ_code_standalone``.

    Two things are deliberately left to the delegate:

    * Seeding the ledger with initial credit. The delegate already does
      this for every agent; doing it here as well would double each
      agent's starting balance.
    * Building the benchmark and running it. A separate
      ``bench.run_occ_allocation(...)`` pass is not performed here because
      its result would be discarded, it does not use the configured
      oracle/ledger/broker, and it would exercise the same agent objects
      before the measured run.

    Args:
        config: Ablation settings. ``oracle_weights`` is not consulted here;
            the code-task weights are fixed below.
        seed: Seed applied to ``random`` and ``numpy.random`` and forwarded
            to the delegate.
        n_problems: Number of benchmark problems, forwarded to the delegate.

    Returns:
        The metrics dict produced by ``_run_occ_code_standalone``.
    """
    random.seed(seed)
    np.random.seed(seed)

    # With anti-gaming disabled the oracle receives a zero gaming penalty.
    effective_gaming_penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    oracle = ImpactOracle(
        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=effective_gaming_penalty,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
    medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
    expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)

    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)
|
|
|
|
def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed):
    """Run the cost-aware allocation loop over a fresh code benchmark.

    For every problem the agents are tried in ascending order of
    ``cost_per_attempt / mean pass rate`` until one passes the public tests
    (or three agents have been used). Every attempt is scored by ``oracle``
    and booked in ``ledger`` as an earn (positive raw score) or a spend.

    Args:
        oracle: Scorer exposing ``score(...)`` whose result has ``raw_score``.
        ledger: Credit ledger exposing ``earn(...)`` and ``spend(...)``.
        broker: Accepted for signature compatibility; not used in this loop.
        cheap, medium, expensive: Simulated code agents.
        n_problems: Number of problems to generate.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.

    Returns:
        Dict with ``accuracy`` (fraction of problems whose last attempt
        passed the public tests), ``total_compute``, ``mean_compute`` and
        ``mean_agents``. With zero problems the ratios are reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    random.seed(seed)
    np.random.seed(seed)
    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    agents = [cheap, medium, expensive]

    # Give every agent starting credit proportional to its mean pass rate.
    for candidate in agents:
        quality = (candidate.pass_rate_easy + candidate.pass_rate_hard) / 2
        ledger.earn(candidate.agent_id, "seed", "seed", quality * 20, 0.0, 0.0, "initial", "model_call")

    total_compute = 0
    results = []
    for problem in bench.problems:
        solved = False
        cost = 0
        used = []
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
        )
        for agent in ranked:
            if solved or len(used) >= 3:
                break
            # Count the attempt against the agent that actually makes it.
            # (Previously a stale variable from the seeding loop was used,
            # so only the last agent in `agents` was ever incremented.)
            agent.attempts += 1
            r = agent.solve(problem)
            cost += r["compute_cost"]
            total_compute += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            oracle_res = oracle.score(
                "code", {"attempt": len(used)}, {},
                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                agent_id=agent.agent_id,
            )
            if oracle_res.raw_score > 0:
                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
                            oracle_res.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    n_results = len(results)
    if n_results == 0:
        # No problems were generated; avoid dividing by zero.
        return {
            "accuracy": 0.0,
            "total_compute": total_compute,
            "mean_compute": 0.0,
            "mean_agents": 0.0,
        }

    acc = sum(1 for r in results if r["solved"]) / n_results
    return {
        "accuracy": acc,
        "total_compute": total_compute,
        "mean_compute": total_compute / n_results,
        "mean_agents": sum(len(r["agents"]) for r in results) / n_results,
    }
|
|
|
|
def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
    """Run the retrieval-QA benchmark under one ablation configuration.

    Args:
        config: Ablation settings used to build the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random``, the dataset and the
            benchmark.

    Returns:
        Dict with ``accuracy``, ``total_compute`` and ``mean_compute`` copied
        from the benchmark result, plus ``precision`` and ``recall`` which
        default to 0 when the benchmark does not report them.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Anti-gaming switched off means the oracle gets a zero gaming penalty.
    effective_penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=effective_penalty,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    dataset = create_qa_dataset(seed=seed)
    benchmark = QABenchmark(dataset, oracle, ledger, broker, seed=seed)

    qa_agent = SimulatedAgent("qa_agent", oracle, ledger, broker, 0.85)
    qa_agent.budget = 50000
    qa_agent.strategy = "adaptive"

    outcome = benchmark.run_occ(qa_agent)

    # Required metrics raise KeyError if missing; optional ones fall back to 0.
    summary = {key: outcome[key] for key in ("accuracy", "total_compute", "mean_compute")}
    summary["precision"] = outcome.get("precision", 0)
    summary["recall"] = outcome.get("recall", 0)
    return summary
|
|
|
|
def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
    """Run the debate benchmark under one ablation configuration.

    Three ``FactualAgent`` instances (ids ``f0``..``f2``) take part in a
    single ``run_debate`` call.

    Args:
        config: Ablation settings used to build the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_debates: Accepted but not read anywhere in this function.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute`` copied from the benchmark result.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Anti-gaming switched off means the oracle gets a zero gaming penalty.
    effective_penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=effective_penalty,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    benchmark = DebateBenchmark(
        oracle=oracle,
        ledger=ledger,
        broker=broker,
        factual_accuracy=0.9,
        adversarial_ratio=0.0,
        n_agents=3,
        seed=seed,
    )

    panel = []
    for i in range(3):
        panel.append(FactualAgent(f"f{i}", 0.9))

    outcome = benchmark.run_debate(panel)
    reported = ("accuracy", "consensus_reached", "total_compute", "mean_compute")
    return {key: outcome[key] for key in reported}
|
|
|
|
def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
    """Compare how the oracle scores a normal agent and a "gamer" agent.

    Both agents attempt every benchmark problem and each attempt is scored
    by an oracle built with ``gaming_penalty=2.0``. An attempt counts as
    penalized when the oracle result carries the ``gaming_hidden_tests``
    failure tag.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_problems: Number of benchmark problems.

    Returns:
        ``{"normal": stats, "gamer": stats}`` where each ``stats`` dict has
        ``public_acc``, ``hidden_acc``, ``mean_raw`` and ``penalized_rate``.
    """
    random.seed(seed)
    np.random.seed(seed)
    oracle = ImpactOracle(gaming_penalty=2.0)
    ledger = CreditLedger()
    broker = ResourceBroker()

    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    normal = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150)
    gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100)

    # Both agents start from the same credit balance.
    for agent_name in ("normal", "gamer"):
        ledger.earn(agent_name, "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")

    per_agent = {"normal": [], "gamer": []}
    for problem in bench.problems:
        for contestant in (normal, gamer):
            attempt = contestant.solve(problem)
            public_ok = attempt["public_pass"]
            hidden_ok = attempt["hidden_pass"]
            metrics = {
                "correctness": 1.0 if public_ok else 0.0,
                "pass_at_k": 1.0 if hidden_ok else 0.0,
                "compute_cost": attempt["compute_cost"],
                "public_pass": public_ok,
                "hidden_tests_pass": hidden_ok,
            }
            verdict = oracle.score("code", {}, {}, metrics, agent_id=contestant.agent_id)
            per_agent[contestant.agent_id].append({
                "public_pass": public_ok,
                "hidden_pass": hidden_ok,
                "raw_score": verdict.raw_score,
                "penalized": "gaming_hidden_tests" in verdict.failure_tags,
            })

    def fraction(rows, field):
        # Share of rows whose `field` is truthy.
        return sum(1 for row in rows if row[field]) / len(rows)

    report = {}
    for who in ("normal", "gamer"):
        rows = per_agent[who]
        report[who] = {
            "public_acc": fraction(rows, "public_pass"),
            "hidden_acc": fraction(rows, "hidden_pass"),
            "mean_raw": sum(row["raw_score"] for row in rows) / len(rows),
            "penalized_rate": fraction(rows, "penalized"),
        }
    return report
|
|
|
|
def run_anti_gaming_collusion(seed: int = 42) -> Dict:
    """Attempt a credit transfer between two accounts and report the outcome.

    "alice" is seeded with 10 credits and "bob" with 1, then a transfer of
    5.0 from alice to bob is attempted and the ledger's collusion detector
    is queried.

    Args:
        seed: Accepted for a uniform runner signature; not used here.

    Returns:
        Dict with the transfer result, both balances after the attempt,
        whether collusion was detected, and ``transfer_blocked`` (the
        negation of the transfer result).
    """
    ledger = CreditLedger()
    for holder, amount in (("alice", 10), ("bob", 1)):
        ledger.earn(holder, "seed", "seed", amount, 0.0, 0.0, "initial")

    transfer_ok = ledger.transfer("alice", "bob", 5.0, "global")
    balances = {who: ledger.balance(who) for who in ("alice", "bob")}
    flagged = ledger.detect_collusion(window=10)

    return {
        "transfer_allowed": transfer_ok,
        "alice_balance": balances["alice"],
        "bob_balance": balances["bob"],
        "collusion_detected": bool(flagged),
        "transfer_blocked": not transfer_ok,
    }
|
|
|
|
def run_anti_gaming_abstention(seed: int = 42) -> Dict:
    """Score ten abstentions on an answerable question.

    Each call submits an abstained answer (``answer=None``) for a question
    marked as answerable and collects the oracle's ``reward_value``.

    Args:
        seed: Accepted for a uniform runner signature; not used here.

    Returns:
        Dict with ``mean_reward`` and ``expected_negative`` (True when the
        summed reward is below zero).
    """
    oracle = ImpactOracle()
    rewards = [
        oracle.score(
            "retrieval_qa",
            {"abstained": True},
            {"is_unanswerable": False, "gold_answer": "yes"},
            {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
        ).reward_value
        for _ in range(10)
    ]
    total = sum(rewards)
    return {"mean_reward": total / len(rewards), "expected_negative": total < 0}
|
|
|
|
def run_anti_gaming_spam(seed: int = 42) -> Dict:
    """Score one wrong, low-confidence, high-compute QA answer.

    Args:
        seed: Accepted for a uniform runner signature; not used here.

    Returns:
        Dict with the oracle's ``reward``, a ``tagged`` flag that is True
        when any failure tag was attached, and the ``tags`` themselves.
    """
    oracle = ImpactOracle()
    reference = {"gold_answer": "paris"}
    submission = {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000}
    scored = oracle.score("retrieval_qa", {}, reference, submission)
    tags = scored.failure_tags
    return {"reward": scored.reward_value, "tagged": bool(tags), "tags": tags}
|
|
|
|
def run_all(out_dir: str = "/app/occ/reports", seed: int = 42) -> Dict:
    """Run every ablation and anti-gaming test, print and save the results.

    For each entry in ``ABLATIONS`` the code, QA and debate benchmarks are
    run and summarized on stdout. The four anti-gaming tests follow. The
    combined results are written to ``eval_runner_results.json`` inside
    ``out_dir`` (created if missing).

    Args:
        out_dir: Directory for the JSON report. Defaults to the location
            that was previously hard-coded.
        seed: Seed forwarded to every benchmark and test. Defaults to the
            value that was previously hard-coded.

    Returns:
        ``{"ablations": {...}, "anti_gaming": {...}}``. Each ablation entry
        holds a copy of its configuration plus the three benchmark results.
    """
    # Local import keeps this function self-contained.
    from dataclasses import asdict

    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER")
    print("=" * 60)

    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}

    for abl in ABLATIONS:
        print(f"\n--- ABLATION: {abl.name} ---")
        print(f"  {abl.description}")
        code_res = run_ablation_code(abl, seed=seed, n_problems=50)
        qa_res = run_ablation_qa(abl, seed=seed)
        debate_res = run_ablation_debate(abl, seed=seed)
        print(f"  Code:   acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
        print(f"  QA:     acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
        print(f"  Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
        all_results["ablations"][abl.name] = {
            # asdict() returns an independent copy, so callers mutating the
            # returned results cannot alter the shared ABLATIONS entries.
            "config": asdict(abl),
            "code": code_res,
            "qa": qa_res,
            "debate": debate_res,
        }

    print("\n--- ANTI-GAMING TESTS ---")
    anti_gaming = all_results["anti_gaming"]
    anti_gaming["hidden_test_gaming"] = run_anti_gaming_code(seed=seed)
    anti_gaming["collusion"] = run_anti_gaming_collusion(seed=seed)
    anti_gaming["abstention"] = run_anti_gaming_abstention(seed=seed)
    anti_gaming["spam"] = run_anti_gaming_spam(seed=seed)

    for test_name, res in anti_gaming.items():
        print(f"\n  {test_name}: {json.dumps(res, indent=2, default=str)}")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    report_path = out / "eval_runner_results.json"
    with report_path.open("w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nSaved to {report_path}")
    return all_results
|
|
|
|
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    run_all()
|
|