occ-stack / eval_runner.py
narcolepticchicken's picture
Upload eval_runner.py
ae2b06a verified
"""
Unified evaluation runner: all ablations + anti-gaming tests.
Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests.
"""
import json
import random
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Tuple
import numpy as np
# Ensure imports work
sys.path.insert(0, str(Path(__file__).parent))
from oracle.oracle import ImpactOracle
from ledger.ledger import CreditLedger
from broker.broker import ResourceBroker, Decision
from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
from benchmarks.benchmark_retrieval_qa import (
QABenchmark,
SimulatedAgent,
create_qa_dataset,
)
from benchmarks.benchmark_debate_v2 import (
DebateBenchmark,
FactualAgent,
OverconfidentAgent,
UncertainAgent,
SycophantAgent,
)
@dataclass
class AblConfig:
    """Settings for one ablation condition of the evaluation runner.

    Field order matters: ``ABLATIONS`` builds instances positionally.
    """

    # Short identifier; run_all uses it as the key under results["ablations"].
    name: str
    # Human-readable summary printed by run_all.
    description: str
    # NOTE(review): not read by any run_ablation_* function in this file.
    oracle_weights: Dict[str, Any]
    # Passed to ResourceBroker(thresholds=...).
    broker_thresholds: Dict[str, float]
    # Passed to CreditLedger(decay_lambda=...).
    decay_lambda: float
    # Passed to ImpactOracle(gaming_penalty=...) unless anti_gaming_on is False.
    gaming_penalty: float
    # Passed to ImpactOracle(compute_penalty_rate=...).
    compute_penalty_rate: float
    # When False, the run functions pass a gaming penalty of 0.0 instead.
    anti_gaming_on: bool
def _ablation(
    name,
    description,
    *,
    broker_thresholds=None,
    decay_lambda=0.02,
    gaming_penalty=2.0,
    compute_penalty_rate=0.0001,
    anti_gaming_on=True,
):
    """Build an ``AblConfig`` from the baseline settings plus keyword overrides.

    Every config receives its own fresh dict objects so no state is shared
    between ablations.
    """
    return AblConfig(
        name=name,
        description=description,
        oracle_weights={},
        broker_thresholds=dict(broker_thresholds) if broker_thresholds else {},
        decay_lambda=decay_lambda,
        gaming_penalty=gaming_penalty,
        compute_penalty_rate=compute_penalty_rate,
        anti_gaming_on=anti_gaming_on,
    )


# Ablation conditions evaluated by run_all; each entry spells out only the
# setting that differs from the "default" baseline.
ABLATIONS = [
    _ablation("default", "Full OCC stack"),
    _ablation("no_decay", "No credit decay (lambda=0)", decay_lambda=0.0),
    _ablation("fast_decay", "Aggressive decay (lambda=0.1)", decay_lambda=0.1),
    _ablation("no_gaming_penalty", "No gaming penalties", gaming_penalty=0.0),
    _ablation("high_gaming_penalty", "Severe gaming penalties (5.0)", gaming_penalty=5.0),
    _ablation(
        "lenient_broker",
        "Lenient broker (thresholds x0.5)",
        broker_thresholds={"low": 0.25, "medium": 1.0, "high": 2.5},
    ),
    _ablation(
        "strict_broker",
        "Strict broker (thresholds x2.0)",
        broker_thresholds={"low": 1.0, "medium": 4.0, "high": 10.0},
    ),
    _ablation("high_compute_cost", "High compute penalty (x10)", compute_penalty_rate=0.001),
    _ablation("low_compute_cost", "Low compute penalty (x0.1)", compute_penalty_rate=0.00001),
    _ablation("anti_gaming_off", "Disable all anti-gaming detectors", anti_gaming_on=False),
]
def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
    """Run the code benchmark under one ablation configuration.

    Builds an oracle, ledger and broker from ``config``, creates the three
    simulated code agents (cheap / medium / expensive) and delegates the
    actual evaluation to ``_run_occ_code_standalone``.

    Args:
        config: Ablation settings. The gaming penalty handed to the oracle is
            forced to 0.0 when ``config.anti_gaming_on`` is false.
            ``config.oracle_weights`` is not consulted; the code weights are
            fixed below.
        seed: Seed for ``random`` and ``numpy.random``; also forwarded to the
            standalone runner.
        n_problems: Number of benchmark problems, forwarded to the standalone
            runner.

    Returns:
        The metrics dictionary returned by ``_run_occ_code_standalone``.
    """
    random.seed(seed)
    np.random.seed(seed)
    oracle = ImpactOracle(
        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)
    cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
    medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
    expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
    # The standalone runner builds its own CodeBenchmark and seeds the ledger
    # itself. Seeding here as well would credit every agent twice, and a
    # preliminary bench.run_occ_allocation() pass would only burn compute and
    # touch the agents before the real run, so neither is done here.
    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)
def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed):
    """Evaluate three code agents on a fresh ``CodeBenchmark`` with escalation.

    For every problem the agents are tried in ascending order of
    cost-per-quality until one passes the public tests (or three attempts were
    made). Each attempt is scored by ``oracle`` and booked in ``ledger``.

    Args:
        oracle: Scorer exposing ``score(...)`` whose result has ``raw_score``.
        ledger: Credit ledger exposing ``earn(...)`` and ``spend(...)``.
        broker: Accepted for interface compatibility; not used here.
        cheap, medium, expensive: Agents exposing ``agent_id``,
            ``pass_rate_easy``, ``pass_rate_hard``, ``cost_per_attempt``,
            ``attempts`` and ``solve(problem)``.
        n_problems: Number of problems for the benchmark.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.

    Returns:
        Dict with ``accuracy`` (fraction of problems whose last attempt passed
        the public tests), ``total_compute``, ``mean_compute`` and
        ``mean_agents``. All ratios are 0.0 when the benchmark has no problems.
    """
    random.seed(seed)
    np.random.seed(seed)
    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    agents = [cheap, medium, expensive]
    # Give each agent starting credit proportional to its average pass rate.
    for seeded_agent in agents:
        quality = (seeded_agent.pass_rate_easy + seeded_agent.pass_rate_hard) / 2
        ledger.earn(seeded_agent.agent_id, "seed", "seed", quality * 20, 0.0, 0.0, "initial", "model_call")
    total_compute = 0
    results = []
    for problem in bench.problems:
        solved = False
        cost = 0
        used = []
        # Cheapest cost per unit of expected quality goes first.
        ranked = sorted(
            agents,
            key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2),
        )
        for agent in ranked:
            if solved or len(used) >= 3:
                break
            # Count the attempt on the agent that is actually attempting. The
            # previous code used a leftover loop variable from the seeding
            # loop and therefore always incremented the last agent's counter.
            agent.attempts += 1
            r = agent.solve(problem)
            cost += r["compute_cost"]
            total_compute += r["compute_cost"]
            used.append(agent.agent_id)
            solved = r["public_pass"]
            hidden = r["hidden_pass"]
            oracle_res = oracle.score(
                "code", {"attempt": len(used)}, {},
                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                agent_id=agent.agent_id,
            )
            if oracle_res.raw_score > 0:
                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
                            oracle_res.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})
    n_results = len(results)
    if n_results == 0:
        # An empty benchmark has no meaningful ratios; avoid ZeroDivisionError.
        return {
            "accuracy": 0.0,
            "total_compute": total_compute,
            "mean_compute": 0.0,
            "mean_agents": 0.0,
        }
    acc = sum(1 for r in results if r["solved"]) / n_results
    return {
        "accuracy": acc,
        "total_compute": total_compute,
        "mean_compute": total_compute / n_results,
        "mean_agents": sum(len(r["agents"]) for r in results) / n_results,
    }
def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
    """Run the retrieval-QA benchmark under one ablation configuration.

    Args:
        config: Ablation settings; the gaming penalty is replaced by 0.0 when
            ``config.anti_gaming_on`` is false.
        seed: Seed for ``random``, ``numpy.random``, the dataset and the
            benchmark.

    Returns:
        Dict with ``accuracy``, ``total_compute``, ``mean_compute`` plus
        ``precision`` and ``recall`` (each 0 when the benchmark omits them).
    """
    random.seed(seed)
    np.random.seed(seed)

    effective_penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    qa_oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=effective_penalty,
    )
    qa_ledger = CreditLedger(decay_lambda=config.decay_lambda)
    qa_broker = ResourceBroker(thresholds=config.broker_thresholds)

    dataset = create_qa_dataset(seed=seed)
    benchmark = QABenchmark(dataset, qa_oracle, qa_ledger, qa_broker, seed=seed)

    qa_agent = SimulatedAgent("qa_agent", qa_oracle, qa_ledger, qa_broker, 0.85)
    qa_agent.budget = 50000
    qa_agent.strategy = "adaptive"

    outcome = benchmark.run_occ(qa_agent)

    # Required metrics are indexed directly; optional ones fall back to 0.
    summary = {key: outcome[key] for key in ("accuracy", "total_compute", "mean_compute")}
    for optional_key in ("precision", "recall"):
        summary[optional_key] = outcome.get(optional_key, 0)
    return summary
def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
    """Run the debate benchmark with three factual agents under one ablation.

    Args:
        config: Ablation settings; the gaming penalty is replaced by 0.0 when
            ``config.anti_gaming_on`` is false.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_debates: Currently not read by this function.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute`` taken from the benchmark result.
    """
    random.seed(seed)
    np.random.seed(seed)

    effective_penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    debate_oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=effective_penalty,
    )
    debate_ledger = CreditLedger(decay_lambda=config.decay_lambda)
    debate_broker = ResourceBroker(thresholds=config.broker_thresholds)

    benchmark = DebateBenchmark(
        oracle=debate_oracle,
        ledger=debate_ledger,
        broker=debate_broker,
        factual_accuracy=0.9,
        adversarial_ratio=0.0,
        n_agents=3,
        seed=seed,
    )

    debaters = []
    for index in range(3):
        debaters.append(FactualAgent(f"f{index}", 0.9))

    outcome = benchmark.run_debate(debaters)
    reported = ("accuracy", "consensus_reached", "total_compute", "mean_compute")
    return {key: outcome[key] for key in reported}
def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
    """Compare a normal code agent with a "gamer" agent under the oracle.

    Both agents attempt every benchmark problem; each attempt is scored by an
    ``ImpactOracle`` created with ``gaming_penalty=2.0``.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_problems: Number of benchmark problems.

    Returns:
        ``{"normal": stats, "gamer": stats}`` where each ``stats`` dict holds
        ``public_acc``, ``hidden_acc``, ``mean_raw`` and ``penalized_rate``
        (share of attempts tagged ``gaming_hidden_tests``).
    """
    random.seed(seed)
    np.random.seed(seed)

    oracle = ImpactOracle(gaming_penalty=2.0)
    ledger = CreditLedger()
    _unused_broker = ResourceBroker()  # constructed as before; never consulted
    bench = CodeBenchmark(n_problems=n_problems, seed=seed)

    # Reference agent.
    honest = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150)
    # Gaming agent: high public pass, high hidden fail.
    gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100)

    for agent_name in ("normal", "gamer"):
        ledger.earn(agent_name, "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")

    per_agent = {"normal": [], "gamer": []}
    contestants = (honest, gamer)
    for problem in bench.problems:
        for contestant in contestants:
            attempt = contestant.solve(problem)
            public_ok = attempt["public_pass"]
            hidden_ok = attempt["hidden_pass"]
            observation = {
                "correctness": 1.0 if public_ok else 0.0,
                "pass_at_k": 1.0 if hidden_ok else 0.0,
                "compute_cost": attempt["compute_cost"],
                "public_pass": public_ok,
                "hidden_tests_pass": hidden_ok,
            }
            verdict = oracle.score("code", {}, {}, observation, agent_id=contestant.agent_id)
            per_agent[contestant.agent_id].append({
                "public_pass": public_ok,
                "hidden_pass": hidden_ok,
                "raw_score": verdict.raw_score,
                "penalized": "gaming_hidden_tests" in verdict.failure_tags,
            })

    def summarize(who):
        """Aggregate one agent's per-problem rows into rates and a mean score."""
        rows = per_agent[who]
        count = len(rows)

        def share(flag):
            return sum(1 for row in rows if row[flag]) / count

        return {
            "public_acc": share("public_pass"),
            "hidden_acc": share("hidden_pass"),
            "mean_raw": sum(row["raw_score"] for row in rows) / count,
            "penalized_rate": share("penalized"),
        }

    return {who: summarize(who) for who in ("normal", "gamer")}
def run_anti_gaming_collusion(seed: int = 42) -> Dict:
    """Attempt a credit transfer between two agents and report what happened.

    Seeds "alice" with 10 credits and "bob" with 1, asks the ledger to move
    5.0 from alice to bob, then reads both balances and the ledger's
    collusion detector.

    Args:
        seed: Accepted for a uniform signature; not read here.

    Returns:
        Dict with ``transfer_allowed``, ``alice_balance``, ``bob_balance``,
        ``collusion_detected`` and ``transfer_blocked`` (negation of
        ``transfer_allowed``).
    """
    book = CreditLedger()
    for holder, starting_credit in (("alice", 10), ("bob", 1)):
        book.earn(holder, "seed", "seed", starting_credit, 0.0, 0.0, "initial")

    transfer_ok = book.transfer("alice", "bob", 5.0, "global")

    report = {"transfer_allowed": transfer_ok}
    report["alice_balance"] = book.balance("alice")
    report["bob_balance"] = book.balance("bob")
    report["collusion_detected"] = bool(book.detect_collusion(window=10))
    report["transfer_blocked"] = not transfer_ok
    return report
def run_anti_gaming_abstention(seed: int = 42) -> Dict:
    """Score ten abstentions on an answerable question and summarise the reward.

    Args:
        seed: Accepted for a uniform signature; not read here.

    Returns:
        Dict with ``mean_reward`` over the ten scores and
        ``expected_negative``, true when their sum is below zero.
    """
    oracle = ImpactOracle()
    # The agent abstains every time although the question has a gold answer.
    rewards = [
        oracle.score(
            "retrieval_qa",
            {"abstained": True},
            {"is_unanswerable": False, "gold_answer": "yes"},
            {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
        ).reward_value
        for _ in range(10)
    ]
    reward_sum = sum(rewards)
    return {
        "mean_reward": reward_sum / len(rewards),
        "expected_negative": reward_sum < 0,
    }
def run_anti_gaming_spam(seed: int = 42) -> Dict:
    """Score one expensive, wrong, low-confidence answer and report its tags.

    Args:
        seed: Accepted for a uniform signature; not read here.

    Returns:
        Dict with the oracle's ``reward``, whether any failure tag was set
        (``tagged``) and the ``tags`` themselves.
    """
    # Wrong answer ("london" vs gold "paris") at a compute cost of 5000.
    task_truth = {"gold_answer": "paris"}
    submission = {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000}
    verdict = ImpactOracle().score("retrieval_qa", {}, task_truth, submission)
    reward = verdict.reward_value
    tags = verdict.failure_tags
    return {"reward": reward, "tagged": bool(tags), "tags": tags}
def run_all(output_dir: "str | Path" = "/app/occ/reports") -> Dict:
    """Run every ablation and anti-gaming test, print a summary, save a report.

    Args:
        output_dir: Directory that receives ``eval_runner_results.json``. It
            is created if missing. Defaults to the previously hard-coded
            ``/app/occ/reports`` so existing callers are unaffected.

    Returns:
        Dict with two keys: ``ablations`` (per ablation name: its config and
        the code / QA / debate metrics) and ``anti_gaming`` (per test name:
        that test's result dict).
    """
    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER")
    print("=" * 60)
    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for abl in ABLATIONS:
        print(f"\n--- ABLATION: {abl.name} ---")
        print(f" {abl.description}")
        code_res = run_ablation_code(abl, seed=42, n_problems=50)
        qa_res = run_ablation_qa(abl, seed=42)
        debate_res = run_ablation_debate(abl, seed=42)
        print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
        print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
        print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
        all_results["ablations"][abl.name] = {
            "config": abl.__dict__,
            "code": code_res,
            "qa": qa_res,
            "debate": debate_res,
        }

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")
    all_results["anti_gaming"]["hidden_test_gaming"] = run_anti_gaming_code(seed=42)
    all_results["anti_gaming"]["collusion"] = run_anti_gaming_collusion(seed=42)
    all_results["anti_gaming"]["abstention"] = run_anti_gaming_abstention(seed=42)
    all_results["anti_gaming"]["spam"] = run_anti_gaming_spam(seed=42)
    for test_name, res in all_results["anti_gaming"].items():
        print(f"\n {test_name}: {json.dumps(res, indent=2, default=str)}")

    # Save
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    report_path = out / "eval_runner_results.json"
    # json.dump emits ASCII by default; the explicit encoding just makes the
    # file independent of the platform's locale setting.
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nSaved to {report_path}")
    return all_results
# Script entry point: run the full evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run_all()