occ-stack / eval_runner.py

Upload eval_runner.py

ae2b06a verified 20 days ago

13 kB

	"""
	Unified evaluation runner: all ablations + anti-gaming tests.
	Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests.
	"""
	import json
	import random
	import sys
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any, Dict, List, Tuple

	import numpy as np

	# Ensure imports work
	sys.path.insert(0, str(Path(__file__).parent))
	from oracle.oracle import ImpactOracle
	from ledger.ledger import CreditLedger
	from broker.broker import ResourceBroker, Decision
	from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
	from benchmarks.benchmark_retrieval_qa import (
	QABenchmark,
	SimulatedAgent,
	create_qa_dataset,
	)
	from benchmarks.benchmark_debate_v2 import (
	DebateBenchmark,
	FactualAgent,
	OverconfidentAgent,
	UncertainAgent,
	SycophantAgent,
	)


	@dataclass
	class AblConfig:
	    """One ablation condition: a named set of OCC-stack hyper-parameters.

	    The four scalar fields default to the values used by the ``default``
	    ablation, so a new condition only has to spell out what it changes.
	    Positional construction with all eight values keeps working.
	    """

	    # Short identifier; the runner uses it as the key of the results dict.
	    name: str
	    # Human-readable summary printed while the ablation runs.
	    description: str
	    # NOTE(review): not read by any runner in this file -- confirm intended use.
	    oracle_weights: Dict[str, Any]
	    # Forwarded to ResourceBroker(thresholds=...).
	    broker_thresholds: Dict[str, float]
	    # Forwarded to CreditLedger(decay_lambda=...).
	    decay_lambda: float = 0.02
	    # Forwarded to ImpactOracle(gaming_penalty=...) unless anti_gaming_on is False.
	    gaming_penalty: float = 2.0
	    # Forwarded to ImpactOracle(compute_penalty_rate=...).
	    compute_penalty_rate: float = 0.0001
	    # When False the runners pass gaming_penalty=0.0 to the oracle.
	    anti_gaming_on: bool = True


	# The ablation conditions, in the order they are run and reported.
	# Every entry spells out all fields so a row can be read without
	# cross-referencing the dataclass definition.
	ABLATIONS = [
	    AblConfig(
	        name="default",
	        description="Full OCC stack",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="no_decay",
	        description="No credit decay (lambda=0)",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.0,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="fast_decay",
	        description="Aggressive decay (lambda=0.1)",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.1,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="no_gaming_penalty",
	        description="No gaming penalties",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=0.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="high_gaming_penalty",
	        description="Severe gaming penalties (5.0)",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=5.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="lenient_broker",
	        description="Lenient broker (thresholds x0.5)",
	        oracle_weights={},
	        broker_thresholds={"low": 0.25, "medium": 1.0, "high": 2.5},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="strict_broker",
	        description="Strict broker (thresholds x2.0)",
	        oracle_weights={},
	        broker_thresholds={"low": 1.0, "medium": 4.0, "high": 10.0},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="high_compute_cost",
	        description="High compute penalty (x10)",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="low_compute_cost",
	        description="Low compute penalty (x0.1)",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.00001,
	        anti_gaming_on=True,
	    ),
	    AblConfig(
	        name="anti_gaming_off",
	        description="Disable all anti-gaming detectors",
	        oracle_weights={},
	        broker_thresholds={},
	        decay_lambda=0.02,
	        gaming_penalty=2.0,
	        compute_penalty_rate=0.0001,
	        anti_gaming_on=False,
	    ),
	]


	def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
	    """Run the code benchmark under one ablation configuration.

	    Builds an oracle, ledger and broker from ``config``, creates the three
	    simulated code agents and delegates the actual run to
	    ``_run_occ_code_standalone``.

	    Args:
	        config: ablation settings to apply.
	        seed: seed for ``random`` and ``numpy.random``; also forwarded to the
	            standalone runner.
	        n_problems: number of benchmark problems, forwarded to the standalone
	            runner.

	    Returns:
	        The metrics dict produced by ``_run_occ_code_standalone``.
	    """
	    random.seed(seed)
	    np.random.seed(seed)

	    # NOTE(review): config.oracle_weights is not applied here; the code
	    # weights are fixed for every ablation -- confirm this is intended.
	    oracle = ImpactOracle(
	        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
	        compute_penalty_rate=config.compute_penalty_rate,
	        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
	    )
	    ledger = CreditLedger(decay_lambda=config.decay_lambda)
	    broker = ResourceBroker(thresholds=config.broker_thresholds)

	    cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
	    medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
	    expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)

	    # The standalone runner credits each agent's seed balance itself, so the
	    # ledger is not seeded here too (doing both granted the seed credit twice).
	    # The earlier bench.run_occ_allocation(...) call was dropped as well: its
	    # result was discarded and, per the original comment, it ran on the
	    # benchmark's own oracle/ledger/broker rather than the ones built above.
	    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)


	def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed,
	                             max_attempts=3):
	    """Run the code benchmark with caller-supplied oracle and ledger.

	    For every problem the agents are tried in ascending order of
	    ``cost_per_attempt / max(0.1, mean pass rate)`` until one passes the
	    public tests, one passes the hidden tests, or ``max_attempts`` agents
	    have been used. Each attempt is scored by ``oracle`` and booked on
	    ``ledger``.

	    Args:
	        oracle: object exposing ``score(...)`` whose result has ``raw_score``.
	        ledger: object exposing ``earn(...)`` and ``spend(...)``.
	        broker: accepted for signature compatibility; not used by this function.
	        cheap, medium, expensive: the three agents to choose from.
	        n_problems: number of problems handed to ``CodeBenchmark``.
	        seed: seed for ``random``, ``numpy.random`` and ``CodeBenchmark``.
	        max_attempts: maximum number of agents tried per problem (default 3,
	            the previously hard-coded limit).

	    Returns:
	        Dict with ``accuracy``, ``total_compute``, ``mean_compute`` and
	        ``mean_agents``. The three ratios are 0.0 when the benchmark yields
	        no problems.
	    """
	    random.seed(seed)
	    np.random.seed(seed)
	    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
	    agents = [cheap, medium, expensive]

	    def mean_pass_rate(candidate):
	        return (candidate.pass_rate_easy + candidate.pass_rate_hard) / 2

	    # Give every agent an initial balance proportional to its mean pass rate.
	    for candidate in agents:
	        ledger.earn(candidate.agent_id, "seed", "seed", mean_pass_rate(candidate) * 20,
	                    0.0, 0.0, "initial", "model_call")

	    total_compute = 0
	    results = []
	    for problem in bench.problems:
	        solved = False
	        cost = 0
	        used = []
	        ranked = sorted(
	            agents,
	            key=lambda c: c.cost_per_attempt / max(0.1, mean_pass_rate(c)),
	        )
	        for agent in ranked:
	            if solved or len(used) >= max_attempts:
	                break
	            # Count the attempt on the agent that is actually running. The
	            # original incremented the stale variable left over from the
	            # seeding loop, i.e. always the last agent in the list.
	            agent.attempts += 1
	            r = agent.solve(problem)
	            cost += r["compute_cost"]
	            total_compute += r["compute_cost"]
	            used.append(agent.agent_id)
	            solved = r["public_pass"]
	            hidden = r["hidden_pass"]
	            # "compute_cost" is the cumulative cost spent on this problem so far.
	            oracle_res = oracle.score(
	                "code", {"attempt": len(used)}, {},
	                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
	                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
	                agent_id=agent.agent_id,
	            )
	            if oracle_res.raw_score > 0:
	                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
	                            oracle_res.raw_score, cost, "pass", "model_call")
	            else:
	                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
	            if hidden:
	                break
	        results.append({"solved": solved, "cost": cost, "agents": used})

	    if not results:
	        # No problems were produced: report zeros instead of dividing by zero.
	        return {
	            "accuracy": 0.0,
	            "total_compute": total_compute,
	            "mean_compute": 0.0,
	            "mean_agents": 0.0,
	        }

	    n_results = len(results)
	    return {
	        "accuracy": sum(1 for r in results if r["solved"]) / n_results,
	        "total_compute": total_compute,
	        "mean_compute": total_compute / n_results,
	        "mean_agents": sum(len(r["agents"]) for r in results) / n_results,
	    }


	def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
	    """Run the retrieval-QA benchmark under one ablation configuration.

	    Args:
	        config: ablation settings used to build the oracle, ledger and broker.
	        seed: seed for ``random``, ``numpy.random``, the dataset and the benchmark.

	    Returns:
	        Dict with ``accuracy``, ``total_compute`` and ``mean_compute`` taken
	        from the benchmark result, plus ``precision`` and ``recall`` (0 when
	        the benchmark does not report them).
	    """
	    random.seed(seed)
	    np.random.seed(seed)

	    # Disabling anti-gaming is expressed as a zero gaming penalty.
	    if config.anti_gaming_on:
	        effective_penalty = config.gaming_penalty
	    else:
	        effective_penalty = 0.0

	    oracle = ImpactOracle(
	        compute_penalty_rate=config.compute_penalty_rate,
	        gaming_penalty=effective_penalty,
	    )
	    ledger = CreditLedger(decay_lambda=config.decay_lambda)
	    broker = ResourceBroker(thresholds=config.broker_thresholds)

	    dataset = create_qa_dataset(seed=seed)
	    qa_bench = QABenchmark(dataset, oracle, ledger, broker, seed=seed)

	    qa_agent = SimulatedAgent("qa_agent", oracle, ledger, broker, 0.85)
	    qa_agent.budget = 50000
	    qa_agent.strategy = "adaptive"

	    outcome = qa_bench.run_occ(qa_agent)

	    metrics = {key: outcome[key] for key in ("accuracy", "total_compute", "mean_compute")}
	    metrics["precision"] = outcome.get("precision", 0)
	    metrics["recall"] = outcome.get("recall", 0)
	    return metrics


	def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
	    """Run the debate benchmark under one ablation configuration.

	    Args:
	        config: ablation settings used to build the oracle, ledger and broker.
	        seed: seed for ``random``, ``numpy.random`` and the benchmark.
	        n_debates: accepted but not used by this function.
	            NOTE(review): confirm whether it should be forwarded to the benchmark.

	    Returns:
	        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
	        ``mean_compute`` copied from the benchmark result.
	    """
	    random.seed(seed)
	    np.random.seed(seed)

	    # Disabling anti-gaming is expressed as a zero gaming penalty.
	    if config.anti_gaming_on:
	        effective_penalty = config.gaming_penalty
	    else:
	        effective_penalty = 0.0

	    oracle = ImpactOracle(
	        compute_penalty_rate=config.compute_penalty_rate,
	        gaming_penalty=effective_penalty,
	    )
	    ledger = CreditLedger(decay_lambda=config.decay_lambda)
	    broker = ResourceBroker(thresholds=config.broker_thresholds)

	    debate_bench = DebateBenchmark(
	        oracle=oracle,
	        ledger=ledger,
	        broker=broker,
	        factual_accuracy=0.9,
	        adversarial_ratio=0.0,
	        n_agents=3,
	        seed=seed,
	    )

	    # Three factual debaters named f0, f1 and f2.
	    debaters = []
	    for i in range(3):
	        debaters.append(FactualAgent(f"f{i}", 0.9))

	    outcome = debate_bench.run_debate(debaters)

	    reported = ("accuracy", "consensus_reached", "total_compute", "mean_compute")
	    return {key: outcome[key] for key in reported}


	def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
	    """Compare a normal code agent with one that games the public tests.

	    Both agents attempt every benchmark problem; each attempt is scored by an
	    ``ImpactOracle`` built with ``gaming_penalty=2.0``.

	    Args:
	        seed: seed for ``random``, ``numpy.random`` and the benchmark.
	        n_problems: number of problems handed to ``CodeBenchmark``.

	    Returns:
	        ``{"normal": {...}, "gamer": {...}}`` where each inner dict holds
	        ``public_acc``, ``hidden_acc``, ``mean_raw`` and ``penalized_rate``
	        (all 0.0 when the benchmark yields no problems).
	    """
	    random.seed(seed)
	    np.random.seed(seed)
	    oracle = ImpactOracle(gaming_penalty=2.0)
	    # The ledger and broker are built (and the ledger seeded) as before, but
	    # nothing below reads them; kept so the setup sequence is unchanged.
	    ledger = CreditLedger()
	    broker = ResourceBroker()

	    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
	    # Normal agent
	    normal = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150)
	    # Gaming agent: high public pass, high hidden fail
	    gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100)

	    ledger.earn("normal", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")
	    ledger.earn("gamer", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")

	    results = {"normal": [], "gamer": []}
	    for problem in bench.problems:
	        for agent in [normal, gamer]:
	            r = agent.solve(problem)
	            oracle_res = oracle.score(
	                "code", {}, {},
	                {"correctness": 1.0 if r["public_pass"] else 0.0,
	                 "pass_at_k": 1.0 if r["hidden_pass"] else 0.0,
	                 "compute_cost": r["compute_cost"],
	                 "public_pass": r["public_pass"],
	                 "hidden_tests_pass": r["hidden_pass"]},
	                agent_id=agent.agent_id,
	            )
	            results[agent.agent_id].append({
	                "public_pass": r["public_pass"],
	                "hidden_pass": r["hidden_pass"],
	                "raw_score": oracle_res.raw_score,
	                "penalized": "gaming_hidden_tests" in oracle_res.failure_tags,
	            })

	    def summarize(who):
	        rs = results[who]
	        if not rs:
	            # No attempts recorded: report zeros instead of dividing by zero.
	            return {"public_acc": 0.0, "hidden_acc": 0.0, "mean_raw": 0.0, "penalized_rate": 0.0}
	        count = len(rs)
	        return {
	            "public_acc": sum(1 for r in rs if r["public_pass"]) / count,
	            "hidden_acc": sum(1 for r in rs if r["hidden_pass"]) / count,
	            "mean_raw": sum(r["raw_score"] for r in rs) / count,
	            "penalized_rate": sum(1 for r in rs if r["penalized"]) / count,
	        }

	    return {"normal": summarize("normal"), "gamer": summarize("gamer")}


	def run_anti_gaming_collusion(seed: int = 42) -> Dict:
	    """Attempt a credit transfer between two agents and report what happened.

	    Args:
	        seed: accepted for a uniform runner signature; not used here.

	    Returns:
	        Dict with the transfer result (``transfer_allowed`` and its negation
	        ``transfer_blocked``), both balances read after the attempt, and
	        whether ``detect_collusion`` returned anything truthy.
	    """
	    ledger = CreditLedger()
	    ledger.earn("alice", "seed", "seed", 10, 0.0, 0.0, "initial")
	    ledger.earn("bob", "seed", "seed", 1, 0.0, 0.0, "initial")

	    transfer_ok = ledger.transfer("alice", "bob", 5.0, "global")

	    # Balances are read right after the transfer attempt, before the
	    # collusion check runs.
	    report = {
	        "transfer_allowed": transfer_ok,
	        "alice_balance": ledger.balance("alice"),
	        "bob_balance": ledger.balance("bob"),
	    }
	    report["collusion_detected"] = bool(ledger.detect_collusion(window=10))
	    report["transfer_blocked"] = not transfer_ok
	    return report


	def run_anti_gaming_abstention(seed: int = 42) -> Dict:
	    """Score ten abstentions on an answerable question.

	    Args:
	        seed: accepted for a uniform runner signature; not used here.

	    Returns:
	        Dict with ``mean_reward`` over the ten scored abstentions and
	        ``expected_negative``, which is True when the rewards sum below zero.
	    """
	    oracle = ImpactOracle()
	    # The same abstaining submission is scored ten times.
	    rewards = [
	        oracle.score(
	            "retrieval_qa",
	            {"abstained": True},
	            {"is_unanswerable": False, "gold_answer": "yes"},
	            {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
	        ).reward_value
	        for _ in range(10)
	    ]
	    reward_sum = sum(rewards)
	    return {"mean_reward": reward_sum / len(rewards), "expected_negative": reward_sum < 0}


	def run_anti_gaming_spam(seed: int = 42) -> Dict:
	    """Score one wrong, low-confidence answer submitted at high compute cost.

	    Args:
	        seed: accepted for a uniform runner signature; not used here.

	    Returns:
	        Dict with the oracle's ``reward``, whether any failure tag was set
	        (``tagged``) and the ``tags`` themselves.
	    """
	    oracle = ImpactOracle()
	    reference = {"gold_answer": "paris"}
	    # Wrong answer with a compute cost of 5000.
	    submission = {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000}
	    verdict = oracle.score("retrieval_qa", {}, reference, submission)

	    tags = verdict.failure_tags
	    return {"reward": verdict.reward_value, "tagged": bool(tags), "tags": tags}


	def run_all(output_dir: Path = Path("/app/occ/reports")) -> Dict:
	    """Run every ablation and anti-gaming test, print a summary and save JSON.

	    Args:
	        output_dir: directory that receives ``eval_runner_results.json``
	            (a ``str`` is accepted too). It is created if missing. Defaults
	            to the previously hard-coded ``/app/occ/reports``.

	    Returns:
	        Dict with two keys: ``ablations`` maps each ablation name to its
	        config and its code/qa/debate metrics; ``anti_gaming`` maps each
	        anti-gaming test name to its result.
	    """
	    print("=" * 60)
	    print("OCC UNIFIED EVALUATION RUNNER")
	    print("=" * 60)

	    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}

	    # Ablations
	    for abl in ABLATIONS:
	        print(f"\n--- ABLATION: {abl.name} ---")
	        print(f" {abl.description}")
	        code_res = run_ablation_code(abl, seed=42, n_problems=50)
	        qa_res = run_ablation_qa(abl, seed=42)
	        debate_res = run_ablation_debate(abl, seed=42)
	        print(f" Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
	        print(f" QA: acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
	        print(f" Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
	        all_results["ablations"][abl.name] = {
	            "config": vars(abl),
	            "code": code_res,
	            "qa": qa_res,
	            "debate": debate_res,
	        }

	    # Anti-gaming
	    print("\n--- ANTI-GAMING TESTS ---")
	    all_results["anti_gaming"]["hidden_test_gaming"] = run_anti_gaming_code(seed=42)
	    all_results["anti_gaming"]["collusion"] = run_anti_gaming_collusion(seed=42)
	    all_results["anti_gaming"]["abstention"] = run_anti_gaming_abstention(seed=42)
	    all_results["anti_gaming"]["spam"] = run_anti_gaming_spam(seed=42)

	    for test_name, res in all_results["anti_gaming"].items():
	        print(f"\n {test_name}: {json.dumps(res, indent=2, default=str)}")

	    # Save
	    out = Path(output_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    report_path = out / "eval_runner_results.json"
	    # Explicit encoding so the report does not depend on the platform default.
	    with open(report_path, "w", encoding="utf-8") as f:
	        # default=str stringifies any value json cannot serialise natively.
	        json.dump(all_results, f, indent=2, default=str)
	    print(f"\nSaved to {report_path}")
	    return all_results


	# Script entry point: run the full evaluation only when executed directly,
	# not when this module is imported.
	if __name__ == "__main__":
	    run_all()