# Source: FlakeForge / tests/visualize_agent_tool_usage.py
# Hosting-page residue: uploaded by random70249 using huggingface_hub
# (commit ee933ab, verified).
"""Visualize how FlakeForge tool evidence is produced and fed to the agent.
This script does not require Docker. It uses a lightweight demo runner that
alternates pass/fail outcomes and emits traceback-like stderr so the same
pipeline pieces are exercised:
- preflight classification
- deep flakiness signals
- causal frontier extraction
- tools-based file targeting hints
- final prompt section shown to the agent
Usage:
c:/CodingNest/FlakeForge/venv/Scripts/python.exe tests/visualize_agent_tool_usage.py
c:/CodingNest/FlakeForge/venv/Scripts/python.exe tests/visualize_agent_tool_usage.py --repo-path test_repos/moderate_load_jitter_flaky --test-id tests/test_flaky.py::test_request_processing_should_succeed
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Optional
# Ensure project root imports resolve when running as a script.
# parents[1] is the directory one level above this file's own directory.
# It is placed at the front of sys.path (only if not already present) so the
# project imports that follow can be found regardless of how the script is
# launched.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from agent.unified_agent import build_unified_prompt
from models import RunRecord
from server.FlakeForge_environment import FlakeForgeEnvironment
from server.tools import build_agent_targeting_hints
class DemoRunner:
    """Deterministic alternating pass/fail runner for visualization only.

    No test is actually executed. Every call to :meth:`run_test` bumps an
    internal counter: odd-numbered calls report a failure carrying a
    synthetic traceback, even-numbered calls report a pass.
    """

    def __init__(self, repo_path: Path, test_identifier: str) -> None:
        """Remember the repo location and test id, and reset the call counter.

        Args:
            repo_path: Repository root that the synthetic traceback paths are
                built from.
            test_identifier: Test id in ``file::name`` form. Stored for
                reference only; :meth:`run_test` uses its own argument.
        """
        self.repo_path = repo_path
        self.test_identifier = test_identifier
        self.counter = 0

    def run_test(self, test_identifier: str) -> RunRecord:
        """Return a fabricated run result, alternating fail / pass.

        Args:
            test_identifier: Test id in ``file::name`` form; only the file
                part (before the first ``::``) is used.

        Returns:
            A passing ``RunRecord`` on even-numbered calls, otherwise a
            failing one whose ``stderr_excerpt`` is a traceback-like string
            naming the test file and a source-file candidate.
        """
        self.counter += 1
        test_file = test_identifier.split("::", 1)[0]

        # Alternate outcomes so preflight can classify as flaky.
        passed = self.counter % 2 == 0
        if passed:
            return RunRecord(
                passed=True,
                duration_ms=40 + (self.counter % 7),
                error_type=None,
                error_message=None,
                stderr_excerpt=None,
            )

        # Only failing runs need a source frame, so the repo scan is skipped
        # entirely on passing runs.
        source_file = self._best_source_candidate(test_file)
        trace = (
            "Traceback (most recent call last):\n"
            f" File \"{(self.repo_path / test_file).as_posix()}\", line 42, in test_case\n"
            f" File \"{(self.repo_path / source_file).as_posix()}\", line 21, in process_request\n"
            "AssertionError: transient mismatch\n"
        )
        return RunRecord(
            passed=False,
            duration_ms=95 + (self.counter % 13),
            error_type="AssertionError",
            error_message="transient mismatch",
            stderr_excerpt=trace,
        )

    def _best_source_candidate(self, test_file: str) -> str:
        """Pick a repo-relative source file to blame in the fake traceback.

        Preference order:
            1. ``source.py`` directly under the repo root, if it exists.
            2. The alphabetically first ``*.py`` file (by POSIX relative
               path) that is neither under ``tests/`` nor named like a test
               module (``test_*.py``, ``*_test.py``, ``conftest.py``).
            3. ``test_file`` itself when nothing else qualifies.

        Args:
            test_file: Repo-relative path of the test file, used as the
                last-resort answer.

        Returns:
            A POSIX-style path relative to ``self.repo_path``.
        """
        if (self.repo_path / "source.py").exists():
            return "source.py"

        # rglob yields entries in filesystem order, which varies between
        # machines; sorting keeps the chosen file stable across runs.
        candidates = sorted(
            path.relative_to(self.repo_path).as_posix()
            for path in self.repo_path.rglob("*.py")
        )
        for rel in candidates:
            if rel.startswith("tests/"):
                continue
            name = rel.rsplit("/", 1)[-1]
            if (
                name.startswith("test_")
                or name.endswith("_test.py")
                or name == "conftest.py"
            ):
                continue
            return rel
        return test_file
def _extract_targeting_section(prompt: str) -> str:
header = "=== TARGETING HINTS ==="
if header not in prompt:
return "(TARGETING HINTS section not found in prompt)"
lines = prompt.splitlines()
start = None
for i, line in enumerate(lines):
if line.strip() == header:
start = i
break
if start is None:
return "(TARGETING HINTS section not found in prompt)"
collected = []
for line in lines[start:]:
if line.startswith("=== ") and line.strip() != header and collected:
break
collected.append(line)
return "\n".join(collected)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the visualization script.

    ``--repo-path`` and ``--test-id`` default to the ``FF_REPO_PATH`` and
    ``FF_TEST_ID`` environment variables when those are set.
    """
    cli = argparse.ArgumentParser(description="Visualize FlakeForge tool-driven file targeting")
    cli.add_argument(
        "--repo-path",
        default=os.environ.get("FF_REPO_PATH", "test_repos/moderate_load_jitter_flaky"),
        help="Repo path used for environment reset",
    )
    cli.add_argument(
        "--test-id",
        default=os.environ.get("FF_TEST_ID", "tests/test_flaky.py::test_request_processing_should_succeed"),
        help="Test identifier",
    )
    cli.add_argument("--quick-runs", type=int, default=6, help="Preflight quick runs")
    cli.add_argument("--confirm-runs", type=int, default=6, help="Preflight confirm runs")
    return cli


def _print_numbered(items) -> None:
    """Print each item on its own line, prefixed with a 1-based two-digit index."""
    for position, item in enumerate(items, start=1):
        print(f"{position:02d}. {item}")


def main() -> None:
    """Reset a FlakeForge environment with the demo runner and print its evidence.

    Prints, in order: the preflight result, counts of the deep signals, the
    first ten causal hints from the observation, the hints returned by a
    direct ``build_agent_targeting_hints`` call, and the TARGETING HINTS
    section extracted from the prompt built for the agent.
    """
    options = _build_arg_parser().parse_args()
    repo_root = Path(options.repo_path)

    environment = FlakeForgeEnvironment(
        repo_path=str(repo_root),
        test_identifier=options.test_id,
        runner=DemoRunner(repo_path=repo_root, test_identifier=options.test_id),
        max_steps=3,
        num_runs=8,
    )
    obs = environment.reset(
        preflight_quick_runs=options.quick_runs,
        preflight_confirm_runs=options.confirm_runs,
        drop_deterministic_bugs=False,
    )

    print("\n=== PREFLIGHT SUMMARY ===")
    print(json.dumps(obs.preflight_result, indent=2))

    # Observation attributes that hold the list-valued deep signals.
    signal_fields = (
        "module_cache_violations",
        "fixture_scope_risks",
        "mock_residue_sites",
        "import_side_effect_files",
    )

    print("\n=== DEEP SIGNAL SNAPSHOT ===")
    snapshot = {field: len(getattr(obs, field)) for field in signal_fields}
    snapshot["async_contamination_alive"] = bool(obs.async_contamination_alive)
    print(json.dumps(snapshot, indent=2))

    print("\n=== MERGED TARGETING HINTS IN OBSERVATION ===")
    _print_numbered(obs.causal_hints[:10])

    direct_hints = build_agent_targeting_hints(
        repo_path=str(repo_root),
        test_identifier=obs.test_identifier,
        failing_stack_trace=obs.failing_stack_trace,
        source_under_test=obs.source_under_test,
        causal_frontier=obs.failure_frontier,
        deep_signals={field: getattr(obs, field) for field in signal_fields},
        max_hints=8,
    )
    print("\n=== TOOLS-ONLY TARGETING HINTS (DIRECT CALL) ===")
    _print_numbered(direct_hints)

    agent_prompt = build_unified_prompt(obs)
    print("\n=== PROMPT TARGETING SECTION SEEN BY AGENT ===")
    print(_extract_targeting_section(agent_prompt))
# Run the visualization only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()