| """Module for evaluating models on test set."""
|
|
|
| import argparse
|
| import json
|
| import os
|
| import time
|
|
|
| import dagshub
|
| import joblib
|
| import mlflow
|
| import numpy as np
|
| import pandas as pd
|
| from setfit import SetFitModel
|
| import torch
|
| from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
|
|
| from .utils import load_dataset_splits, parse_labels_column
|
|
|
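# Category labels per language. The list order must line up with the label
# columns produced by parse_labels_column, since the per-category scores below
# look up LABELS[lang] by column index.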
LABELS = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

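# Point MLflow tracking at the project's DagsHub remote; this runs at import time.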
dagshub.init(repo_owner="se4ai2526-uniba", repo_name="TheClouds", mlflow=True)


def evaluate_and_benchmark(lang, model_type, model_path, data_path, metrics_output_path):
    """Load a trained model, benchmark its inference runtime and FLOPs,
    compute per-category metrics on the test split, and log the results
    to a new MLflow run.
    """
    mlflow.set_experiment("Model Benchmarking")
    print(f"Starting Evaluation & Benchmarking for language: {lang} and model: {model_type}")

    with mlflow.start_run(run_name=f"evaluation_local_{lang}_{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("model_path", model_path)
        mlflow.log_param("data_path", data_path)

        avg_runtime_sec = 0.0
        avg_gflops = 0.0

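        # SetFit: load the fine-tuned model and profile 10 prediction passes over
        # the test sentences to estimate average runtime and GFLOPs.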
        if model_type == "setfit":
            ds = load_dataset_splits(base_dir=data_path, langs=[lang])
            eval_df = parse_labels_column(ds[f"{lang}_test"])

            x_eval = eval_df["combo"].astype(str).tolist()
            y_true = np.array(eval_df["labels"].tolist(), dtype=int)

            model = SetFitModel.from_pretrained(model_path)

            with torch.profiler.profile(with_flops=True) as p:
                begin = time.time()
                for _ in range(10):
                    y_pred = model(x_eval)
                total_runtime = time.time() - begin

            avg_runtime_sec = total_runtime / 10
            avg_gflops = (sum(k.flops for k in p.key_averages()) / 1e9) / 10

            y_pred = np.array(y_pred)

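        # Random forest: model loaded from joblib and timed on CPU only; no FLOP
        # estimate is available here, so avg_gflops stays at 0.0.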
| elif model_type == "random_forest":
|
| ds = load_dataset_splits(base_dir=data_path, langs=[lang])
|
| eval_df = parse_labels_column(ds[f"{lang}_test"])
|
|
|
| x_eval = eval_df["combo"].astype(str).tolist()
|
| y_true = np.array(eval_df["labels"].tolist(), dtype=int)
|
|
|
| model = joblib.load(f"{model_path}.joblib")
|
|
|
| begin = time.time()
|
| for _ in range(10):
|
| y_pred = model.predict(x_eval)
|
| total_runtime = time.time() - begin
|
|
|
| avg_runtime_sec = total_runtime / 10
|
| avg_gflops = 0.0
|
|
|
| y_pred = np.array(y_pred)
|
|
|
|
|
|
|
|
|
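        # Transformer: tokenize the full test split as one batch, run 10 profiled
        # forward passes, and derive multi-label predictions by thresholding the
        # sigmoid of the logits at 0.5.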
| elif model_type == "transformer":
|
| test_csv_path = os.path.join(data_path, f"{lang}_test.csv")
|
| if not os.path.exists(test_csv_path):
|
| raise FileNotFoundError(f"Test CSV for transformer not found: {test_csv_path}")
|
|
|
| df_test = pd.read_csv(test_csv_path)
|
| df_test = parse_labels_column(df_test)
|
|
|
|
|
| if "combo" not in df_test.columns:
|
| df_test["combo"] = (
|
| df_test["comment_sentence"].astype(str) + " | " + df_test["class"].astype(str)
|
| )
|
|
|
| texts = df_test["combo"].astype(str).tolist()
|
| y_true = np.array(df_test["labels"].tolist(), dtype=int)
|
|
|
| tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)
|
| model.eval()
|
|
|
| enc = tokenizer(
|
| texts,
|
| padding=True,
|
| truncation=True,
|
| max_length=128,
|
| return_tensors="pt",
|
| )
|
| enc = {k: v.to(DEVICE) for k, v in enc.items()}
|
|
|
| with torch.no_grad():
|
| with torch.profiler.profile(with_flops=True) as p:
|
| begin = time.time()
|
| for _ in range(10):
|
| outputs = model(**enc)
|
| total_runtime = time.time() - begin
|
|
|
| logits = outputs.logits
|
| probs = torch.sigmoid(logits)
|
| y_pred = (probs > 0.5).long().cpu().numpy()
|
|
|
| avg_runtime_sec = total_runtime / 10
|
| avg_gflops = (sum(k.flops for k in p.key_averages()) / 1e9) / 10
|
|
|
        else:
            raise ValueError(f"Unsupported model_type: {model_type}")

        print(f"Avg runtime in seconds: {avg_runtime_sec:.4f}")
        mlflow.log_metric("avg_runtime_sec", avg_runtime_sec)
        mlflow.log_metric("avg_gflops", avg_gflops)

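        # Per-category precision/recall/F1: transpose (samples x labels) into
        # (labels x samples) so row i holds the binary predictions for
        # category LABELS[lang][i].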
        scores = []
        y_true_transposed = y_true.T
        y_pred_transposed = y_pred.T

        for i in range(len(y_pred_transposed)):
            tp = np.logical_and(y_true_transposed[i] == 1, y_pred_transposed[i] == 1).sum()
            fp = np.logical_and(y_true_transposed[i] == 0, y_pred_transposed[i] == 1).sum()
            fn = np.logical_and(y_true_transposed[i] == 1, y_pred_transposed[i] == 0).sum()

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0

            scores.append(
                {
                    "lan": lang,
                    "cat": LABELS[lang][i],
                    "precision": precision,
                    "recall": recall,
                    "f1": f1,
                }
            )

        lan_scores_df = pd.DataFrame(scores)

        avg_f1 = lan_scores_df["f1"].mean()
        avg_precision = lan_scores_df["precision"].mean()
        avg_recall = lan_scores_df["recall"].mean()

        mlflow.log_metric("avg_f1_score", avg_f1)
        mlflow.log_metric("avg_precision", avg_precision)
        mlflow.log_metric("avg_recall", avg_recall)

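        # Mirror the logged metrics into a JSON file at metrics_output_path so
        # they can also be tracked by DVC.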
        dvc_metrics = {
            "avg_f1_score": avg_f1,
            "avg_precision": avg_precision,
            "avg_recall": avg_recall,
            "avg_runtime_sec": avg_runtime_sec,
            "avg_gflops": avg_gflops,
        }
        os.makedirs(os.path.dirname(metrics_output_path), exist_ok=True)
        with open(metrics_output_path, "w") as f:
            json.dump(dvc_metrics, f, indent=4)


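# Example invocation (the package/module path below is a placeholder; this file
# uses a relative import, so run it as a module from the project root):
#   python -m <package>.<this_module> --lang java --model_type setfit --data_path data/raw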
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, required=True)
    parser.add_argument("--model_type", type=str, required=True)
    parser.add_argument(
        "--data_path",
        type=str,
        default="data/raw",
        help=(
            "Path to evaluation data. "
            "For setfit/random_forest: base dir with raw CSVs (e.g. data/raw). "
            "For transformer: directory with {lang}_test.csv (e.g. data/processed/transformer)."
        ),
    )
    args = parser.parse_args()

    evaluate_and_benchmark(
        lang=args.lang,
        model_type=args.model_type,
        model_path=f"models/{args.lang}/{args.model_type}",
        data_path=args.data_path,
        metrics_output_path=f"reports/metrics/{args.lang}/{args.model_type}_metrics.json",
    )