| """Module for training different types of models for code comment classification."""
|
|
|
| import argparse
|
| import logging
|
| import os
|
|
|
| import dagshub
|
| from datasets import Dataset
|
| import mlflow
|
| import yaml
|
|
|
| from .utils import load_dataset_splits, parse_labels_column
|
|
|
# Module-wide logging: INFO level, timestamped records with logger name/level.
logging.basicConfig(

level=logging.INFO,

format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",

)

logger = logging.getLogger(__name__)




# NOTE(review): import-time side effect — this contacts the DagsHub-hosted
# MLflow tracking server as soon as the module is imported (presumably needs
# credentials/network; verify). Consider moving it under the __main__ guard.
dagshub.init(repo_owner="se4ai2526-uniba", repo_name="TheClouds", mlflow=True)
|
|
|
|
|
def train_model(lang, model_type, data_path, model_output_path, params):
    """Train and save a model for a specific language and model type.

    Loads the ``{lang}_train`` / ``{lang}_test`` splits, parses their label
    columns, and dispatches to a model-specific trainer. Each trainer logs
    parameters/metrics to MLflow and persists the trained model.

    Args:
        lang: Language key used to select dataset splits (e.g. ``"java"``).
        model_type: One of ``"setfit"``, ``"random_forest"``, ``"transformer"``.
        data_path: Directory containing the raw dataset splits.
        model_output_path: Path (or path prefix) where the model is saved.
        params: Model-specific hyperparameters from ``params.yaml``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Use the module logger instead of print() for consistency with the
    # rest of the module (the transformer branch already logs this way).
    logger.info(
        "--- Starting training for language: %s with model: %s ---", lang, model_type
    )

    ds = load_dataset_splits(data_path)

    train_df = parse_labels_column(ds[f"{lang}_train"])
    eval_df = parse_labels_column(ds[f"{lang}_test"])

    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

    # Dispatch to the model-specific trainer; each helper owns its MLflow run.
    if model_type == "setfit":
        _train_setfit(lang, model_type, model_output_path, params, train_dataset, eval_dataset)
    elif model_type == "random_forest":
        _train_random_forest(lang, model_type, model_output_path, params, train_dataset, eval_dataset)
    elif model_type == "transformer":
        _train_transformer(lang, model_type, model_output_path, params)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    logger.info("Model for %s-%s saved to %s", lang, model_type, model_output_path)


def _train_setfit(lang, model_type, model_output_path, params, train_dataset, eval_dataset):
    """Train a multi-output SetFit classifier and log the run to MLflow."""
    from setfit import SetFitModel, Trainer, TrainingArguments

    mlflow.set_experiment("SetFit Training")
    # start_run() as a context manager ends the run on exit, so the explicit
    # mlflow.end_run() the original code carried inside the block was redundant.
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        model = SetFitModel.from_pretrained(
            "sentence-transformers/paraphrase-MiniLM-L6-v2",
            multi_target_strategy="multi-output",
        )
        args = TrainingArguments(**params)
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            # Map dataset columns to the text/label names SetFit expects.
            column_mapping={"combo": "text", "labels": "label"},
        )

        mlflow.log_param("num_epochs", args.num_epochs)
        mlflow.log_param("num_iterations", args.num_iterations)

        trainer.train()

        eval_metrics = trainer.evaluate()
        for metric_name, metric_value in eval_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        trainer.model.save_pretrained(model_output_path)

        # NOTE(review): passing a SetFit save directory to
        # mlflow.transformers.log_model assumes it is loadable as a
        # transformers pipeline — confirm this round-trips on your MLflow version.
        mlflow.transformers.log_model(
            transformers_model=model_output_path,
            artifact_path=f"{lang}_setfit_model",
            task="text-classification",
        )


def _train_random_forest(lang, model_type, model_output_path, params, train_dataset, eval_dataset):
    """Train a TF-IDF + multi-output random-forest pipeline and log it to MLflow."""
    import joblib
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.multioutput import MultiOutputClassifier
    from sklearn.pipeline import Pipeline

    # Defensive copy: the pop() calls below would otherwise mutate the
    # caller's params dict.
    params = dict(params)

    mlflow.set_experiment("Random Forest Training")
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        # Log the full parameter set before splitting it between the
        # vectorizer and the classifier.
        mlflow.log_params(params)

        # Vectorizer-specific keys are popped out; whatever remains is
        # forwarded to RandomForestClassifier.
        tfidf_params = {
            "ngram_range": tuple(params.pop("ngram_range", (1, 1))),
            "max_features": params.pop("max_features", None),
            "min_df": params.pop("min_df", 1),
            "max_df": params.pop("max_df", 1.0),
        }
        rf_params = params

        pipeline = Pipeline(
            [
                ("tfidf", TfidfVectorizer(**tfidf_params)),
                (
                    "clf",
                    MultiOutputClassifier(
                        RandomForestClassifier(
                            random_state=42, class_weight="balanced", **rf_params
                        )
                    ),
                ),
            ]
        )

        X_train = train_dataset["combo"]
        y_train = np.array(train_dataset["labels"])
        pipeline.fit(X_train, y_train)

        X_test = eval_dataset["combo"]
        y_test = np.array(eval_dataset["labels"])
        # Pipeline.score delegates to the classifier's subset accuracy for
        # multi-output targets.
        score = pipeline.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)

        # Guard against a bare filename, where dirname("") would make
        # os.makedirs raise FileNotFoundError.
        output_dir = os.path.dirname(model_output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, f"{model_output_path}.joblib")

        mlflow.sklearn.log_model(
            sk_model=pipeline, artifact_path=f"{lang}_random_forest_model"
        )


def _train_transformer(lang, model_type, model_output_path, params):
    """Fine-tune a transformer via TransformerTrainer and log metrics to MLflow."""
    from .transformer import (
        TransformerConfig,
        TransformerTrainer,
    )

    mlflow.set_experiment("Transformer Training")
    with mlflow.start_run(run_name=f"train-{lang}-{model_type}"):
        mlflow.log_param("language", lang)
        mlflow.log_param("model_type", model_type)
        mlflow.log_params(params)

        # Defaults mirror the TransformerConfig contract; params.yaml values
        # override them when present.
        cfg = TransformerConfig(
            lang=lang,
            raw_data_dir="data/raw",
            processed_data_dir="data/processed/transformer",
            model_output_path=model_output_path,
            pretrained_model_name=params.get(
                "pretrained_model_name", "microsoft/codebert-base"
            ),
            max_length=params.get("max_length", 128),
            batch_size=params.get("batch_size", 16),
            lr=params.get("lr", 2e-5),
            num_epochs=params.get("num_epochs", 5),
            warmup_ratio=params.get("warmup_ratio", 0.1),
            pos_weight_cap=params.get("pos_weight_cap", 30.0),
            threshold=params.get("threshold", 0.5),
            preprocessing=params.get("preprocessing", False),
            preprocessing_factor=params.get("preprocessing_factor", 1.0),
        )

        logger.info(
            "Starting transformer training for language '%s' with config: %s",
            lang,
            cfg,
        )

        trainer = TransformerTrainer(cfg)
        metrics = trainer.run()

        logger.info("Final transformer metrics for %s: %s", lang, metrics)

        for name, value in metrics.items():
            mlflow.log_metric(f"final_{name}", value)
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: train one (language, model_type) pair using the
    # hyperparameters stored under that model_type in params.yaml.
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, required=True)
    parser.add_argument("--model_type", type=str, required=True)
    args = parser.parse_args()

    # Explicit encoding avoids platform-dependent defaults when reading config.
    with open("params.yaml", "r", encoding="utf-8") as f:
        all_params = yaml.safe_load(f)

    try:
        # Copy so train_model may consume the dict without touching all_params.
        model_params = all_params[args.model_type].copy()
    except KeyError as err:
        # Fail with a readable CLI error instead of a bare KeyError traceback.
        raise SystemExit(
            f"No parameters found for model_type '{args.model_type}' in params.yaml"
        ) from err

    train_model(
        lang=args.lang,
        model_type=args.model_type,
        data_path="data/raw",
        model_output_path=f"models/{args.lang}/{args.model_type}",
        params=model_params,
    )
|
|
|