Added evaluation pipeline

This commit is contained in:
2026-02-26 20:15:19 +00:00
parent 96a0c45e84
commit 99896c0873
4 changed files with 193 additions and 1 deletions

Binary file not shown.

View File

@@ -0,0 +1,186 @@
# evaluate.py
import os
import torch
import time
import argparse
import json
import torch.nn.functional as F
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from dataset import ReviewDataset
from model import Model, SingleTaskModel
# TODO: load checkpoint, produce tables of evaluation figures
# Seed the NumPy and PyTorch RNGs up front so repeated runs start from the same state.
SEED = 4321
np.random.seed(SEED)
torch.manual_seed(SEED)

# Display names for every task's classes, used in reports and figures in place
# of raw numeric ids. Position i in each list is the name shown for class id i.
label_names = dict(
    bug_report=['No', 'Yes'],
    feature_request=['No', 'Yes'],
    aspect=['App', 'Driver', 'General', 'Payment', 'Pricing', 'Service'],
    aspect_sentiment=['Positive', 'Neutral', 'Negative'],
)
def parse_args(argv=None):
    """Parse the command-line options of the evaluation script.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` keeps the normal
            command-line behaviour, so existing ``parse_args()`` calls are
            unaffected.

    Returns:
        argparse.Namespace with the attributes ``mode``, ``task``,
        ``dataset``, ``model_path`` and ``batch_size``.
    """
    parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
    parser.add_argument(
        "--mode", type=str, required=True, choices=["mtl", "stl"],
        help="mtl or stl",
    )
    parser.add_argument(
        "--task", type=str, default="all",
        choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"],
        help="task to evaluate (default: all)",
    )
    parser.add_argument(
        "--dataset", type=str, required=True, choices=["original", "boosted"],
        help="which dataset variant to evaluate on",
    )
    parser.add_argument(
        "--model_path", type=str, required=True,
        help=".pt file path",
    )
    parser.add_argument(
        "--batch_size", type=int, default=16,
        help="batch size used for inference (default: 16)",
    )
    return parser.parse_args(argv)
def main():
    """Evaluate a trained MTL or STL checkpoint on the test split.

    Steps:
      1. Parse CLI arguments and build the model matching ``--mode``.
      2. Load the checkpoint weights and run inference over the test CSV.
      3. For every evaluated task, print a classification report and
         confidence statistics, and save a confusion-matrix figure to
         ``outputs/figures``.
      4. Write a JSON summary and a CSV with per-row predictions and
         confidences to ``outputs``.

    Raises:
        ValueError: if ``--mode stl`` is used without a single ``--task``.
    """
    args = parse_args()

    # Fail fast, before any tokenizer/dataset/model loading happens.
    if args.mode == "stl" and args.task == "all":
        raise ValueError("For STL, please specify a single task with --task")

    print(f"Evaluating {args.mode.upper()} model on {args.dataset} dataset for task: {args.task}")

    os.makedirs("outputs/figures", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_path = f"data/processed/{args.dataset}_test.csv"

    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    test_dataset = ReviewDataset(test_path, tokenizer)
    # No shuffling, so predictions are collected in dataset order.
    # NOTE(review): the CSV export below assumes ReviewDataset yields rows in
    # the same order as the CSV file -- confirm against dataset.py.
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)

    if args.mode == "mtl":
        model = Model().to(device)
        active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
    else:
        # Derive the head size from label_names so the two cannot drift apart.
        model = SingleTaskModel(args.task, len(label_names[args.task])).to(device)
        active_tasks = [args.task]

    print(f"Loading weights from {args.model_path}...")
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(args.model_path, map_location=device))
    model.eval()

    all_labels = {task: [] for task in active_tasks}
    all_preds = {task: [] for task in active_tasks}
    all_confidences = {task: [] for task in active_tasks}

    print("Running inference on test set".upper())
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            for task in active_tasks:
                # Labels are only collected for metrics, so they never need
                # to be moved onto the compute device.
                labels = batch[task]
                logits = outputs[task]
                preds = torch.argmax(logits, dim=1)
                probs = F.softmax(logits, dim=1)
                # Confidence = probability assigned to the most likely class.
                confidence = probs.max(dim=1).values
                all_labels[task].extend(labels.cpu().numpy())
                all_preds[task].extend(preds.cpu().numpy())
                all_confidences[task].extend(confidence.cpu().numpy())

    summary = {
        "mode": args.mode,
        "dataset": args.dataset,
        "task": args.task,
        "model_path": args.model_path,
        "results": {}
    }
    # Raw test rows; prediction/confidence columns are appended per task below.
    test_df = pd.read_csv(test_path)
    run_name = args.task if args.mode == "stl" else "mtl"

    for task in active_tasks:
        print(f"\nFor Task: {task.upper()}\n")
        labels_arr = np.array(all_labels[task])
        preds_arr = np.array(all_preds[task])
        conf_arr = np.array(all_confidences[task])

        # Passing the full list of class ids keeps target_names and the
        # confusion-matrix tick labels aligned even when a class is absent
        # from both the labels and the predictions.
        class_ids = list(range(len(label_names[task])))

        print("\nClassification Report")
        report = classification_report(
            labels_arr,
            preds_arr,
            labels=class_ids,
            target_names=label_names[task],
            digits=4,
            zero_division=0
        )
        print(report)
        report_dict = classification_report(
            labels_arr,
            preds_arr,
            labels=class_ids,
            target_names=label_names[task],
            output_dict=True,
            zero_division=0
        )

        correct = (labels_arr == preds_arr)
        # Cast to built-in float: NumPy float32 scalars are not JSON serializable.
        mean_conf = float(conf_arr.mean())
        mean_conf_correct = float(conf_arr[correct].mean()) if correct.any() else 0.0
        mean_conf_incorrect = float(conf_arr[~correct].mean()) if (~correct).any() else 0.0
        print(f"Overall Mean confidence: {mean_conf:.4f}")
        print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
        print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")

        # Collect this task's metrics for the JSON summary.
        summary["results"][task] = {
            "macro_f1": report_dict["macro avg"]["f1-score"],
            "macro_precision": report_dict["macro avg"]["precision"],
            "macro_recall": report_dict["macro avg"]["recall"],
            "confidence": {
                "overall": mean_conf,
                "correct": mean_conf_correct,
                "incorrect": mean_conf_incorrect
            },
            "per_class": report_dict
        }

        # Confusion matrix
        cm = confusion_matrix(labels_arr, preds_arr, labels=class_ids)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=label_names[task], yticklabels=label_names[task],
            ax=ax
        )
        ax.set_xlabel("Predicted Label", fontweight="bold")
        ax.set_ylabel("True Label", fontweight="bold")
        # Built outside the f-string: reusing the enclosing quote character
        # inside an f-string is a syntax error before Python 3.12.
        task_title = task.replace("_", " ").title()
        ax.set_title(f"{task_title} Confusion Matrix ({args.mode.upper()})", fontweight="bold")
        cm_path = f"outputs/figures/cm_{args.mode}_{args.dataset}_{task}.png"
        fig.savefig(cm_path, dpi=150, bbox_inches='tight')
        plt.close(fig)
        print("Saved cm to path", cm_path)

        # Map numeric predictions to human-readable class names.
        test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr]
        test_df[f'{task}_confidence'] = conf_arr

    # Write the summary to JSON.
    json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
    with open(json_path, "w") as f:
        json.dump(summary, f, indent=4)
    print(f"Saved evaluation summary to {json_path}")

    csv_path = f"outputs/test_predictions_{args.mode}_{run_name}_{args.dataset}.csv"
    test_df.to_csv(csv_path, index=False)
    print("Saved raw predictions to CSV at", csv_path)


if __name__ == "__main__":
    main()

View File

@@ -11,7 +11,7 @@ import torch.nn as nn
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
# Each hidden representation is size 768 # Each hidden representation is size 768
class SingleTaskModel(nn.Module): # SINGLE TASK MODEL ARCHITECTURE class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
def __init__(self, task_name, num_classes, dropout_rate=0.2): def __init__(self, task_name, num_classes, dropout_rate=0.2):
super().__init__() super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")

View File

@@ -3,6 +3,7 @@
import argparse # argparse for later switching to boosted data import argparse # argparse for later switching to boosted data
import os import os
from datetime import datetime from datetime import datetime
import time
import torch import torch
import random import random
import numpy as np import numpy as np
@@ -153,6 +154,7 @@ def main():
) )
# ------------------- Training loop ------------------- # ------------------- Training loop -------------------
start_time = time.time()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}') writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -256,6 +258,10 @@ def main():
writer.close() writer.close()
print("Training complete.") print("Training complete.")
end_time = time.time()
print(f"Total training time: {end_time - start_time:.2f} seconds")
if torch.cuda.is_available():
print(f"Peak GPU memory usage: {torch.cuda.max_memory_allocated(device) / (1024**3)} GB")
if __name__ == "__main__": if __name__ == "__main__":
main() main()