Fixed evaluation indentation and other bugs
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -10,4 +10,8 @@ models/
|
|||||||
backup/*.csv
|
backup/*.csv
|
||||||
runs/
|
runs/
|
||||||
outputs/
|
outputs/
|
||||||
*.pt
|
*.pt
|
||||||
|
__pycache__/
|
||||||
|
*.png
|
||||||
|
*.jpg
|
||||||
|
*.json
|
||||||
|
|||||||
BIN
src/__pycache__/model.cpython-311.pyc
Normal file
BIN
src/__pycache__/model.cpython-311.pyc
Normal file
Binary file not shown.
211
src/evaluate.py
211
src/evaluate.py
@@ -67,120 +67,121 @@ def main():
|
|||||||
model = SingleTaskModel(args.task, task_classes[args.task]).to(device)
|
model = SingleTaskModel(args.task, task_classes[args.task]).to(device)
|
||||||
active_tasks = [args.task]
|
active_tasks = [args.task]
|
||||||
|
|
||||||
print(f"Loading weights from {args.model_path}...")
|
print(f"Loading weights from {args.model_path}...")
|
||||||
model.load_state_dict(torch.load(args.model_path, map_location=device))
|
model.load_state_dict(torch.load(args.model_path, map_location=device))
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
all_labels = {task: [] for task in active_tasks}
|
all_labels = {task: [] for task in active_tasks}
|
||||||
all_preds = {task: [] for task in active_tasks}
|
all_preds = {task: [] for task in active_tasks}
|
||||||
all_confidences = {task: [] for task in active_tasks}
|
all_confidences = {task: [] for task in active_tasks}
|
||||||
|
|
||||||
print("Running inference on test set").upper()
|
print("Running inference on test set")
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
for batch in test_loader:
|
for batch in test_loader:
|
||||||
input_ids = batch['input_ids'].to(device)
|
input_ids = batch['input_ids'].to(device)
|
||||||
attention_mask = batch['attention_mask'].to(device)
|
attention_mask = batch['attention_mask'].to(device)
|
||||||
outputs = model(input_ids, attention_mask)
|
outputs = model(input_ids, attention_mask)
|
||||||
for task in active_tasks:
|
for task in active_tasks:
|
||||||
labels = batch[task].to_device()
|
labels = batch[task].to(device)
|
||||||
logits = outputs[task]
|
logits = outputs[task]
|
||||||
preds = torch.argmax(logits, dim=1)
|
preds = torch.argmax(logits, dim=1)
|
||||||
|
|
||||||
probs = F.softmax(logits, dim=1)
|
probs = F.softmax(logits, dim=1)
|
||||||
confidence = probs.max(dim=1).values
|
confidence = probs.max(dim=1).values
|
||||||
|
|
||||||
all_labels[task].extend(labels.cpu().numpy())
|
all_labels[task].extend(labels.cpu().numpy())
|
||||||
all_preds[task].extend(preds.cpu().numpy())
|
all_preds[task].extend(preds.cpu().numpy())
|
||||||
all_confidences[task].extend(confidence.cpu().numpy())
|
all_confidences[task].extend(confidence.cpu().numpy())
|
||||||
|
|
||||||
summary = {
|
summary = {
|
||||||
"mode": args.mode,
|
"mode": args.mode,
|
||||||
"dataset": args.dataset,
|
"dataset": args.dataset,
|
||||||
"task": args.task,
|
"task": args.task,
|
||||||
"model_path": args.model_path,
|
"model_path": args.model_path,
|
||||||
"results": {}
|
"results": {}
|
||||||
|
}
|
||||||
|
|
||||||
|
test_df = pd.read_csv(test) # for later
|
||||||
|
|
||||||
|
for task in active_tasks:
|
||||||
|
print(f"\nFor Task: {task.upper()}\n")
|
||||||
|
|
||||||
|
labels_arr = np.array(all_labels[task])
|
||||||
|
preds_arr = np.array(all_preds[task])
|
||||||
|
conf_arr = np.array(all_confidences[task])
|
||||||
|
|
||||||
|
print(f"\nClassification Report")
|
||||||
|
report = classification_report(
|
||||||
|
labels_arr,
|
||||||
|
preds_arr,
|
||||||
|
target_names=label_names[task],
|
||||||
|
digits=4,
|
||||||
|
zero_division=0
|
||||||
|
)
|
||||||
|
print(report)
|
||||||
|
|
||||||
|
report_dict = classification_report(
|
||||||
|
labels_arr,
|
||||||
|
preds_arr,
|
||||||
|
target_names=label_names[task],
|
||||||
|
output_dict=True,
|
||||||
|
zero_division=0
|
||||||
|
)
|
||||||
|
|
||||||
|
correct = (labels_arr == preds_arr)
|
||||||
|
mean_conf = conf_arr.mean()
|
||||||
|
mean_conf_correct = conf_arr[correct].mean() if correct.any() else 0
|
||||||
|
mean_conf_incorrect = conf_arr[~correct].mean() if (~correct).any() else 0
|
||||||
|
|
||||||
|
print(f"Overall Mean confidence: {mean_conf:.4f}")
|
||||||
|
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
|
||||||
|
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
|
||||||
|
|
||||||
|
# save summary to JSON
|
||||||
|
summary["results"][task] = {
|
||||||
|
"macro_f1": float(report_dict["macro avg"]["f1-score"]),
|
||||||
|
"macro_precision": float(report_dict["macro avg"]["precision"]),
|
||||||
|
"macro_recall": float(report_dict["macro avg"]["recall"]),
|
||||||
|
"confidence": {
|
||||||
|
"overall": float(mean_conf),
|
||||||
|
"correct": float(mean_conf_correct),
|
||||||
|
"incorrect": float(mean_conf_incorrect)
|
||||||
|
},
|
||||||
|
"per_class": report_dict
|
||||||
}
|
}
|
||||||
|
|
||||||
test_df = pd.read_csv(test) # for later
|
# Confusion matrix
|
||||||
|
|
||||||
for task in active_tasks:
|
cm = confusion_matrix(labels_arr, preds_arr)
|
||||||
print(f"\nFor Task: {task.upper()}\n")
|
fig, ax = plt.subplots(figsize=(8, 6))
|
||||||
|
sns.heatmap(
|
||||||
labels_arr = np.array(all_labels[task])
|
cm, annot=True, fmt="d", cmap="Blues", cbar=False,
|
||||||
preds_arr = np.array(all_preds[task])
|
xticklabels=label_names[task], yticklabels=label_names[task],
|
||||||
conf_arr = np.array(all_confidences[task])
|
ax=ax
|
||||||
|
)
|
||||||
print(f"\nClassification Report")
|
ax.set_xlabel("Predicted Label", fontweight="bold")
|
||||||
report = classification_report(
|
ax.set_ylabel("True Label", fontweight="bold")
|
||||||
labels_arr,
|
ax.set_title(f"{task.replace('_', ' ').title()} Confusion Matrix ({args.mode.upper()})", fontweight="bold")
|
||||||
preds_arr,
|
|
||||||
target_names=label_names[task],
|
|
||||||
digits=4,
|
|
||||||
zero_division=0
|
|
||||||
)
|
|
||||||
print(report)
|
|
||||||
|
|
||||||
report_dict = classification_report(
|
|
||||||
labels_arr,
|
|
||||||
target_names=label_names[task],
|
|
||||||
output_dict=True,
|
|
||||||
zero_division=0
|
|
||||||
)
|
|
||||||
|
|
||||||
correct = (labels_arr == preds_arr)
|
|
||||||
mean_conf = conf_arr.mean()
|
|
||||||
mean_conf_correct = conf_arr[correct].mean() if correct.any() else 0
|
|
||||||
mean_conf_incorrect = conf_arr[~correct].mean() if (~correct).any() else 0
|
|
||||||
|
|
||||||
print(f"Overall Mean confidence: {mean_conf:.4f}")
|
|
||||||
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
|
|
||||||
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
|
|
||||||
|
|
||||||
# save summary to JSON
|
|
||||||
summary["results"][task] = {
|
|
||||||
"macro_f1": report_dict["macro avg"]["f1-score"],
|
|
||||||
"macro_precision": report_dict["macro avg"]["precision"],
|
|
||||||
"macro_recall": report_dict["macro avg"]["recall"],
|
|
||||||
"confidence": {
|
|
||||||
"overall": mean_conf,
|
|
||||||
"correct": mean_conf_correct,
|
|
||||||
"incorrect": mean_conf_incorrect
|
|
||||||
},
|
|
||||||
"per_class": report_dict
|
|
||||||
}
|
|
||||||
|
|
||||||
# Confusion matrix
|
|
||||||
|
|
||||||
cm = confusion_matrix(labels_arr, preds_arr)
|
|
||||||
fig, ax = plt.subplots(figsize=(8, 6))
|
|
||||||
sns.heatmap(
|
|
||||||
cm, annot=True, fmt="d", cmap="Blues", cbar=False,
|
|
||||||
xticklabels=label_names[task], yticklabels=label_names[task],
|
|
||||||
ax=ax
|
|
||||||
)
|
|
||||||
ax.set_xlabel("Predicted Label", fontweight="bold")
|
|
||||||
ax.set_ylabel("True Label", fontweight="bold")
|
|
||||||
ax.set_title(f"{task.replace("_", " ").title()} Confusion Matrix ({args.mode.upper()})", fontweight="bold")
|
|
||||||
|
|
||||||
run_name = args.task if args.mode == "stl" else "mtl"
|
|
||||||
cm_path = f"outputs/figures/cm_{args.mode}_{args.dataset}_{task}.png"
|
|
||||||
fig.savefig(cm_path, dpi = 150, bbox_inches='tight')
|
|
||||||
plt.close(fig)
|
|
||||||
print("Saved cm to path", cm_path)
|
|
||||||
|
|
||||||
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
|
|
||||||
test_df[f'{task}_confidence'] = conf_arr
|
|
||||||
|
|
||||||
# to JSON
|
|
||||||
run_name = args.task if args.mode == "stl" else "mtl"
|
run_name = args.task if args.mode == "stl" else "mtl"
|
||||||
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
|
cm_path = f"outputs/figures/cm_{args.mode}_{args.dataset}_{task}.png"
|
||||||
with open(json_path, "w") as f:
|
fig.savefig(cm_path, dpi = 150, bbox_inches='tight')
|
||||||
json.dump(summary, f, indent=4)
|
plt.close(fig)
|
||||||
print(f"Saved evaluation summary to {json_path}")
|
print("Saved cm to path", cm_path)
|
||||||
|
|
||||||
csv_path = f"outputs/test_predictions_{args.mode}_{run_name}_{args.dataset}.csv"
|
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
|
||||||
test_df.to_csv(csv_path, index=False)
|
test_df[f'{task}_confidence'] = conf_arr
|
||||||
print("Saved raw predictions to CSV at", csv_path)
|
|
||||||
|
# to JSON
|
||||||
|
run_name = args.task if args.mode == "stl" else "mtl"
|
||||||
|
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
|
||||||
|
with open(json_path, "w") as f:
|
||||||
|
json.dump(summary, f, indent=4)
|
||||||
|
print(f"Saved evaluation summary to {json_path}")
|
||||||
|
|
||||||
|
csv_path = f"outputs/test_predictions_{args.mode}_{run_name}_{args.dataset}.csv"
|
||||||
|
test_df.to_csv(csv_path, index=False)
|
||||||
|
print("Saved raw predictions to CSV at", csv_path)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Reference in New Issue
Block a user