Implemented the initial training structure; further logic (loss functions, early stopping, optimisation and the training loop) will be added soon
This commit is contained in:
BIN
src/__pycache__/model.cpython-313.pyc
Normal file
BIN
src/__pycache__/model.cpython-313.pyc
Normal file
Binary file not shown.
@@ -43,14 +43,16 @@ class ReviewDataset(Dataset):
|
|||||||
return {
|
return {
|
||||||
'input_ids': encoding['input_ids'].squeeze(0),
|
'input_ids': encoding['input_ids'].squeeze(0),
|
||||||
'attention_mask': encoding['attention_mask'].squeeze(0),
|
'attention_mask': encoding['attention_mask'].squeeze(0),
|
||||||
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report']),
|
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
|
||||||
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request']),
|
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
|
||||||
'aspect': torch.tensor(self.df.iloc[idx]['aspect']),
|
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
|
||||||
'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'])
|
'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'], dtype=torch.long)
|
||||||
}
|
}
|
||||||
|
|
||||||
# uber = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
|
if __name__ == "__main__":
|
||||||
# print(uber.__getitem__(1))
|
dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
|
||||||
|
print(dataset.__getitem__(1))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
66
src/train.py
66
src/train.py
@@ -1,7 +1,69 @@
|
|||||||
# train.py
|
# train.py
|
||||||
|
import torch
|
||||||
|
from sklearn.utils.class_weight import compute_class_weight
|
||||||
|
import numpy as np
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from dataset import ReviewDataset
|
||||||
|
from model import Model
|
||||||
|
|
||||||
|
# class weights, training loop and early stopping
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
class multiTaskModel():
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
|
|
||||||
|
train = "data/processed/original_train.csv"
|
||||||
|
val = "data/processed/original_val.csv"
|
||||||
|
train_dataset = ReviewDataset(train, tokenizer)
|
||||||
|
val_dataset = ReviewDataset(val, tokenizer)
|
||||||
|
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
|
||||||
|
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
|
||||||
|
|
||||||
|
model = Model().to(device)
|
||||||
|
|
||||||
|
|
||||||
|
# move input_ids, attention_mask and labels to device in each batch
|
||||||
|
|
||||||
|
# ------------------- Class weights -------------------
|
||||||
|
# Using weights inversely proportional to class frequencies to avoid majority class bias,
|
||||||
|
# prioritize useful bug reports / feature requests
|
||||||
|
def compute_weights(train_df, column):
    """Return balanced class weights for one label column.

    Args:
        train_df: Training data frame containing ``column``.
        column: Name of the label column to compute weights for.

    Returns:
        A ``torch.float`` tensor of per-class weights, moved to the
        module-level ``device``. Weights follow the order of the unique
        label values returned by ``np.unique``.
    """
    labels = train_df[column]
    # 'balanced' weights each class inversely to its frequency in `labels`.
    balanced = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels,
    )
    weight_tensor = torch.tensor(balanced, dtype=torch.float)
    return weight_tensor.to(device)
|
||||||
|
|
||||||
|
# -------------------- Loss functions -------------------
|
||||||
|
# Idea for later: combine the per-task losses as a weighted sum:
|
||||||
|
# 1.0 * bug_loss +
|
||||||
|
# 1.0 * feature_loss +
|
||||||
|
# 0.5 * aspect_loss +
|
||||||
|
# 0.5 * sentiment_loss
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------- Optimizer and scheduler -------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------- Training loop -------------------
|
||||||
|
# For each epoch:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------- Stopping logic -------------------
|
||||||
|
# After each epoch, compute the mean of the 4 macro-F1 scores
|
||||||
|
# If there is no improvement for 3 consecutive epochs, stop training
|
||||||
|
# This limits overfitting and avoids spending time and resources on epochs that no longer help
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
train_df = pd.read_csv(train)
|
||||||
|
bug_weights = compute_weights(train_df, 'bug_report')
|
||||||
|
feature_weights = compute_weights(train_df, 'feature_request')
|
||||||
|
aspect_weights = compute_weights(train_df, 'aspect')
|
||||||
|
aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment')
|
||||||
Reference in New Issue
Block a user