diff --git a/src/__pycache__/model.cpython-313.pyc b/src/__pycache__/model.cpython-313.pyc new file mode 100644 index 0000000..2f87f93 Binary files /dev/null and b/src/__pycache__/model.cpython-313.pyc differ diff --git a/src/dataset.py b/src/dataset.py index 2be5173..379478e 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -43,14 +43,16 @@ class ReviewDataset(Dataset): return { 'input_ids': encoding['input_ids'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0), - 'bug_report': torch.tensor(self.df.iloc[idx]['bug_report']), - 'feature_request': torch.tensor(self.df.iloc[idx]['feature_request']), - 'aspect': torch.tensor(self.df.iloc[idx]['aspect']), - 'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment']) + 'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long), + 'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long), + 'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long), + 'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'], dtype=torch.long) } + +if __name__ == "__main__": + dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")) + print(dataset.__getitem__(1)) -# uber = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")) -# print(uber.__getitem__(1)) diff --git a/src/train.py b/src/train.py index 46a59f7..e7734d6 100644 --- a/src/train.py +++ b/src/train.py @@ -1,7 +1,69 @@ -#train.py - +# train.py +import torch +from sklearn.utils.class_weight import compute_class_weight +import numpy as np +import torch.nn as nn +from torch.utils.data import DataLoader from transformers import AutoTokenizer +import pandas as pd + +from dataset import ReviewDataset +from model import Model + +# class weights, training loop and early stopping +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + 
+tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
+
+train = "data/processed/original_train.csv"
+val = "data/processed/original_val.csv"
+train_dataset = ReviewDataset(train, tokenizer)
+val_dataset = ReviewDataset(val, tokenizer)
+train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
+
+model = Model().to(device)
 
 
-class multiTaskModel():
-    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
\ No newline at end of file
+# move input_ids, attention_mask and labels to device in each batch
+
+# ------------------- Class weights -------------------
+# Using weights inversely proportional to class frequencies to avoid majority class bias,
+# prioritize useful bug reports / feature requests
+def compute_weights(train_df: pd.DataFrame, column: str) -> torch.Tensor:
+    """Return 'balanced' class weights for ``column`` as a float tensor.
+
+    Entry i weights the i-th smallest label present (np.unique sorts), so
+    it matches class index i only if labels are 0..K-1 -- TODO confirm.
+    """
+    classes = np.unique(train_df[column])
+    weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df[column])
+    return torch.tensor(weights, dtype=torch.float, device=device)
+
+
+# -------------------- Loss functions -------------------
+# just a later idea
+# 1.0 * bug_loss +
+# 1.0 * feature_loss +
+# 0.5 * aspect_loss +
+# 0.5 * sentiment_loss
+
+
+# -------------------- Optimizer and scheduler -------------------
+
+
+# ------------------- Training loop -------------------
+# For each epoch:
+
+
+# ------------------- Stopping logic -------------------
+# After each epoch, find mean of 4 macro f1 scores
+# If there is no improvement for 3 epochs consecutively, stop training
+# Prevents overfitting which saves time and resources
+
+
+train_df = pd.read_csv(train)
+bug_weights = compute_weights(train_df, 'bug_report')
+feature_weights = compute_weights(train_df, 'feature_request')
+aspect_weights = compute_weights(train_df, 'aspect')
+aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment')