def __getitem__(self, idx):
    """Return the tokenised review and its labels for row ``idx``.

    The ``review`` cell of the row is passed to ``self.tokenizer`` with
    ``padding='max_length'``, ``truncation=True`` and
    ``max_length=self.max_length``; the four label cells of the same row
    are wrapped with ``torch.tensor``.

    Args:
        idx: Positional row index into ``self.df`` (used with ``.iloc``).

    Returns:
        dict with the following keys, in this order:

        * ``'input_ids'``, ``'attention_mask'`` -- the tokenizer outputs
          with their leading dimension removed by ``squeeze(0)``
          (presumably shape ``[max_length]`` for a single review string;
          confirm against the tokenizer in use).
        * ``'bug_report'``, ``'feature_request'``, ``'aspect'``,
          ``'aspect_sentiment'`` -- ``torch.tensor`` of the row's value in
          the column of the same name (a scalar tensor, assuming the
          cells hold plain numbers).
    """
    # Look the row up once instead of repeating ``self.df.iloc[idx]`` for
    # the review and again for every label column.
    row = self.df.iloc[idx]

    encoding = self.tokenizer(
        row['review'],
        max_length=self.max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # return_tensors='pt' gives each output a leading batch dimension of 1;
    # squeeze(0) drops it so the item is a single, unbatched sample.
    item = {
        'input_ids': encoding['input_ids'].squeeze(0),
        'attention_mask': encoding['attention_mask'].squeeze(0),
    }

    # Label columns are emitted in a fixed order so the dict layout stays
    # exactly as before.
    for column in ('bug_report', 'feature_request', 'aspect', 'aspect_sentiment'):
        item[column] = torch.tensor(row[column])

    return item
+# print(uber.__getitem__(1)) diff --git a/src/model.py b/src/model.py new file mode 100644 index 0000000..e69de29