Implemented dataset.py which tokenises and returns tensors, ready to load the model now

2026-02-19 22:10:25 +00:00
parent 19c0d4bce3
commit 61df4e3e26
2 changed files with 32 additions and 3 deletions
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -17,10 +17,39 @@ class ReviewDataset(Dataset):
    
    def __getitem__(self, idx):
        review = self.df.iloc[idx]['review']
-        return review

-uber = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
-print(uber.__getitem__(1))
+        # encoding['input_ids'] 
+        # encoding['attention_mask']
+        # Both have shape [1, max_length] because of return_tensors='pt'
+        # Squeeze them to [max_length] with .squeeze(0)
+        encoding = self.tokenizer(
+                review,
+                max_length=self.max_length,
+                padding='max_length',
+                truncation=True,
+                return_tensors='pt'
+            )
+        
+        # Returns a dictionary with:
+        #   'input_ids': tensor of shape [max_length]
+        
+        #   'attention_mask': tensor of shape [max_length]
+
+        #   'bug_report': tensor scalar (torch.tensor(label_value))
+        #   'feature_request': tensor scalar (torch.tensor(label_value))
+        #   'aspect': tensor scalar (torch.tensor(label_value))
+        #   'aspect_sentiment': tensor scalar (torch.tensor(label_value))
+        return {
+            'input_ids': encoding['input_ids'].squeeze(0),
+            'attention_mask': encoding['attention_mask'].squeeze(0),
+            'bug_report': torch.tensor(self.df.iloc[idx]['bug_report']),
+            'feature_request': torch.tensor(self.df.iloc[idx]['feature_request']),
+            'aspect': torch.tensor(self.df.iloc[idx]['aspect']),
+            'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'])
+        }
+
+# uber = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
+# print(uber.__getitem__(1))

    

--- a/src/model.py
+++ b/src/model.py