Added some comments and readability

2026-03-24 18:11:31 +00:00
parent afe61eaaa2
commit 753723694b
5 changed files with 103 additions and 84 deletions
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,5 +1,5 @@
 # dataset.py
-
+# tokenize data using (sentencepiece) XLM-RoBERTa
 # Takes a row from the csv, tokenizes the review and returns a tensor
 import torch
 import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
 from transformers import AutoTokenizer

 class ReviewDataset(Dataset):
+    """PyTorch Dataset for loading tokenized reviews.
+
+    Dataset is for map-style datasets like this one, as opposed to IterableDataset (better suited to data streams).
+    Expects a CSV and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
+    input tensors and integer labels for all 4 tasks.
+
+    Args:
+        path (str): Path to the csv file containing the reviews and labels.
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
+        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. A limit of 128 would have dropped about half of the minority classes.
+    """
+
    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
        # Both have shape [1, max_length] because of return_tensors='pt'
        # Squeeze them to [max_length] with .squeeze(0)
-        encoding = self.tokenizer(
-                review,
-                max_length=self.max_length,
-                padding='max_length',
-                truncation=True,
-                return_tensors='pt'
-            )
+        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        
        # Returns a dictionary with:
        #   'input_ids': tensor of shape [max_length]