# ReClass/src/dataset.py

# dataset.py
# Tokenizes data using XLM-RoBERTa's (SentencePiece) tokenizer.
# Takes a row from the CSV, tokenizes the review and returns a dictionary of tensors.
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class ReviewDataset(Dataset):
    """PyTorch Dataset for loading tokenized reviews together with their labels.

    This is a map-style ``Dataset`` (indexable, with a known length) rather than
    an ``IterableDataset``, which is better suited to data streams. It reads a
    CSV file and, per row, tokenizes the ``review`` column with the supplied
    tokenizer, returning a dictionary of input tensors and integer labels for
    all 4 tasks.

    Args:
        path (str): Path to the CSV file containing the reviews and labels.
            The file must provide a ``review`` column and every column named
            in ``LABEL_COLUMNS``.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to encode
            the reviews.
        max_length (int, optional): Length every tokenized sequence is padded
            or truncated to. Defaults to 256; the original author notes that
            128 would have dropped about half of the minority classes.
    """

    # Label columns read from every row, one per task of the multi-task setup.
    LABEL_COLUMNS = ('bug_report', 'feature_request', 'aspect', 'aspect_sentiment')

    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx`` and collect its labels.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``[max_length]``, plus one ``torch.long`` scalar tensor for each
            column in ``LABEL_COLUMNS``.
        """
        # Fetch the row once; every ``self.df.iloc[idx]`` call builds a new Series.
        row = self.df.iloc[idx]

        # Missing or blank reviews are replaced by a single space so the
        # tokenizer always receives a string (same policy as InferenceDataset).
        raw_review = row['review']
        review = '' if pd.isna(raw_review) else str(raw_review)
        if not review.strip():
            review = ' '

        # With return_tensors='pt' both tensors come back with shape
        # [1, max_length]; squeeze(0) below drops that leading batch dimension.
        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            # 1s mark real tokens, 0s mark padding.
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        # MTL structure: each label is returned as a scalar long tensor.
        for column in self.LABEL_COLUMNS:
            item[column] = torch.tensor(row[column], dtype=torch.long)
        return item
class InferenceDataset(Dataset):
    """PyTorch Dataset that tokenizes unlabeled text for inference.

    Reads a CSV file and, per row, tokenizes the value found in
    ``text_column``. No labels are returned.

    Args:
        path (str): Path to the CSV file containing the texts.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to encode
            the texts.
        text_column (str): Name of the CSV column holding the text to encode.
        max_length (int, optional): Length every tokenized sequence is padded
            or truncated to. Defaults to 256.
    """

    def __init__(self, path, tokenizer, text_column, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors with the
            leading batch dimension removed.
        """
        # All rows are kept, so the cell may be missing. pd.isna recognises
        # NaN, None and pd.NA, whereas comparing str(value) to 'nan' only
        # catches float NaN and would tokenize 'None' / '<NA>' as real text.
        raw_text = self.df.iloc[idx][self.text_column]
        review = '' if pd.isna(raw_text) else str(raw_text)
        if not review.strip():
            # Give the tokenizer a minimal non-empty string for blank rows.
            review = ' '

        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }


if __name__ == "__main__":
    # Manual smoke test: load the XLM-RoBERTa tokenizer, build the training
    # dataset and print the encoded example at index 1.
    xlmr_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    dataset = ReviewDataset("data/processed/original_train.csv", xlmr_tokenizer)
    print(dataset[1])