# dataset.py
# Tokenize data using XLM-RoBERTa's (SentencePiece) tokenizer.
# Takes a row from the csv, tokenizes the review and returns tensors.
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer


def _tokenize(tokenizer, text, max_length):
    """Tokenize one text into fixed-length model inputs.

    Args:
        tokenizer: Callable tokenizer (e.g. a transformers tokenizer) that
            accepts ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        text (str): Text to encode.
        max_length (int): Length every sequence is padded/truncated to.

    Returns:
        dict: ``'input_ids'`` and ``'attention_mask'`` tensors with the
        leading batch dimension removed.
    """
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # return_tensors='pt' yields tensors of shape [1, max_length];
    # squeeze(0) drops the batch dimension so each item is [max_length].
    return {
        'input_ids': encoding['input_ids'].squeeze(0),
        'attention_mask': encoding['attention_mask'].squeeze(0),
    }


class ReviewDataset(Dataset):
    """PyTorch Dataset for loading tokenized reviews with their labels.

    Dataset is for map-style datasets like here, instead of using
    IterableDataset (better for data streams). Expects a csv and tokenizes
    reviews using XLM-RoBERTa, returning a dictionary of input tensors and
    integer labels for all 4 tasks.

    Missing reviews are not coerced here: the raw cell value is passed to
    the tokenizer unchanged.

    Args:
        path (str): Path to the csv file containing the reviews and labels.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for
            encoding the reviews.
        max_length (int, optional): Maximum length for tokenized sequences.
            Defaults to 256. 128 would have dropped about half of minority
            classes.
    """

    # Label columns of the multi-task setup, in the order they are emitted.
    LABEL_COLUMNS = (
        'bug_report',
        'feature_request',
        'aspect',
        'aspect_sentiment',
    )

    def __init__(self, path: str, tokenizer, max_length: int = 256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of rows in the csv."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """Return the tokenized review and its labels for row ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors, followed
            by one ``torch.long`` scalar tensor per entry of
            ``LABEL_COLUMNS``.
        """
        # Fetch the row once instead of re-indexing the frame per column.
        row = self.df.iloc[idx]
        item = _tokenize(self.tokenizer, row['review'], self.max_length)
        for column in self.LABEL_COLUMNS:
            item[column] = torch.tensor(row[column], dtype=torch.long)
        return item


class InferenceDataset(Dataset):
    """PyTorch Dataset for tokenizing unlabeled text for inference.

    Rows with a missing or blank text are kept and encoded as a single
    space, so the dataset length always matches the csv.

    Args:
        path (str): Path to the csv file containing the texts.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for
            encoding the texts.
        text_column (str): Name of the csv column holding the text.
        max_length (int, optional): Maximum length for tokenized sequences.
            Defaults to 256.
    """

    def __init__(self, path: str, tokenizer, text_column: str,
                 max_length: int = 256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of rows in the csv."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """Return the tokenized text for row ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors.
        """
        value = self.df.iloc[idx][self.text_column]
        # All rows are kept, so missing values can occur; test for them
        # explicitly rather than comparing the stringified value to 'nan'.
        text = '' if pd.isna(value) else str(value)
        if not text.strip():
            text = ' '
        return _tokenize(self.tokenizer, text, self.max_length)


if __name__ == "__main__":
    dataset = ReviewDataset(
        "data/processed/original_train.csv",
        AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"),
    )
    print(dataset[1])