Files
ReClass/src/dataset.py

89 lines
3.9 KiB
Python

# dataset.py
# Tokenizes review text with XLM-RoBERTa's (SentencePiece) tokenizer.
# Takes a row from the CSV, tokenizes the review and returns a dictionary of tensors.
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer
class ReviewDataset(Dataset):
    """Map-style PyTorch dataset of tokenized reviews with multi-task labels.

    A map-style ``Dataset`` is used here rather than ``IterableDataset``
    (which is better suited to data streams). The CSV is read once in
    ``__init__``; each item is tokenized on access and returned as a
    dictionary of input tensors plus one integer label per task.

    Args:
        path (str): Path to the CSV file containing the reviews and labels.
            It must provide a ``review`` column and every column listed in
            ``LABEL_COLUMNS``.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to
            encode the reviews (XLM-RoBERTa in this project).
        max_length (int, optional): Length every sequence is padded or
            truncated to. Defaults to 256; per the original author's note,
            128 would have dropped about half of the minority classes.
    """

    # One label column per task, in the order the keys appear in each item.
    LABEL_COLUMNS = ('bug_report', 'feature_request', 'aspect', 'aspect_sentiment')

    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (reviews) in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx`` and attach its labels.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_length``) followed by one ``torch.long`` scalar
            tensor for each column in ``LABEL_COLUMNS``.
        """
        # Fetch the row once instead of re-indexing the frame for every column.
        row = self.df.iloc[idx]

        # Same missing-text handling as InferenceDataset: the tokenizer needs
        # a string, so missing or blank reviews become a single space.
        review = row['review']
        if pd.isna(review) or not str(review).strip():
            review = ' '
        else:
            review = str(review)

        # return_tensors='pt' yields tensors with a leading batch dimension,
        # i.e. shape [1, max_length]; squeeze(0) below drops it so the
        # DataLoader can stack items into [batch, max_length].
        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        # Multi-task labels are stored as scalar long tensors.
        for column in self.LABEL_COLUMNS:
            item[column] = torch.tensor(row[column], dtype=torch.long)
        return item
class InferenceDataset(Dataset):
    """Map-style PyTorch dataset of tokenized, unlabeled text for inference.

    Reads a CSV once in ``__init__`` and tokenizes one cell of
    ``text_column`` per item. Rows with missing or blank text are kept
    (so positions stay aligned with the CSV) and are tokenized as a
    single space.

    Args:
        path (str): Path to the CSV file to run inference on.
        tokenizer: Callable tokenizer used to encode the text; it is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        text_column (str): Name of the column holding the text.
        max_length (int, optional): Length every sequence is padded or
            truncated to. Defaults to 256.
    """

    def __init__(self, path, tokenizer, text_column, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` with the leading
            batch dimension removed by ``squeeze(0)``.
        """
        # Only the text column is needed, so read that one cell instead of
        # materializing the whole row.
        value = self.df[self.text_column].iloc[idx]

        # All rows are kept, so missing values must be handled here. Check
        # the raw value with pd.isna so that None / pd.NA are caught too
        # (str() would turn them into the literal text 'None' / '<NA>').
        # The 'nan' string comparison is kept for backward compatibility.
        review = ' ' if pd.isna(value) else str(value)
        if review == 'nan' or not review.strip():
            review = ' '

        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
if __name__ == "__main__":
    # Manual smoke test: load the training split with the XLM-RoBERTa
    # tokenizer and print the second example (index 1).
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    dataset = ReviewDataset("data/processed/original_train.csv", tokenizer)
    # Use subscript syntax rather than calling the dunder method directly.
    print(dataset[1])