89 lines
3.9 KiB
Python
89 lines
3.9 KiB
Python
# dataset.py
# Tokenizes data using XLM-RoBERTa's (SentencePiece) tokenizer.
# Takes a row from the CSV, tokenizes the review, and returns a dictionary of tensors.
|
|
import torch
|
|
import pandas as pd
|
|
from torch.utils.data import Dataset
|
|
from transformers import AutoTokenizer
|
|
|
|
class ReviewDataset(Dataset):
    """Map-style PyTorch Dataset that tokenizes labelled reviews on access.

    ``Dataset`` is used for map-style (indexable) data like this CSV, as
    opposed to ``IterableDataset``, which is better suited to data streams.
    Expects a CSV with a ``review`` text column plus one integer label column
    per task, and returns a dictionary of input tensors and integer labels
    for all 4 tasks.

    Args:
        path (str): Path to the CSV file containing the reviews and labels.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to encode
            the reviews. It is called with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors='pt'``.
        max_length (int, optional): Maximum length for tokenized sequences.
            Defaults to 256 (author's note: 128 would have dropped about half
            of the minority classes).
    """

    # Label columns read from each row; every one becomes a scalar long tensor.
    _LABEL_COLUMNS = ('bug_report', 'feature_request', 'aspect', 'aspect_sentiment')

    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (reviews) in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx`` and attach its labels.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch dimension squeezed away) followed by
            ``'bug_report'``, ``'feature_request'``, ``'aspect'`` and
            ``'aspect_sentiment'`` as scalar ``torch.long`` tensors.
        """
        # Fetch the row once instead of re-indexing the frame for every column.
        row = self.df.iloc[idx]

        # A missing or blank review is replaced by a single space so the
        # tokenizer always receives a string (same policy as InferenceDataset).
        review = row['review']
        if pd.isna(review) or not str(review).strip():
            review = ' '
        else:
            review = str(review)

        # return_tensors='pt' yields tensors of shape [1, max_length];
        # squeeze(0) drops the batch dimension so each is [max_length].
        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
        # Multi-task labels as tensor scalars, in a fixed column order.
        for column in self._LABEL_COLUMNS:
            item[column] = torch.tensor(row[column], dtype=torch.long)
        return item
|
|
class InferenceDataset(Dataset):
    """Map-style PyTorch Dataset that tokenizes unlabelled text for inference.

    Reads a CSV and tokenizes the text found in ``text_column``. Rows whose
    text is missing or blank are kept (so outputs stay aligned with the CSV
    rows) and are tokenized as a single space.

    Args:
        path (str): Path to the CSV file to read.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors='pt'``.
        text_column (str): Name of the column holding the text to tokenize.
        max_length (int, optional): Maximum length for tokenized sequences.
            Defaults to 256.
    """

    def __init__(self, path, tokenizer, text_column, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'``, each with the
            tokenizer's leading batch dimension squeezed away.
        """
        raw = self.df.iloc[idx][self.text_column]

        # All reviews are kept, so missing values do occur. Detect them with
        # pd.isna rather than comparing str(value) to 'nan': that comparison
        # misses None / pd.NA (stringified as 'None' / '<NA>') and would
        # tokenize those markers as if they were review text.
        if pd.isna(raw) or not str(raw).strip():
            review = ' '
        else:
            review = str(raw)

        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load the training CSV with the XLM-RoBERTa tokenizer
    # and print the tokenized sample at index 1.
    xlmr_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    train_set = ReviewDataset("data/processed/original_train.csv", xlmr_tokenizer)
    print(train_set[1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|