Added some comments and readability
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# dataset.py
|
||||
|
||||
# tokenize data using (sentencepiece) XLM-RoBERTa
|
||||
# Takes a row from the csv, tokenizes the review and returns a tensor
|
||||
import torch
|
||||
import pandas as pd
|
||||
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
class ReviewDataset(Dataset):
|
||||
"""Pytorch Dataset for loading tokenized reviews
|
||||
|
||||
Dataset is used for map-style datasets like this one, as opposed to IterableDataset (better suited to data streams).
|
||||
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
|
||||
input tensors and integer labels for all 4 tasks.
|
||||
|
||||
Args:
|
||||
path (str): Path to the csv file containing the reviews and labels.
|
||||
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
|
||||
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. A max_length of 128 would have dropped about half of the minority classes.
|
||||
"""
|
||||
|
||||
def __init__(self, path, tokenizer, max_length=256):
|
||||
self.df = pd.read_csv(path)
|
||||
self.tokenizer = tokenizer
|
||||
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
|
||||
# encoding['attention_mask']: 1D tensor of 1s and 0s marking real tokens vs padding, shape [max_length]
|
||||
# Both have shape [1, max_length] because of return_tensors='pt'
|
||||
# Squeeze them to [max_length] with .squeeze(0)
|
||||
encoding = self.tokenizer(
|
||||
review,
|
||||
max_length=self.max_length,
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
return_tensors='pt'
|
||||
)
|
||||
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||
|
||||
# Returns a dictionary with:
|
||||
# 'input_ids': tensor of shape [max_length]
|
||||
|
||||
Reference in New Issue
Block a user