Added some comments and readability

This commit is contained in:
2026-03-24 18:11:31 +00:00
parent afe61eaaa2
commit 753723694b
5 changed files with 103 additions and 84 deletions

View File

@@ -1,5 +1,5 @@
# dataset.py
# tokenize data using (sentencepiece) XLM-RoBERTa
# Takes a row from the csv, tokenizes the review and returns a tensor
import torch
import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
from transformers import AutoTokenizer
class ReviewDataset(Dataset):
"""PyTorch Dataset for loading tokenized reviews.
Dataset is the base class for map-style datasets like this one, as opposed to IterableDataset (better suited to data streams).
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
input tensors and integer labels for all 4 tasks.
Args:
path (str): Path to the csv file containing the reviews and labels.
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256; a value of 128 would have dropped about half of the minority classes.
"""
def __init__(self, path, tokenizer, max_length=256):
self.df = pd.read_csv(path)
self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
# encoding['attention_mask'] 1D tensor of 1s and 0s showing real tokens vs padding, shape [max_length]
# Both have shape [1, max_length] because of return_tensors='pt'
# Squeeze them to [max_length] with .squeeze(0)
encoding = self.tokenizer(
review,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
# Returns a dictionary with:
# 'input_ids': tensor of shape [max_length]