Started dataset.py, added the ReviewDataset class and implemented the __init__, __len__ and __getitem__ methods. The __getitem__ method currently just returns the review text, but will be updated to return the tokenized review as a tensor

This commit is contained in:
2026-02-19 18:45:55 +00:00
parent 19bcf2aa18
commit 19c0d4bce3
2 changed files with 2 additions and 1 deletions

View File

@@ -1161,9 +1161,11 @@
],
"source": [
"# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n",
"\n",
"# This finding allows us to make a better decision on the max_length for the model, which will increase the quality of the model,\n",
"# the time to train will be longer but it is not worth removing valuable information from longer reviews which are mostly bug reports and feature requests \n",
"# with negative sentiment (the most important ones to classify correctly)\n",
"\n",
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
" lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
" mask_128 = lengths > 128\n",

View File

@@ -1,7 +1,6 @@
# dataset.py
# Takes a row from the csv, tokenizes the review and returns a tensor
import torch
import pandas as pd
from torch.utils.data import Dataset