From 19c0d4bce389ec402e307003476784e813fba5e4 Mon Sep 17 00:00:00 2001 From: charlie-rasberry Date: Thu, 19 Feb 2026 18:45:55 +0000 Subject: [PATCH] Started dataset.py, added the ReviewDataset class and implemented the __init__, __len__ and __getitem__ methods. The __getitem__ method currently just returns the review text, but will be updated to return the tokenized review as a tensor --- notebooks/preprocessing_tagged.ipynb | 2 ++ src/dataset.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb index 25ea8ed..a1675ee 100644 --- a/notebooks/preprocessing_tagged.ipynb +++ b/notebooks/preprocessing_tagged.ipynb @@ -1161,9 +1161,11 @@ ], "source": [ "# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n", + "\n", "# This finding allows us to make a better decision on the max_length for the model, which will increase the quality of the model,\n", "# the time to train will be longer but it is not worth removing valuable information from longer reviews which are mostly bug reports and feature requests \n", "# with negative sentiment (the most important ones to classify correctly)\n", + "\n", "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n", " lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n", " mask_128 = lengths > 128\n", diff --git a/src/dataset.py b/src/dataset.py index 43b3b1e..ef35511 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,7 +1,6 @@ # dataset.py # Takes a row from the csv, tokenizes the review and returns a tensor - import torch import pandas as pd from torch.utils.data import Dataset