From 19c0d4bce389ec402e307003476784e813fba5e4 Mon Sep 17 00:00:00 2001
From: charlie-rasberry <charlie.rasberry@outlook.com>
Date: Thu, 19 Feb 2026 18:45:55 +0000
Subject: [PATCH] Started dataset.py: added the ReviewDataset class and
 implemented the __init__, __len__ and __getitem__ methods. The __getitem__
 method currently just returns the review text, but will be updated to return
 the tokenized review as a tensor.

---
 notebooks/preprocessing_tagged.ipynb | 2 ++
 src/dataset.py                       | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb
index 25ea8ed..a1675ee 100644
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -1161,9 +1161,11 @@
    ],
    "source": [
     "# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n",
+    "\n",
     "# This finding allows us to make a better decision on the max_length for the model, which will increase the quality of the model,\n",
     "# the time to train will be longer but it is not worth removing valuable information from longer reviews which are mostly bug reports and feature requests \n",
     "# with negative sentiment (the most important ones to classify correctly)\n",
+    "\n",
     "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
     "    lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
     "    mask_128 = lengths > 128\n",
diff --git a/src/dataset.py b/src/dataset.py
index 43b3b1e..ef35511 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,7 +1,6 @@
 # dataset.py
 
 # Takes a row from the csv, tokenizes the review and returns a tensor
-
 import torch
 import pandas as pd
 from torch.utils.data import Dataset