Started dataset.py: added the ReviewDataset class and implemented its __init__, __len__, and __getitem__ methods. The __getitem__ method currently just returns the raw review text; it will be updated to return the tokenized review as a tensor.

2026-02-19 18:45:55 +00:00
parent 19bcf2aa18
commit 19c0d4bce3
2 changed files with 2 additions and 1 deletions
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -1161,9 +1161,11 @@
   ],
   "source": [
    "# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n",
+    "\n",
    "# This finding allows us to make a better decision on the max_length for the model, which will increase the quality of the model,\n",
    "# the time to train will be longer but it is not worth removing valuable information from longer reviews which are mostly bug reports and feature requests \n",
    "# with negative sentiment (the most important ones to classify correctly)\n",
+    "\n",
    "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
    "    lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
    "    mask_128 = lengths > 128\n",
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,7 +1,6 @@
 # dataset.py

 # Takes a row from the csv, tokenizes the review and returns a tensor
-
 import torch
 import pandas as pd
 from torch.utils.data import Dataset