diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb index 25ea8ed..a1675ee 100644 --- a/notebooks/preprocessing_tagged.ipynb +++ b/notebooks/preprocessing_tagged.ipynb @@ -1161,9 +1161,11 @@ ], "source": [ "# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n", + "\n", "# This finding allows us to make a better decision on the max_length for the model, which will increase the quality of the model,\n", "# the time to train will be longer but it is not worth removing valuable information from longer reviews which are mostly bug reports and feature requests \n", "# with negative sentiment (the most important ones to classify correctly)\n", + "\n", "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n", " lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n", " mask_128 = lengths > 128\n", diff --git a/src/dataset.py b/src/dataset.py index 43b3b1e..ef35511 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,7 +1,6 @@ # dataset.py # Takes a row from the csv, tokenizes the review and returns a tensor - import torch import pandas as pd from torch.utils.data import Dataset