From c5e91b79b29f7746054967dbe915899397e9ebb6 Mon Sep 17 00:00:00 2001 From: charlie-rasberry Date: Thu, 19 Feb 2026 01:28:10 +0000 Subject: [PATCH] Decided on max_length by finding out how many and which reviews would be truncated (it will be 256 tokens) --- notebooks/preprocessing_tagged.ipynb | 177 +++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb index af33f70..25ea8ed 100644 --- a/notebooks/preprocessing_tagged.ipynb +++ b/notebooks/preprocessing_tagged.ipynb @@ -1005,6 +1005,183 @@ " df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 seperate files are saved" ] }, + { + "cell_type": "code", + "execution_count": 72, + "id": "0e469c90", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4dc1bbaab74f4de3a17640dfdbe18c59", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/615 [00:00 128).sum()} ({(lengths > 128).mean():.1%})\")\n", + " print(f\" Over 256: {(lengths > 256).sum()} ({(lengths > 256).mean():.1%})\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad011e5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Boosted\n", + " Max tokens: 471\n", + " Mean tokens: 49.8\n", + "\n", + " Over 128: 180 (3.6%)\n", + " bug_report=1: 68 (37.8% of truncated)\n", + " feature_request=1: 17 (9.4% of truncated)\n", + " aspect counts: {1: 57, 0: 47, 5: 36, 3: 26, 4: 12, 2: 2}\n", + " sentiment counts: {2: 165, 0: 15}\n", + "\n", + " Over 256: 21 (0.4%)\n", + " bug_report=1: 6 (28.6% of truncated)\n", + " feature_request=1: 1 (4.8% of truncated)\n", + " aspect counts: {1: 
6, 5: 6, 3: 5, 0: 4}\n", + " sentiment counts: {2: 21}\n", + "\n", + "Original\n", + " Max tokens: 327\n", + " Mean tokens: 29.3\n", + "\n", + " Over 128: 58 (1.2%)\n", + " bug_report=1: 24 (41.4% of truncated)\n", + " feature_request=1: 29 (50.0% of truncated)\n", + " aspect counts: {0: 25, 5: 13, 1: 8, 2: 5, 4: 4, 3: 3}\n", + " sentiment counts: {2: 45, 0: 10, 1: 3}\n", + "\n", + " Over 256: 5 (0.1%)\n", + " bug_report=1: 2 (40.0% of truncated)\n", + " feature_request=1: 2 (40.0% of truncated)\n", + " aspect counts: {0: 2, 4: 1, 1: 1, 5: 1}\n", + " sentiment counts: {2: 5}\n" + ] + } + ], + "source": [ + "# Let's also check whether the longer reviews are more likely to be bug reports, feature requests, or to have certain aspects/sentiments.\n", + "# This finding lets us make a better-informed decision on max_length for the model, which should improve the quality of the model.\n", + "# Training will take longer, but that is preferable to removing valuable information from longer reviews, which are mostly bug reports and feature requests\n", + "# with negative sentiment (the most important ones to classify correctly).\n", + "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n", + " lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n", + " mask_128 = lengths > 128\n", + " mask_256 = lengths > 256\n", + " \n", + " print(f\"\\n{name}\")\n", + " print(f\" Max tokens: {lengths.max()}\")\n", + " print(f\" Mean tokens: {lengths.mean():.1f}\")\n", + " \n", + " for mask, label in [(mask_128, 'Over 128'), (mask_256, 'Over 256')]:\n", + " truncated = df[mask]\n", + " print(f\"\\n {label}: {mask.sum()} ({mask.mean():.1%})\")\n", + " print(f\" bug_report=1: {truncated['bug_report'].sum()} ({truncated['bug_report'].mean():.1%} of truncated)\")\n", + " print(f\" feature_request=1: {truncated['feature_request'].sum()} ({truncated['feature_request'].mean():.1%} of truncated)\")\n", + " print(f\" aspect counts: 
{truncated['aspect'].value_counts().to_dict()}\")\n", + " print(f\" sentiment counts: {truncated['aspect_sentiment'].value_counts().to_dict()}\")" + ] + }, { "cell_type": "code", "execution_count": null,