From c5e91b79b29f7746054967dbe915899397e9ebb6 Mon Sep 17 00:00:00 2001
From: charlie-rasberry <charlie.rasberry@outlook.com>
Date: Thu, 19 Feb 2026 01:28:10 +0000
Subject: [PATCH] Decided on max_length by finding out how many and which
 reviews would be truncated (it will be 256 tokens)

---
 notebooks/preprocessing_tagged.ipynb | 177 +++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb
index af33f70..25ea8ed 100644
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -1005,6 +1005,183 @@
     "    df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False)    # 6 seperate files are saved"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "0e469c90",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4dc1bbaab74f4de3a17640dfdbe18c59",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "838a239984484c589a333d4b7266233b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b3958943bb1e44bf816895b71fba43fa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4ea21d8b6a8043e5b571d260a825269e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Boosted\n",
+      "  Max tokens:    471\n",
+      "  Mean tokens:   49.8\n",
+      "  Over 128:      180 (3.6%)\n",
+      "  Over 256:      21 (0.4%)\n",
+      "\n",
+      "Original\n",
+      "  Max tokens:    327\n",
+      "  Mean tokens:   29.3\n",
+      "  Over 128:      58 (1.2%)\n",
+      "  Over 256:      5 (0.1%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')\n",
+    "\n",
+    "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
+    "    lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
+    "    print(f\"\\n{name}\")\n",
+    "    print(f\"  Max tokens:    {lengths.max()}\")\n",
+    "    print(f\"  Mean tokens:   {lengths.mean():.1f}\")\n",
+    "    print(f\"  Over 128:      {(lengths > 128).sum()} ({(lengths > 128).mean():.1%})\")\n",
+    "    print(f\"  Over 256:      {(lengths > 256).sum()} ({(lengths > 256).mean():.1%})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad011e5e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Boosted\n",
+      "  Max tokens:    471\n",
+      "  Mean tokens:   49.8\n",
+      "\n",
+      "  Over 128: 180 (3.6%)\n",
+      "    bug_report=1:      68 (37.8% of truncated)\n",
+      "    feature_request=1: 17 (9.4% of truncated)\n",
+      "    aspect counts:     {1: 57, 0: 47, 5: 36, 3: 26, 4: 12, 2: 2}\n",
+      "    sentiment counts:  {2: 165, 0: 15}\n",
+      "\n",
+      "  Over 256: 21 (0.4%)\n",
+      "    bug_report=1:      6 (28.6% of truncated)\n",
+      "    feature_request=1: 1 (4.8% of truncated)\n",
+      "    aspect counts:     {1: 6, 5: 6, 3: 5, 0: 4}\n",
+      "    sentiment counts:  {2: 21}\n",
+      "\n",
+      "Original\n",
+      "  Max tokens:    327\n",
+      "  Mean tokens:   29.3\n",
+      "\n",
+      "  Over 128: 58 (1.2%)\n",
+      "    bug_report=1:      24 (41.4% of truncated)\n",
+      "    feature_request=1: 29 (50.0% of truncated)\n",
+      "    aspect counts:     {0: 25, 5: 13, 1: 8, 2: 5, 4: 4, 3: 3}\n",
+      "    sentiment counts:  {2: 45, 0: 10, 1: 3}\n",
+      "\n",
+      "  Over 256: 5 (0.1%)\n",
+      "    bug_report=1:      2 (40.0% of truncated)\n",
+      "    feature_request=1: 2 (40.0% of truncated)\n",
+      "    aspect counts:     {0: 2, 4: 1, 1: 1, 5: 1}\n",
+      "    sentiment counts:  {2: 5}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check whether the longer reviews are more likely to be bug reports or feature requests, or to have particular aspects/sentiments.\n",
+    "# This finding lets us make a better-informed decision on max_length for the model, which should improve the quality of the model.\n",
+    "# Training will take longer, but that is preferable to truncating valuable information from the longer reviews, many of which are bug reports and feature requests\n",
+    "# with negative sentiment (the most important ones to classify correctly).\n",
+    "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
+    "    lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
+    "    mask_128 = lengths > 128\n",
+    "    mask_256 = lengths > 256\n",
+    "    \n",
+    "    print(f\"\\n{name}\")\n",
+    "    print(f\"  Max tokens:    {lengths.max()}\")\n",
+    "    print(f\"  Mean tokens:   {lengths.mean():.1f}\")\n",
+    "    \n",
+    "    for mask, label in [(mask_128, 'Over 128'), (mask_256, 'Over 256')]:\n",
+    "        truncated = df[mask]\n",
+    "        print(f\"\\n  {label}: {mask.sum()} ({mask.mean():.1%})\")\n",
+    "        print(f\"    bug_report=1:      {truncated['bug_report'].sum()} ({truncated['bug_report'].mean():.1%} of truncated)\")\n",
+    "        print(f\"    feature_request=1: {truncated['feature_request'].sum()} ({truncated['feature_request'].mean():.1%} of truncated)\")\n",
+    "        print(f\"    aspect counts:     {truncated['aspect'].value_counts().to_dict()}\")\n",
+    "        print(f\"    sentiment counts:  {truncated['aspect_sentiment'].value_counts().to_dict()}\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,