Chose max_length = 256 tokens after measuring how many reviews, and which kinds of reviews, would be truncated at 128 and at 256 tokens

2026-02-19 01:28:10 +00:00
parent 0be7da2dde
commit c5e91b79b2
1 changed files with 177 additions and 0 deletions
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -1005,6 +1005,183 @@
    "    df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False)    # 6 separate files are saved"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "0e469c90",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4dc1bbaab74f4de3a17640dfdbe18c59",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "838a239984484c589a333d4b7266233b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b3958943bb1e44bf816895b71fba43fa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4ea21d8b6a8043e5b571d260a825269e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Boosted\n",
      "  Max tokens:    471\n",
      "  Mean tokens:   49.8\n",
      "  Over 128:      180 (3.6%)\n",
      "  Over 256:      21 (0.4%)\n",
      "\n",
      "Original\n",
      "  Max tokens:    327\n",
      "  Mean tokens:   29.3\n",
      "  Over 128:      58 (1.2%)\n",
      "  Over 256:      5 (0.1%)\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Tokenizer used to measure every review's length in tokens.\n",
    "tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')\n",
    "\n",
    "# For each dataset, report the max/mean token length and how many reviews exceed each candidate max_length.\n",
    "for df, name in ((tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')):\n",
    "    lengths = df['review'].apply(lambda text: len(tokenizer.encode(text)))\n",
    "    print(f\"\\n{name}\")\n",
    "    print(f\"  Max tokens:    {lengths.max()}\")\n",
    "    print(f\"  Mean tokens:   {lengths.mean():.1f}\")\n",
    "    for limit in (128, 256):\n",
    "        too_long = lengths > limit    # boolean mask: True where a review has more tokens than the limit\n",
    "        print(f\"  Over {limit}:      {too_long.sum()} ({too_long.mean():.1%})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad011e5e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Boosted\n",
      "  Max tokens:    471\n",
      "  Mean tokens:   49.8\n",
      "\n",
      "  Over 128: 180 (3.6%)\n",
      "    bug_report=1:      68 (37.8% of truncated)\n",
      "    feature_request=1: 17 (9.4% of truncated)\n",
      "    aspect counts:     {1: 57, 0: 47, 5: 36, 3: 26, 4: 12, 2: 2}\n",
      "    sentiment counts:  {2: 165, 0: 15}\n",
      "\n",
      "  Over 256: 21 (0.4%)\n",
      "    bug_report=1:      6 (28.6% of truncated)\n",
      "    feature_request=1: 1 (4.8% of truncated)\n",
      "    aspect counts:     {1: 6, 5: 6, 3: 5, 0: 4}\n",
      "    sentiment counts:  {2: 21}\n",
      "\n",
      "Original\n",
      "  Max tokens:    327\n",
      "  Mean tokens:   29.3\n",
      "\n",
      "  Over 128: 58 (1.2%)\n",
      "    bug_report=1:      24 (41.4% of truncated)\n",
      "    feature_request=1: 29 (50.0% of truncated)\n",
      "    aspect counts:     {0: 25, 5: 13, 1: 8, 2: 5, 4: 4, 3: 3}\n",
      "    sentiment counts:  {2: 45, 0: 10, 1: 3}\n",
      "\n",
      "  Over 256: 5 (0.1%)\n",
      "    bug_report=1:      2 (40.0% of truncated)\n",
      "    feature_request=1: 2 (40.0% of truncated)\n",
      "    aspect counts:     {0: 2, 4: 1, 1: 1, 5: 1}\n",
      "    sentiment counts:  {2: 5}\n"
     ]
    }
   ],
   "source": [
    "# Check whether the longer reviews are more likely to be bug reports or feature requests, or to carry particular aspects/sentiments.\n",
    "# Knowing this lets us choose max_length for the model on evidence, which should improve the quality of the model.\n",
    "# A larger max_length means a longer training time, but that cost is worth paying: truncating would remove valuable information from\n",
    "# the longer reviews, which are mostly bug reports and feature requests with negative sentiment (the most important ones to classify correctly).\n",
    "for df, name in ((tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')):\n",
    "    # Token count of every review, then one boolean mask per candidate max_length.\n",
    "    lengths = df['review'].apply(lambda text: len(tokenizer.encode(text)))\n",
    "    mask_128, mask_256 = lengths > 128, lengths > 256\n",
    "\n",
    "    print(f\"\\n{name}\")\n",
    "    print(f\"  Max tokens:    {lengths.max()}\")\n",
    "    print(f\"  Mean tokens:   {lengths.mean():.1f}\")\n",
    "\n",
    "    # Profile the labels of the reviews that are longer than each limit.\n",
    "    for mask, label in ((mask_128, 'Over 128'), (mask_256, 'Over 256')):\n",
    "        truncated = df[mask]\n",
    "        bug_flags = truncated['bug_report']\n",
    "        feature_flags = truncated['feature_request']\n",
    "        aspect_counts = truncated['aspect'].value_counts().to_dict()\n",
    "        sentiment_counts = truncated['aspect_sentiment'].value_counts().to_dict()\n",
    "\n",
    "        print(f\"\\n  {label}: {mask.sum()} ({mask.mean():.1%})\")\n",
    "        print(f\"    bug_report=1:      {bug_flags.sum()} ({bug_flags.mean():.1%} of truncated)\")\n",
    "        print(f\"    feature_request=1: {feature_flags.sum()} ({feature_flags.mean():.1%} of truncated)\")\n",
    "        print(f\"    aspect counts:     {aspect_counts}\")\n",
    "        print(f\"    sentiment counts:  {sentiment_counts}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,