Chose max_length = 256 tokens after measuring how many reviews, and which kinds of reviews, would be truncated at 128 and at 256 tokens

This commit is contained in:
2026-02-19 01:28:10 +00:00
parent 0be7da2dde
commit c5e91b79b2

View File

@@ -1005,6 +1005,183 @@
" df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 seperate files are saved" " df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 seperate files are saved"
] ]
}, },
{
"cell_type": "code",
"execution_count": 72,
"id": "0e469c90",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4dc1bbaab74f4de3a17640dfdbe18c59",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/615 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "838a239984484c589a333d4b7266233b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/25.0 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b3958943bb1e44bf816895b71fba43fa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4ea21d8b6a8043e5b571d260a825269e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/9.10M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Boosted\n",
" Max tokens: 471\n",
" Mean tokens: 49.8\n",
" Over 128: 180 (3.6%)\n",
" Over 256: 21 (0.4%)\n",
"\n",
"Original\n",
" Max tokens: 327\n",
" Mean tokens: 29.3\n",
" Over 128: 58 (1.2%)\n",
" Over 256: 5 (0.1%)\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"# Load the XLM-RoBERTa tokenizer so review lengths are measured in model tokens rather than characters or words.\n",
"tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')\n",
"\n",
"# For each dataset, report the token-length distribution and how many reviews exceed each candidate max_length.\n",
"for df, name in ((tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')):\n",
"    # encode() is called with its defaults, one review at a time; the length of the returned id list is the token count.\n",
"    lengths = df['review'].apply(lambda text: len(tokenizer.encode(text)))\n",
"    print(f\"\\n{name}\")\n",
"    print(f\" Max tokens: {lengths.max()}\")\n",
"    print(f\" Mean tokens: {lengths.mean():.1f}\")\n",
"    # Same two candidate limits as before, reported in the same order.\n",
"    for limit in (128, 256):\n",
"        over_limit = lengths > limit\n",
"        print(f\" Over {limit}: {over_limit.sum()} ({over_limit.mean():.1%})\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad011e5e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Boosted\n",
" Max tokens: 471\n",
" Mean tokens: 49.8\n",
"\n",
" Over 128: 180 (3.6%)\n",
" bug_report=1: 68 (37.8% of truncated)\n",
" feature_request=1: 17 (9.4% of truncated)\n",
" aspect counts: {1: 57, 0: 47, 5: 36, 3: 26, 4: 12, 2: 2}\n",
" sentiment counts: {2: 165, 0: 15}\n",
"\n",
" Over 256: 21 (0.4%)\n",
" bug_report=1: 6 (28.6% of truncated)\n",
" feature_request=1: 1 (4.8% of truncated)\n",
" aspect counts: {1: 6, 5: 6, 3: 5, 0: 4}\n",
" sentiment counts: {2: 21}\n",
"\n",
"Original\n",
" Max tokens: 327\n",
" Mean tokens: 29.3\n",
"\n",
" Over 128: 58 (1.2%)\n",
" bug_report=1: 24 (41.4% of truncated)\n",
" feature_request=1: 29 (50.0% of truncated)\n",
" aspect counts: {0: 25, 5: 13, 1: 8, 2: 5, 4: 4, 3: 3}\n",
" sentiment counts: {2: 45, 0: 10, 1: 3}\n",
"\n",
" Over 256: 5 (0.1%)\n",
" bug_report=1: 2 (40.0% of truncated)\n",
" feature_request=1: 2 (40.0% of truncated)\n",
" aspect counts: {0: 2, 4: 1, 1: 1, 5: 1}\n",
" sentiment counts: {2: 5}\n"
]
}
],
"source": [
"# Profile the labels of the reviews that would be truncated at each candidate max_length:\n",
"# how many are bug reports or feature requests, and which aspect / sentiment values they carry.\n",
"# Rationale: if the long reviews carry a meaningful share of the bug reports and feature requests, truncating them\n",
"# throws away the information we most need to classify correctly, so a larger max_length is worth the longer training time.\n",
"# NOTE(review): the encoding of 'aspect' and 'aspect_sentiment' is not defined in this cell - confirm which\n",
"# sentiment code means \"negative\" before drawing conclusions from the counts printed below.\n",
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
"    # Token count per review, using the tokenizer created in the previous cell.\n",
"    lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
"    mask_128 = lengths > 128\n",
"    mask_256 = lengths > 256\n",
"\n",
"    print(f\"\\n{name}\")\n",
"    print(f\" Max tokens: {lengths.max()}\")\n",
"    print(f\" Mean tokens: {lengths.mean():.1f}\")\n",
"\n",
"    for mask, label in [(mask_128, 'Over 128'), (mask_256, 'Over 256')]:\n",
"        truncated = df[mask]  # rows whose review is longer than this limit\n",
"        print(f\"\\n {label}: {mask.sum()} ({mask.mean():.1%})\")\n",
"        if truncated.empty:\n",
"            # The mean of an empty column is NaN; without this guard the shares below would print as 'nan%'.\n",
"            print(\" (no reviews exceed this length)\")\n",
"            continue\n",
"        # .sum() counts the rows flagged 1 and .mean() gives their share of the over-length subset.\n",
"        print(f\" bug_report=1: {truncated['bug_report'].sum()} ({truncated['bug_report'].mean():.1%} of truncated)\")\n",
"        print(f\" feature_request=1: {truncated['feature_request'].sum()} ({truncated['feature_request'].mean():.1%} of truncated)\")\n",
"        print(f\" aspect counts: {truncated['aspect'].value_counts().to_dict()}\")\n",
"        print(f\" sentiment counts: {truncated['aspect_sentiment'].value_counts().to_dict()}\")"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,