Decided on max_length (256 tokens) by measuring how many, and which, reviews would be truncated
This commit is contained in:
@@ -1005,6 +1005,183 @@
|
|||||||
" df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 separate files are saved"
|
" df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 separate files are saved"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 72,
|
||||||
|
"id": "0e469c90",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "4dc1bbaab74f4de3a17640dfdbe18c59",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"config.json: 0%| | 0.00/615 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "838a239984484c589a333d4b7266233b",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"tokenizer_config.json: 0%| | 0.00/25.0 [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "b3958943bb1e44bf816895b71fba43fa",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "4ea21d8b6a8043e5b571d260a825269e",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"tokenizer.json: 0%| | 0.00/9.10M [00:00<?, ?B/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"Boosted\n",
|
||||||
|
" Max tokens: 471\n",
|
||||||
|
" Mean tokens: 49.8\n",
|
||||||
|
" Over 128: 180 (3.6%)\n",
|
||||||
|
" Over 256: 21 (0.4%)\n",
|
||||||
|
"\n",
|
||||||
|
"Original\n",
|
||||||
|
" Max tokens: 327\n",
|
||||||
|
" Mean tokens: 29.3\n",
|
||||||
|
" Over 128: 58 (1.2%)\n",
|
||||||
|
" Over 256: 5 (0.1%)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from transformers import AutoTokenizer\n",
|
||||||
|
"\n",
|
||||||
|
"tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')\n",
|
||||||
|
"\n",
|
||||||
|
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
|
||||||
|
" lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
|
||||||
|
" print(f\"\\n{name}\")\n",
|
||||||
|
" print(f\" Max tokens: {lengths.max()}\")\n",
|
||||||
|
" print(f\" Mean tokens: {lengths.mean():.1f}\")\n",
|
||||||
|
" print(f\" Over 128: {(lengths > 128).sum()} ({(lengths > 128).mean():.1%})\")\n",
|
||||||
|
" print(f\" Over 256: {(lengths > 256).sum()} ({(lengths > 256).mean():.1%})\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ad011e5e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"Boosted\n",
|
||||||
|
" Max tokens: 471\n",
|
||||||
|
" Mean tokens: 49.8\n",
|
||||||
|
"\n",
|
||||||
|
" Over 128: 180 (3.6%)\n",
|
||||||
|
" bug_report=1: 68 (37.8% of truncated)\n",
|
||||||
|
" feature_request=1: 17 (9.4% of truncated)\n",
|
||||||
|
" aspect counts: {1: 57, 0: 47, 5: 36, 3: 26, 4: 12, 2: 2}\n",
|
||||||
|
" sentiment counts: {2: 165, 0: 15}\n",
|
||||||
|
"\n",
|
||||||
|
" Over 256: 21 (0.4%)\n",
|
||||||
|
" bug_report=1: 6 (28.6% of truncated)\n",
|
||||||
|
" feature_request=1: 1 (4.8% of truncated)\n",
|
||||||
|
" aspect counts: {1: 6, 5: 6, 3: 5, 0: 4}\n",
|
||||||
|
" sentiment counts: {2: 21}\n",
|
||||||
|
"\n",
|
||||||
|
"Original\n",
|
||||||
|
" Max tokens: 327\n",
|
||||||
|
" Mean tokens: 29.3\n",
|
||||||
|
"\n",
|
||||||
|
" Over 128: 58 (1.2%)\n",
|
||||||
|
" bug_report=1: 24 (41.4% of truncated)\n",
|
||||||
|
" feature_request=1: 29 (50.0% of truncated)\n",
|
||||||
|
" aspect counts: {0: 25, 5: 13, 1: 8, 2: 5, 4: 4, 3: 3}\n",
|
||||||
|
" sentiment counts: {2: 45, 0: 10, 1: 3}\n",
|
||||||
|
"\n",
|
||||||
|
" Over 256: 5 (0.1%)\n",
|
||||||
|
" bug_report=1: 2 (40.0% of truncated)\n",
|
||||||
|
" feature_request=1: 2 (40.0% of truncated)\n",
|
||||||
|
" aspect counts: {0: 2, 4: 1, 1: 1, 5: 1}\n",
|
||||||
|
" sentiment counts: {2: 5}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Let's also see if the longer reviews are more likely to be bug reports, feature requests, or have certain aspects/sentiments\n",
|
||||||
|
"# This finding informs the choice of max_length for the model: keeping the longer reviews intact should improve model quality.\n",
|
||||||
|
"# Training will take longer, but that is preferable to discarding valuable information from the longer reviews, which are mostly bug reports and feature requests\n",
|
||||||
|
"# with negative sentiment (the most important ones to classify correctly)\n",
|
||||||
|
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
|
||||||
|
" lengths = df['review'].apply(lambda x: len(tokenizer.encode(x)))\n",
|
||||||
|
" mask_128 = lengths > 128\n",
|
||||||
|
" mask_256 = lengths > 256\n",
|
||||||
|
" \n",
|
||||||
|
" print(f\"\\n{name}\")\n",
|
||||||
|
" print(f\" Max tokens: {lengths.max()}\")\n",
|
||||||
|
" print(f\" Mean tokens: {lengths.mean():.1f}\")\n",
|
||||||
|
" \n",
|
||||||
|
" for mask, label in [(mask_128, 'Over 128'), (mask_256, 'Over 256')]:\n",
|
||||||
|
" truncated = df[mask]\n",
|
||||||
|
" print(f\"\\n {label}: {mask.sum()} ({mask.mean():.1%})\")\n",
|
||||||
|
" print(f\" bug_report=1: {truncated['bug_report'].sum()} ({truncated['bug_report'].mean():.1%} of truncated)\")\n",
|
||||||
|
" print(f\" feature_request=1: {truncated['feature_request'].sum()} ({truncated['feature_request'].mean():.1%} of truncated)\")\n",
|
||||||
|
" print(f\" aspect counts: {truncated['aspect'].value_counts().to_dict()}\")\n",
|
||||||
|
" print(f\" sentiment counts: {truncated['aspect_sentiment'].value_counts().to_dict()}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|||||||
Reference in New Issue
Block a user