Finally processed the data fully and tested. Moving on to dataset.py and model.py
This commit is contained in:
@@ -541,7 +541,474 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2d914c7b",
|
||||
"id": "03bc252c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now I can finally split into train/test sets\n",
|
||||
"from sklearn.model_selection import train_test_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "562bf9cd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train: 3,492 (0.6999398677089597)\n",
|
||||
"Val: 748 (0.149929845660453)\n",
|
||||
"Test: 749 (0.15013028663058728)\n",
|
||||
"Train: 3,499 (0.6999399879975995)\n",
|
||||
"Val: 750 (0.15003000600120023)\n",
|
||||
"Test: 750 (0.15003000600120023)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def split(df, name, seed=67):\n",
|
||||
" train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=df['aspect_sentiment'])\n",
|
||||
" val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=temp['aspect_sentiment'])\n",
|
||||
"\n",
|
||||
" print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n",
|
||||
" print(f\"Val: {len(val):,} ({len(val)/len(df)})\")\n",
|
||||
" print(f\"Test: {len(test):,} ({len(test)/len(df)})\")\n",
|
||||
" return train, val, test\n",
|
||||
"\n",
|
||||
"tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
|
||||
"tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
|
||||
"\n",
|
||||
"train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n",
|
||||
"train_o, val_o, test_o = split(tagged_original_df, 'Original')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"id": "b3595063",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=========== Boosted Train ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.648053\n",
|
||||
"1 0.351947\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.909507\n",
|
||||
"1 0.090493\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.285223\n",
|
||||
"1 0.251432\n",
|
||||
"5 0.154066\n",
|
||||
"2 0.118557\n",
|
||||
"3 0.108820\n",
|
||||
"4 0.081901\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.808706\n",
|
||||
"0 0.182703\n",
|
||||
"1 0.008591\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Boosted Val ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.649733\n",
|
||||
"1 0.350267\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.895722\n",
|
||||
"1 0.104278\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.283422\n",
|
||||
"1 0.236631\n",
|
||||
"5 0.168449\n",
|
||||
"2 0.112299\n",
|
||||
"3 0.108289\n",
|
||||
"4 0.090909\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.808824\n",
|
||||
"0 0.183155\n",
|
||||
"1 0.008021\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Boosted Test ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.670227\n",
|
||||
"1 0.329773\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.895861\n",
|
||||
"1 0.104139\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.269693\n",
|
||||
"1 0.244326\n",
|
||||
"5 0.146862\n",
|
||||
"2 0.117490\n",
|
||||
"3 0.116155\n",
|
||||
"4 0.105474\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.809079\n",
|
||||
"0 0.182911\n",
|
||||
"1 0.008011\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Train ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.818805\n",
|
||||
"1 0.181195\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.84367\n",
|
||||
"1 0.15633\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.293512\n",
|
||||
"5 0.214347\n",
|
||||
"1 0.192626\n",
|
||||
"2 0.157759\n",
|
||||
"4 0.096599\n",
|
||||
"3 0.045156\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.514433\n",
|
||||
"2 0.433267\n",
|
||||
"1 0.052301\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Val ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.805333\n",
|
||||
"1 0.194667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.849333\n",
|
||||
"1 0.150667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.282667\n",
|
||||
"5 0.225333\n",
|
||||
"1 0.192000\n",
|
||||
"2 0.180000\n",
|
||||
"4 0.076000\n",
|
||||
"3 0.044000\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.514667\n",
|
||||
"2 0.433333\n",
|
||||
"1 0.052000\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Test ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.813333\n",
|
||||
"1 0.186667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.841333\n",
|
||||
"1 0.158667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.308000\n",
|
||||
"5 0.208000\n",
|
||||
"1 0.206667\n",
|
||||
"2 0.148000\n",
|
||||
"4 0.106667\n",
|
||||
"3 0.022667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.513333\n",
|
||||
"2 0.433333\n",
|
||||
"1 0.053333\n",
|
||||
"Name: proportion, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"splits = [\n",
|
||||
" (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n",
|
||||
" (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for df, name in splits:\n",
|
||||
" print(f\"\\n=========== {name} ===========\")\n",
|
||||
" print(\"bug_report:\\n\", df['bug_report'].value_counts(normalize=True))\n",
|
||||
" print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n",
|
||||
" print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n",
|
||||
" print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"id": "2a524db6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train: 3,492 (0.6999398677089597)\n",
|
||||
"Val: 748 (0.149929845660453)\n",
|
||||
"Test: 749 (0.15013028663058728)\n",
|
||||
"Train: 3,499 (0.6999399879975995)\n",
|
||||
"Val: 750 (0.15003000600120023)\n",
|
||||
"Test: 750 (0.15003000600120023)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def split(df, name, seed=67):\n",
|
||||
" composite = df['bug_report'].astype(str) + \"_\" + df['feature_request'].astype(str) + \"_\" + df['aspect_sentiment'].astype(str)\n",
|
||||
" train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=composite)\n",
|
||||
" \n",
|
||||
" composite_temp = temp['bug_report'].astype(str) + \"_\" + temp['feature_request'].astype(str) + \"_\" + temp['aspect_sentiment'].astype(str)\n",
|
||||
" val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=composite_temp)\n",
|
||||
"\n",
|
||||
" print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n",
|
||||
" print(f\"Val: {len(val):,} ({len(val)/len(df)})\")\n",
|
||||
" print(f\"Test: {len(test):,} ({len(test)/len(df)})\")\n",
|
||||
" return train, val, test\n",
|
||||
"\n",
|
||||
"tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
|
||||
"tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
|
||||
"\n",
|
||||
"train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n",
|
||||
"train_o, val_o, test_o = split(tagged_original_df, 'Original')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "bc486e8f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=========== Boosted Train ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.651489\n",
|
||||
"1 0.348511\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.905212\n",
|
||||
"1 0.094788\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.280928\n",
|
||||
"1 0.250286\n",
|
||||
"5 0.152348\n",
|
||||
"2 0.118270\n",
|
||||
"3 0.114261\n",
|
||||
"4 0.083906\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.808706\n",
|
||||
"0 0.182990\n",
|
||||
"1 0.008305\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Boosted Val ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.65107\n",
|
||||
"1 0.34893\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.906417\n",
|
||||
"1 0.093583\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.286096\n",
|
||||
"1 0.251337\n",
|
||||
"5 0.159091\n",
|
||||
"2 0.113636\n",
|
||||
"3 0.097594\n",
|
||||
"4 0.092246\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.807487\n",
|
||||
"0 0.183155\n",
|
||||
"1 0.009358\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Boosted Test ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.65287\n",
|
||||
"1 0.34713\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.905207\n",
|
||||
"1 0.094793\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.287049\n",
|
||||
"1 0.234980\n",
|
||||
"5 0.164219\n",
|
||||
"2 0.117490\n",
|
||||
"3 0.101469\n",
|
||||
"4 0.094793\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"2 0.810414\n",
|
||||
"0 0.181575\n",
|
||||
"1 0.008011\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Train ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.815947\n",
|
||||
"1 0.184053\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.844241\n",
|
||||
"1 0.155759\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.294941\n",
|
||||
"5 0.215490\n",
|
||||
"1 0.199486\n",
|
||||
"2 0.153472\n",
|
||||
"4 0.097456\n",
|
||||
"3 0.039154\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.514433\n",
|
||||
"2 0.433267\n",
|
||||
"1 0.052301\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Val ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.816\n",
|
||||
"1 0.184\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.844\n",
|
||||
"1 0.156\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.284000\n",
|
||||
"5 0.205333\n",
|
||||
"1 0.190667\n",
|
||||
"2 0.181333\n",
|
||||
"4 0.088000\n",
|
||||
"3 0.050667\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.513333\n",
|
||||
"2 0.433333\n",
|
||||
"1 0.053333\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"\n",
|
||||
"=========== Original Test ===========\n",
|
||||
"bug_report:\n",
|
||||
" bug_report\n",
|
||||
"0 0.816\n",
|
||||
"1 0.184\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"feature_request:\n",
|
||||
" feature_request\n",
|
||||
"0 0.844\n",
|
||||
"1 0.156\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect:\n",
|
||||
" aspect\n",
|
||||
"0 0.300000\n",
|
||||
"5 0.222667\n",
|
||||
"1 0.176000\n",
|
||||
"2 0.166667\n",
|
||||
"4 0.090667\n",
|
||||
"3 0.044000\n",
|
||||
"Name: proportion, dtype: float64\n",
|
||||
"aspect_sentiment:\n",
|
||||
" aspect_sentiment\n",
|
||||
"0 0.514667\n",
|
||||
"2 0.433333\n",
|
||||
"1 0.052000\n",
|
||||
"Name: proportion, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Using composite key\n",
|
||||
"\n",
|
||||
"splits = [\n",
|
||||
" (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n",
|
||||
" (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for df, name in splits:\n",
|
||||
" print(f\"\\n=========== {name} ===========\")\n",
|
||||
" print(\"bug_report:\\n\", df['bug_report'].value_counts(normalize=True))\n",
|
||||
" print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n",
|
||||
" print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n",
|
||||
" print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))\n",
|
||||
" df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 seperate files are saved"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67004c13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
||||
Reference in New Issue
Block a user