diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb index 1ca5fbc..af33f70 100644 --- a/notebooks/preprocessing_tagged.ipynb +++ b/notebooks/preprocessing_tagged.ipynb @@ -541,7 +541,474 @@ { "cell_type": "code", "execution_count": null, - "id": "2d914c7b", + "id": "03bc252c", + "metadata": {}, + "outputs": [], + "source": [ + "# Now I can finally split into train/test sets\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "562bf9cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: 3,492 (0.6999398677089597)\n", + "Val: 748 (0.149929845660453)\n", + "Test: 749 (0.15013028663058728)\n", + "Train: 3,499 (0.6999399879975995)\n", + "Val: 750 (0.15003000600120023)\n", + "Test: 750 (0.15003000600120023)\n" + ] + } + ], + "source": [ + "def split(df, name, seed=67):\n", + " train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=df['aspect_sentiment'])\n", + " val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=temp['aspect_sentiment'])\n", + "\n", + " print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n", + " print(f\"Val: {len(val):,} ({len(val)/len(df)})\")\n", + " print(f\"Test: {len(test):,} ({len(test)/len(df)})\")\n", + " return train, val, test\n", + "\n", + "tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n", + "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n", + "\n", + "train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n", + "train_o, val_o, test_o = split(tagged_original_df, 'Original')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "b3595063", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=========== Boosted Train ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.648053\n", + 
"1 0.351947\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.909507\n", + "1 0.090493\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.285223\n", + "1 0.251432\n", + "5 0.154066\n", + "2 0.118557\n", + "3 0.108820\n", + "4 0.081901\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.808706\n", + "0 0.182703\n", + "1 0.008591\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Boosted Val ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.649733\n", + "1 0.350267\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.895722\n", + "1 0.104278\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.283422\n", + "1 0.236631\n", + "5 0.168449\n", + "2 0.112299\n", + "3 0.108289\n", + "4 0.090909\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.808824\n", + "0 0.183155\n", + "1 0.008021\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Boosted Test ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.670227\n", + "1 0.329773\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.895861\n", + "1 0.104139\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.269693\n", + "1 0.244326\n", + "5 0.146862\n", + "2 0.117490\n", + "3 0.116155\n", + "4 0.105474\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.809079\n", + "0 0.182911\n", + "1 0.008011\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Train ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.818805\n", + "1 0.181195\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.84367\n", + "1 0.15633\n", + "Name: proportion, 
dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.293512\n", + "5 0.214347\n", + "1 0.192626\n", + "2 0.157759\n", + "4 0.096599\n", + "3 0.045156\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.514433\n", + "2 0.433267\n", + "1 0.052301\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Val ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.805333\n", + "1 0.194667\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.849333\n", + "1 0.150667\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.282667\n", + "5 0.225333\n", + "1 0.192000\n", + "2 0.180000\n", + "4 0.076000\n", + "3 0.044000\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.514667\n", + "2 0.433333\n", + "1 0.052000\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Test ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.813333\n", + "1 0.186667\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.841333\n", + "1 0.158667\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.308000\n", + "5 0.208000\n", + "1 0.206667\n", + "2 0.148000\n", + "4 0.106667\n", + "3 0.022667\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.513333\n", + "2 0.433333\n", + "1 0.053333\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "splits = [\n", + " (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n", + " (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n", + "]\n", + "\n", + "for df, name in splits:\n", + " print(f\"\\n=========== {name} ===========\")\n", + " print(\"bug_report:\\n\", df['bug_report'].value_counts(normalize=True))\n", + " 
print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n", + " print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n", + " print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "2a524db6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: 3,492 (0.6999398677089597)\n", + "Val: 748 (0.149929845660453)\n", + "Test: 749 (0.15013028663058728)\n", + "Train: 3,499 (0.6999399879975995)\n", + "Val: 750 (0.15003000600120023)\n", + "Test: 750 (0.15003000600120023)\n" + ] + } + ], + "source": [ + "def split(df, name, seed=67):\n", + " composite = df['bug_report'].astype(str) + \"_\" + df['feature_request'].astype(str) + \"_\" + df['aspect_sentiment'].astype(str)\n", + " train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=composite)\n", + " \n", + " composite_temp = temp['bug_report'].astype(str) + \"_\" + temp['feature_request'].astype(str) + \"_\" + temp['aspect_sentiment'].astype(str)\n", + " val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=composite_temp)\n", + "\n", + " print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n", + " print(f\"Val: {len(val):,} ({len(val)/len(df)})\")\n", + " print(f\"Test: {len(test):,} ({len(test)/len(df)})\")\n", + " return train, val, test\n", + "\n", + "tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n", + "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n", + "\n", + "train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n", + "train_o, val_o, test_o = split(tagged_original_df, 'Original')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "bc486e8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=========== Boosted Train ===========\n", + 
"bug_report:\n", + " bug_report\n", + "0 0.651489\n", + "1 0.348511\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.905212\n", + "1 0.094788\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.280928\n", + "1 0.250286\n", + "5 0.152348\n", + "2 0.118270\n", + "3 0.114261\n", + "4 0.083906\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.808706\n", + "0 0.182990\n", + "1 0.008305\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Boosted Val ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.65107\n", + "1 0.34893\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.906417\n", + "1 0.093583\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.286096\n", + "1 0.251337\n", + "5 0.159091\n", + "2 0.113636\n", + "3 0.097594\n", + "4 0.092246\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.807487\n", + "0 0.183155\n", + "1 0.009358\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Boosted Test ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.65287\n", + "1 0.34713\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.905207\n", + "1 0.094793\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.287049\n", + "1 0.234980\n", + "5 0.164219\n", + "2 0.117490\n", + "3 0.101469\n", + "4 0.094793\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "2 0.810414\n", + "0 0.181575\n", + "1 0.008011\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Train ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.815947\n", + "1 0.184053\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 
0.844241\n", + "1 0.155759\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.294941\n", + "5 0.215490\n", + "1 0.199486\n", + "2 0.153472\n", + "4 0.097456\n", + "3 0.039154\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.514433\n", + "2 0.433267\n", + "1 0.052301\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Val ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.816\n", + "1 0.184\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.844\n", + "1 0.156\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.284000\n", + "5 0.205333\n", + "1 0.190667\n", + "2 0.181333\n", + "4 0.088000\n", + "3 0.050667\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.513333\n", + "2 0.433333\n", + "1 0.053333\n", + "Name: proportion, dtype: float64\n", + "\n", + "=========== Original Test ===========\n", + "bug_report:\n", + " bug_report\n", + "0 0.816\n", + "1 0.184\n", + "Name: proportion, dtype: float64\n", + "feature_request:\n", + " feature_request\n", + "0 0.844\n", + "1 0.156\n", + "Name: proportion, dtype: float64\n", + "aspect:\n", + " aspect\n", + "0 0.300000\n", + "5 0.222667\n", + "1 0.176000\n", + "2 0.166667\n", + "4 0.090667\n", + "3 0.044000\n", + "Name: proportion, dtype: float64\n", + "aspect_sentiment:\n", + " aspect_sentiment\n", + "0 0.514667\n", + "2 0.433333\n", + "1 0.052000\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "# Using composite key\n", + "\n", + "splits = [\n", + " (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n", + " (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n", + "]\n", + "\n", + "for df, name in splits:\n", + " print(f\"\\n=========== {name} ===========\")\n", + " print(\"bug_report:\\n\", 
df['bug_report'].value_counts(normalize=True))\n", + " print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n", + " print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n", + " print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))\n", + " df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False) # 6 separate files are saved" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67004c13", "metadata": {}, "outputs": [], "source": [] diff --git a/src/dataset.py b/src/dataset.py new file mode 100644 index 0000000..e69de29 diff --git a/src/evaluate.py b/src/evaluate.py new file mode 100644 index 0000000..e69de29