diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb
index 1ca5fbc..af33f70 100644
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -541,7 +541,474 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2d914c7b",
+   "id": "03bc252c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now I can finally split into train/val/test sets\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "562bf9cd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: 3,492 (0.6999398677089597)\n",
+      "Val:   748   (0.149929845660453)\n",
+      "Test:  749  (0.15013028663058728)\n",
+      "Train: 3,499 (0.6999399879975995)\n",
+      "Val:   750   (0.15003000600120023)\n",
+      "Test:  750  (0.15003000600120023)\n"
+     ]
+    }
+   ],
+   "source": [
+    "def split(df, name, seed=67):\n",
+    "    train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=df['aspect_sentiment'])\n",
+    "    val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=temp['aspect_sentiment'])\n",
+    "\n",
+    "    print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n",
+    "    print(f\"Val:   {len(val):,}   ({len(val)/len(df)})\")\n",
+    "    print(f\"Test:  {len(test):,}  ({len(test)/len(df)})\")\n",
+    "    return train, val, test\n",
+    "\n",
+    "tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
+    "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
+    "\n",
+    "train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n",
+    "train_o, val_o, test_o = split(tagged_original_df, 'Original')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "b3595063",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=========== Boosted Train ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.648053\n",
+      "1    0.351947\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.909507\n",
+      "1    0.090493\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.285223\n",
+      "1    0.251432\n",
+      "5    0.154066\n",
+      "2    0.118557\n",
+      "3    0.108820\n",
+      "4    0.081901\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.808706\n",
+      "0    0.182703\n",
+      "1    0.008591\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Boosted Val ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.649733\n",
+      "1    0.350267\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.895722\n",
+      "1    0.104278\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.283422\n",
+      "1    0.236631\n",
+      "5    0.168449\n",
+      "2    0.112299\n",
+      "3    0.108289\n",
+      "4    0.090909\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.808824\n",
+      "0    0.183155\n",
+      "1    0.008021\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Boosted Test ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.670227\n",
+      "1    0.329773\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.895861\n",
+      "1    0.104139\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.269693\n",
+      "1    0.244326\n",
+      "5    0.146862\n",
+      "2    0.117490\n",
+      "3    0.116155\n",
+      "4    0.105474\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.809079\n",
+      "0    0.182911\n",
+      "1    0.008011\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Train ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.818805\n",
+      "1    0.181195\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.84367\n",
+      "1    0.15633\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.293512\n",
+      "5    0.214347\n",
+      "1    0.192626\n",
+      "2    0.157759\n",
+      "4    0.096599\n",
+      "3    0.045156\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.514433\n",
+      "2    0.433267\n",
+      "1    0.052301\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Val ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.805333\n",
+      "1    0.194667\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.849333\n",
+      "1    0.150667\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.282667\n",
+      "5    0.225333\n",
+      "1    0.192000\n",
+      "2    0.180000\n",
+      "4    0.076000\n",
+      "3    0.044000\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.514667\n",
+      "2    0.433333\n",
+      "1    0.052000\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Test ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.813333\n",
+      "1    0.186667\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.841333\n",
+      "1    0.158667\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.308000\n",
+      "5    0.208000\n",
+      "1    0.206667\n",
+      "2    0.148000\n",
+      "4    0.106667\n",
+      "3    0.022667\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.513333\n",
+      "2    0.433333\n",
+      "1    0.053333\n",
+      "Name: proportion, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "splits = [\n",
+    "    (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n",
+    "    (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n",
+    "]\n",
+    "\n",
+    "for df, name in splits:\n",
+    "    print(f\"\\n=========== {name} ===========\")\n",
+    "    print(\"bug_report:\\n\", df['bug_report'].value_counts(normalize=True))\n",
+    "    print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n",
+    "    print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n",
+    "    print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "2a524db6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: 3,492 (0.6999398677089597)\n",
+      "Val:   748   (0.149929845660453)\n",
+      "Test:  749  (0.15013028663058728)\n",
+      "Train: 3,499 (0.6999399879975995)\n",
+      "Val:   750   (0.15003000600120023)\n",
+      "Test:  750  (0.15003000600120023)\n"
+     ]
+    }
+   ],
+   "source": [
+    "def split(df, name, seed=67):\n",
+    "    composite = df['bug_report'].astype(str) + \"_\" + df['feature_request'].astype(str) + \"_\" + df['aspect_sentiment'].astype(str)\n",
+    "    train, temp = train_test_split(df, test_size=0.30, random_state=seed, stratify=composite)\n",
+    "    \n",
+    "    composite_temp = temp['bug_report'].astype(str) + \"_\" + temp['feature_request'].astype(str) + \"_\" + temp['aspect_sentiment'].astype(str)\n",
+    "    val, test = train_test_split(temp, test_size=0.50, random_state=seed, stratify=composite_temp)\n",
+    "\n",
+    "    print(f\"Train: {len(train):,} ({len(train)/len(df)})\")\n",
+    "    print(f\"Val:   {len(val):,}   ({len(val)/len(df)})\")\n",
+    "    print(f\"Test:  {len(test):,}  ({len(test)/len(df)})\")\n",
+    "    return train, val, test\n",
+    "\n",
+    "tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
+    "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
+    "\n",
+    "train_b, val_b, test_b = split(tagged_boosted_df, 'Boosted')\n",
+    "train_o, val_o, test_o = split(tagged_original_df, 'Original')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "bc486e8f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=========== Boosted Train ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.651489\n",
+      "1    0.348511\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.905212\n",
+      "1    0.094788\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.280928\n",
+      "1    0.250286\n",
+      "5    0.152348\n",
+      "2    0.118270\n",
+      "3    0.114261\n",
+      "4    0.083906\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.808706\n",
+      "0    0.182990\n",
+      "1    0.008305\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Boosted Val ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.65107\n",
+      "1    0.34893\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.906417\n",
+      "1    0.093583\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.286096\n",
+      "1    0.251337\n",
+      "5    0.159091\n",
+      "2    0.113636\n",
+      "3    0.097594\n",
+      "4    0.092246\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.807487\n",
+      "0    0.183155\n",
+      "1    0.009358\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Boosted Test ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.65287\n",
+      "1    0.34713\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.905207\n",
+      "1    0.094793\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.287049\n",
+      "1    0.234980\n",
+      "5    0.164219\n",
+      "2    0.117490\n",
+      "3    0.101469\n",
+      "4    0.094793\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "2    0.810414\n",
+      "0    0.181575\n",
+      "1    0.008011\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Train ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.815947\n",
+      "1    0.184053\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.844241\n",
+      "1    0.155759\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.294941\n",
+      "5    0.215490\n",
+      "1    0.199486\n",
+      "2    0.153472\n",
+      "4    0.097456\n",
+      "3    0.039154\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.514433\n",
+      "2    0.433267\n",
+      "1    0.052301\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Val ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.816\n",
+      "1    0.184\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.844\n",
+      "1    0.156\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.284000\n",
+      "5    0.205333\n",
+      "1    0.190667\n",
+      "2    0.181333\n",
+      "4    0.088000\n",
+      "3    0.050667\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.513333\n",
+      "2    0.433333\n",
+      "1    0.053333\n",
+      "Name: proportion, dtype: float64\n",
+      "\n",
+      "=========== Original Test ===========\n",
+      "bug_report:\n",
+      " bug_report\n",
+      "0    0.816\n",
+      "1    0.184\n",
+      "Name: proportion, dtype: float64\n",
+      "feature_request:\n",
+      " feature_request\n",
+      "0    0.844\n",
+      "1    0.156\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect:\n",
+      " aspect\n",
+      "0    0.300000\n",
+      "5    0.222667\n",
+      "1    0.176000\n",
+      "2    0.166667\n",
+      "4    0.090667\n",
+      "3    0.044000\n",
+      "Name: proportion, dtype: float64\n",
+      "aspect_sentiment:\n",
+      " aspect_sentiment\n",
+      "0    0.514667\n",
+      "2    0.433333\n",
+      "1    0.052000\n",
+      "Name: proportion, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Using the composite stratification key: re-check label proportions per split, then save each split\n",
+    "\n",
+    "splits = [\n",
+    "    (train_b, 'Boosted Train'), (val_b, 'Boosted Val'), (test_b, 'Boosted Test'),\n",
+    "    (train_o, 'Original Train'), (val_o, 'Original Val'), (test_o, 'Original Test')\n",
+    "]\n",
+    "\n",
+    "for df, name in splits:\n",
+    "    print(f\"\\n=========== {name} ===========\")\n",
+    "    print(\"bug_report:\\n\", df['bug_report'].value_counts(normalize=True))\n",
+    "    print(\"feature_request:\\n\", df['feature_request'].value_counts(normalize=True))\n",
+    "    print(\"aspect:\\n\", df['aspect'].value_counts(normalize=True))\n",
+    "    print(\"aspect_sentiment:\\n\", df['aspect_sentiment'].value_counts(normalize=True))\n",
+    "    df.to_csv(f'{name.lower().replace(\" \", \"_\")}.csv', index=False)    # each of the 6 splits is saved to its own file, e.g. boosted_train.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67004c13",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/src/dataset.py b/src/dataset.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/evaluate.py b/src/evaluate.py
new file mode 100644
index 0000000..e69de29