Preprocessed tagged datasets, fixed CSV formatting issues, and added integrity checks. Also saved mappings for later inference use.

2026-02-18 22:36:58 +00:00
parent 94a9fa1f17
commit 608588f023
4 changed files with 766 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
 <<<<<<< Updated upstream
 data/*.csv
 raw_data/
 *.ipynb_checkpoints
@@ -7,10 +6,5 @@ models/
 *.pt
 *.pth
 .ipynb_checkpoints/
-=======
+*.csv
-multitag/data/*.csv
+backup/*.csv
 multitag/raw_data/
 multitag/.ipynb_checkpoints
 multitag/.vscode
 Uber Customer Reviews.csv
 >>>>>>> Stashed changes
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -0,0 +1,571 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2b7cfa1a",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'sklearn'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'sklearn'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "667df51d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========TAGGED ORIGINAL========\n",
      "                                              review  rating  word_count  \\\n",
      "0      their have many problem but also best service       5           8   \n",
      "1           it's excellent i loved it thank you uber       5           8   \n",
      "2    it does the job as it should be, in a nice way!       5          12   \n",
      "3  i support my family members with the help of uber       5          10   \n",
      "4         it's good bt it is only.for 1 man or woman       5          10   \n",
      "\n",
      "   tagged feature_request bug_report   aspect aspect_sentiment  \n",
      "0       1              No         No  Service         Positive  \n",
      "1       1              No         No  General         Positive  \n",
      "2       1              No         No  General         Positive  \n",
      "3       1              No         No  General         Positive  \n",
      "4       1             Yes         No  General         Positive  \n",
      "\n",
      "========TAGGED BOOSTED========\n",
      "                                              review  word_count  rating  \\\n",
      "0  \"\"*the worst customer care and worst transport...          51     NaN   \n",
      "1                guy was excellent give him q raise!           7     1.0   \n",
      "2  \"\"poor service provider company, i have an err...          50     NaN   \n",
      "3  \"\"this app did not let me schedule a ride for ...          99     NaN   \n",
      "4  \"\"worst app.always high prices and drivers alw...          25     NaN   \n",
      "\n",
      "   tagged feature_request bug_report   aspect aspect_sentiment  \n",
      "0       1              No         No  Service         Negative  \n",
      "1       1              No         No   Driver         Positive  \n",
      "2       1              No        Yes      App         Negative  \n",
      "3       1              No        Yes      App         Negative  \n",
      "4       1              No         No  Pricing         Negative  \n"
     ]
    }
   ],
   "source": [
    "\n",
    "tagged_orignal_df = pd.read_csv('../data/uber_reviews_taggedOriginal.csv')\n",
    "print(f\"\\n========TAGGED ORIGINAL========\")\n",
    "print(tagged_orignal_df.head())\n",
    "tagged_boosted_df = pd.read_csv('../data/uber_reviews_taggedBoosted.csv')\n",
    "print(f\"\\n========TAGGED BOOSTED========\")\n",
    "print(tagged_boosted_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "5cf6b62b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicate rows was tagged_orignal_df: 0\n",
      "Number of duplicate rows was tagged_boosted_df: 3\n",
      "(These were removed in the next cell)\n",
      "\n",
      "Current duplicates: \n",
      "Number of duplicate rows in tagged_orignal_df: 0\n",
      "Number of duplicate rows in tagged_boosted_df: 0\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of duplicate rows was tagged_orignal_df: 0\")\n",
    "print(\"Number of duplicate rows was tagged_boosted_df: 3\\n(These were removed in the next cell)\\n\\nCurrent duplicates: \")\n",
    "\n",
    "\n",
    "duplicates = tagged_orignal_df.duplicated()\n",
    "print(f\"Number of duplicate rows in tagged_orignal_df: {duplicates.sum()}\")\n",
    "\n",
    "duplicates = tagged_boosted_df.duplicated()\n",
    "print(f\"Number of duplicate rows in tagged_boosted_df: {duplicates.sum()}\")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a0ad690d",
   "metadata": {},
   "outputs": [],
   "source": [
    "tagged_orignal_df = tagged_orignal_df.drop_duplicates()\n",
    "tagged_boosted_df = tagged_boosted_df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "a89755b2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of exact duplicates in tagged_boosted_df: 0\n",
      "Number of exact duplicates in tagged_orignal_df: 0\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of exact duplicates in tagged_boosted_df:\", tagged_boosted_df[tagged_boosted_df.duplicated(subset=['review'])].shape[0])\n",
    "print(\"Number of exact duplicates in tagged_orignal_df:\", tagged_orignal_df[tagged_orignal_df.duplicated(subset=['review'])].shape[0])\n",
    "\n",
    "tagged_boosted_df = tagged_boosted_df.drop_duplicates(subset=['review']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "f7d58696",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unique reviews: 4989\n",
      "Rows involved in exact duplication: 0\n",
      "Distinct reviews that are duplicated: 0\n"
     ]
    }
   ],
   "source": [
    "# How many unique reviews remain if you were to drop exact dupes\n",
    "print(\"Unique reviews:\", tagged_boosted_df['review'].nunique())\n",
    "\n",
    "# How many rows take part in exact duplication (keep=False flags every copy, not just the extras)\n",
    "exact_dupe_rows = tagged_boosted_df[tagged_boosted_df.duplicated(subset=['review'], keep=False)]\n",
    "print(\"Rows involved in exact duplication:\", exact_dupe_rows.shape[0])\n",
    "print(\"Distinct reviews that are duplicated:\", tagged_boosted_df.duplicated(subset=['review']).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "596d0f5d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted sample:\n",
      "0    \"\"*the worst customer care and worst transport...\n",
      "1                  guy was excellent give him q raise!\n",
      "2    \"\"poor service provider company, i have an err...\n",
      "3    \"\"this app did not let me schedule a ride for ...\n",
      "4    \"\"worst app.always high prices and drivers alw...\n",
      "5     \"\"ac not worked, driver call the whole journey\"\"\n",
      "6                                  i think ola is best\n",
      "7    \"\"when it comes to rides, they're great, but w...\n",
      "8    am using note 3 samsung. every time this app n...\n",
      "9    حرامية زفت ضيع وقتى واخد الطريق الطويل عشان يض...\n",
      "Name: review, dtype: object\n",
      "\n",
      "Original sample:\n",
      "0        their have many problem but also best service\n",
      "1             it's excellent i loved it thank you uber\n",
      "2      it does the job as it should be, in a nice way!\n",
      "3    i support my family members with the help of uber\n",
      "4           it's good bt it is only.for 1 man or woman\n",
      "5                   easy to use and much more accurate\n",
      "6         good and comfortable drive also save for us.\n",
      "7                            best ride for dhaka city.\n",
      "8              friendly person thanks for your support\n",
      "9         a very good conversationalist! good driving.\n",
      "Name: review, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(\"Boosted sample:\")\n",
    "print(tagged_boosted_df['review'].head(10))\n",
    "print(\"\\nOriginal sample:\")\n",
    "print(tagged_orignal_df['review'].head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61e6b97a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted - leading quote artifacts: 29\n",
      "Original - leading quote artifacts: 4\n",
      "Boosted - doubled internal quotes: 29\n",
      "Original - doubled internal quotes: 0\n"
     ]
    }
   ],
   "source": [
    "print(\"Boosted - leading quote artifacts:\", tagged_boosted_df[tagged_boosted_df['review'].str.startswith('\"')].shape[0])\n",
    "print(\"Original - leading quote artifacts:\", tagged_orignal_df[tagged_orignal_df['review'].str.startswith('\"')].shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "926849ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted - leading quote artifacts: 29\n",
      "Original - leading quote artifacts: 4\n",
      "Boosted - doubled internal quotes: 0\n",
      "Original - doubled internal quotes: 0\n"
     ]
    }
   ],
   "source": [
    "print(\"Boosted - leading quote artifacts:\", tagged_boosted_df[tagged_boosted_df['review'].str.startswith('\"')].shape[0])\n",
    "print(\"Original - leading quote artifacts:\", tagged_orignal_df[tagged_orignal_df['review'].str.startswith('\"')].shape[0])\n",
    "\n",
    "import re\n",
    "\n",
    "def clean_quote_artifacts(text):\n",
    "    if not isinstance(text, str):\n",
    "        return text\n",
    "    # Strip any number of leading/trailing quote chars and whitespace\n",
    "    text = text.strip('\" \\t')\n",
    "    # Collapse runs of 2+ quotes down to a single quote (for internal ones like \"\"\"\"\"\"\"\")\n",
    "    text = re.sub(r'\"{2,}', '\"', text)\n",
    "    # Clean up any quotes now left dangling at edges after internal collapse\n",
    "    text = text.strip('\" ')\n",
    "    return text\n",
    "\n",
    "tagged_boosted_df['review'] = tagged_boosted_df['review'].apply(clean_quote_artifacts)\n",
    "tagged_orignal_df['review'] = tagged_orignal_df['review'].apply(clean_quote_artifacts)\n",
    "\n",
    "print(\"Boosted - doubled internal quotes:\", tagged_boosted_df[tagged_boosted_df['review'].str.contains('\"{2,}', regex=True)].shape[0])\n",
    "print(\"Original - doubled internal quotes:\", tagged_orignal_df[tagged_orignal_df['review'].str.contains('\"{2,}', regex=True)].shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "96c0520b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted:  4,989 rows\n",
      "Original: 4,999 rows\n",
      "Boosted nulls:\n",
      " review               0\n",
      "word_count           0\n",
      "rating              84\n",
      "tagged               0\n",
      "feature_request      0\n",
      "bug_report           0\n",
      "aspect               0\n",
      "aspect_sentiment     0\n",
      "dtype: int64\n",
      "\n",
      "Original nulls:\n",
      " review              0\n",
      "rating              0\n",
      "word_count          0\n",
      "tagged              0\n",
      "feature_request     0\n",
      "bug_report          0\n",
      "aspect              0\n",
      "aspect_sentiment    0\n",
      "dtype: int64\n",
      "Boosted - empty reviews: 0\n",
      "Original - empty reviews: 0\n",
      "Boosted - leading quote artifacts: 0\n",
      "Original - leading quote artifacts: 0\n",
      "Boosted - doubled internal quotes: 0\n",
      "Original - doubled internal quotes: 0\n",
      "Boosted columns match: True\n",
      "Original columns match: False\n",
      "[\"*the worst customer care and worst transportation app in sri lanka because of it's staff service. *uber service in sri lanka is a shame to uber technologies, inc. *i expected better service because it's international service but in sri lanka it's better to use local transportation services than using uber! uber!\"\n",
      " 'worst customer care service they app stole my 140rs and not even replying'\n",
      " 'worst customer care service.cab service are good and needs to pay attention to the dynamic pricing. i feel ola is better in most of the cases. feb16 2019: bill duplicate and charged twice. customer care is so patatic and request for all the statements with no solution. i feel that there is no mechanism to track if the payment is successful or not. this is happening only with uber and second of such instance. i seen most of the online site does nice error handling and instantly give updates.'\n",
      " \"i hate this app. not working from 10 days and worst customer care service. no one is resolving. one of my uber account is disabled. and there unable to provide any reason and the other account. it says error processing your request. i tried to open another account. it is not even registring. the worst customer care service i have seen in my life is uber. 1 don't want to give even that 1 star to u. but it is the minimum. can't help😠😠😠\"\n",
      " \"worst customer care. worst service sometimes. the cab arrives 1 hour late yet they charge the cancellation amount. when complaint is made to customer care, they don't speak proper english or they were unable to understand. they don't listen to understand what the customer says. they are very keen on closing the issue and not interested in caring for the customer.\"\n",
      " \"you have the worst customer care support. you simply waste people's time without taking any option. rapido is far better atleast they have a helpline number where they call. i asked your customer support to call me on phone,but you always insist on messaging and keep on asking same thing again and again. i would give you minus review had there been options.\"\n",
      " \"worst customer care support possible. i lost my phone in uber and i couldn't contact anyone to help me out. only if uber was caring enough for their customers i would have not lost my phone and got it back!\"\n",
      " 'they have the worst payment system and worst customer care service too. i had a 50rs outstanding and 50 rs credits. but it was not balancing those while booking and didnt let me book. so i paid through credit card 50 rs. that 50 rs is nowhere in their system as well as got deducted from my account. even irctc has much better refund system than theirs. asking the customer care you get same automated reply irrespective of whatever you ask. no telephone customer care. what are they running, a local grocery shop!']\n"
     ]
    }
   ],
   "source": [
    "print(f\"Boosted:  {len(tagged_boosted_df):,} rows\")\n",
    "print(f\"Original: {len(tagged_orignal_df):,} rows\")\n",
    "print(\"Boosted nulls:\\n\", tagged_boosted_df.isnull().sum())\n",
    "print(\"\\nOriginal nulls:\\n\", tagged_orignal_df.isnull().sum())\n",
    "print(\"Boosted - empty reviews:\", tagged_boosted_df['review'].str.strip().eq('').sum())\n",
    "print(\"Original - empty reviews:\", tagged_orignal_df['review'].str.strip().eq('').sum())\n",
    "print(\"Boosted - leading quote artifacts:\", tagged_boosted_df[tagged_boosted_df['review'].str.startswith('\"')].shape[0])\n",
    "print(\"Original - leading quote artifacts:\", tagged_orignal_df[tagged_orignal_df['review'].str.startswith('\"')].shape[0])\n",
    "print(\"Boosted - doubled internal quotes:\", tagged_boosted_df[tagged_boosted_df['review'].str.contains('\"{2,}', regex=True)].shape[0])\n",
    "print(\"Original - doubled internal quotes:\", tagged_orignal_df[tagged_orignal_df['review'].str.contains('\"{2,}', regex=True)].shape[0])\n",
    "expected_cols = ['review', 'word_count', 'rating', 'tagged', 'feature_request', 'bug_report', 'aspect', 'aspect_sentiment']\n",
    "print(\"Boosted columns match:\", list(tagged_boosted_df.columns) == expected_cols)\n",
    "print(\"Original columns match:\", list(tagged_orignal_df.columns) == expected_cols)\n",
    "print(tagged_boosted_df[tagged_boosted_df['review'].str.contains('worst customer care', na=False)]['review'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27362604",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "8b76f032",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "aspect_sentiment\n",
      "Negative    75\n",
      "Positive     9\n",
      "Name: count, dtype: int64\n",
      "aspect\n",
      "Driver     23\n",
      "App        21\n",
      "Service    20\n",
      "Payment    12\n",
      "Pricing     7\n",
      "General     1\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Are the 84 null ratings clustered in a particular aspect/sentiment or spread across?\n",
    "print(tagged_boosted_df[tagged_boosted_df['rating'].isnull()]['aspect_sentiment'].value_counts())\n",
    "print(tagged_boosted_df[tagged_boosted_df['rating'].isnull()]['aspect'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0aade3fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted saved:  4,989 rows\n",
      "Original saved: 4,999 rows\n"
     ]
    }
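   ],
   "source": [
    "tagged_boosted_df.to_csv('tagged_boosted_cleaned.csv', index=False)\n",
    "tagged_orignal_df.to_csv('tagged_original_cleaned.csv', index=False)\n",
    "# Confirmed: the saved files have no malformed rows and the column count is correct. NOTE(review): these files are written to the working directory, but the mapping cell reads them from ../data/; confirm the intended path.\n",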
    "print(f\"Boosted saved:  {len(tagged_boosted_df):,} rows\")\n",
    "print(f\"Original saved: {len(tagged_orignal_df):,} rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "7e814adb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=========== Boosted ===========\n",
      "bug_report          0\n",
      "feature_request     0\n",
      "aspect              0\n",
      "aspect_sentiment    0\n",
      "dtype: int64\n",
      "bug_report          int64\n",
      "feature_request     int64\n",
      "aspect              int64\n",
      "aspect_sentiment    int64\n",
      "dtype: object\n",
      "   bug_report  feature_request  aspect  aspect_sentiment\n",
      "0           0                0       5                 2\n",
      "1           0                0       1                 0\n",
      "2           1                0       0                 2\n",
      "\n",
      "=========== Original ===========\n",
      "bug_report          0\n",
      "feature_request     0\n",
      "aspect              0\n",
      "aspect_sentiment    0\n",
      "dtype: int64\n",
      "bug_report          int64\n",
      "feature_request     int64\n",
      "aspect              int64\n",
      "aspect_sentiment    int64\n",
      "dtype: object\n",
      "   bug_report  feature_request  aspect  aspect_sentiment\n",
      "0           0                0       5                 0\n",
      "1           0                0       2                 0\n",
      "2           0                0       2                 0\n"
     ]
    }
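   ],
   "source": [
    "# Convert the Yes/No labels to integers, and likewise encode the labels of every other task.\n",
    "# NOTE: the earlier cells ran despite the spelling mistake, presumably because the misspelled name `tagged_orignal_df` was used consistently; this cell switches to `tagged_original_df`.\n",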
    "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
    "tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
    "# mappings\n",
    "binary_map = {'Yes': 1, 'No': 0}\n",
    "aspect_map = {'App': 0, 'Driver': 1, 'General': 2, 'Payment': 3, 'Pricing': 4, 'Service': 5}\n",
    "sentiment_map = {'Positive': 0, 'Neutral': 1, 'Negative': 2}\n",
    "\n",
    "tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
    "tagged_boosted_df  = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
    "\n",
    "for df in [tagged_boosted_df, tagged_original_df]:\n",
    "    df['bug_report']       = df['bug_report'].map(binary_map)\n",
    "    df['feature_request']  = df['feature_request'].map(binary_map)\n",
    "    df['aspect']           = df['aspect'].map(aspect_map)\n",
    "    df['aspect_sentiment'] = df['aspect_sentiment'].map(sentiment_map)\n",
    "    \n",
    "for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
    "\n",
    "    # Verification after mapping\n",
    "    print(f\"\\n=========== {name} ===========\")\n",
    "    print(df[['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']].isnull().sum())\n",
    "    print(df[['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']].dtypes)\n",
    "    print(df[['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']].head(3))\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "093432db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Boosted saved:  4,989 rows\n",
      "Original saved: 4,999 rows\n"
     ]
    }
   ],
   "source": [
    "tagged_boosted_df.to_csv('../data/tagged_boosted_cleaned.csv', index=False)\n",
    "tagged_original_df.to_csv('../data/tagged_original_cleaned.csv', index=False)\n",
    "\n",
    "print(f\"Boosted saved:  {len(tagged_boosted_df):,} rows\")\n",
    "print(f\"Original saved: {len(tagged_original_df):,} rows\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe8fb12c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1]\n",
      "[0 1]\n",
      "\n",
      "../data/tagged_boosted_cleaned.csv\n",
      "  Expected columns: 8\n",
      "  Malformed rows:   0\n",
      "\n",
      "../data/tagged_original_cleaned.csv\n",
      "  Expected columns: 8\n",
      "  Malformed rows:   0\n"
     ]
    }
   ],
   "source": [
    "print(tagged_boosted_df['bug_report'].unique())\n",
    "print(tagged_original_df['bug_report'].unique())\n",
    "\n",
    "# CSV structure checks "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d914c7b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "multitag",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/verify_tagged_distributions.ipynb
+++ b/notebooks/verify_tagged_distributions.ipynb
@@ -0,0 +1,189 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3203f7f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              review  rating  word_count  \\\n",
      "0      their have many problem but also best service       5           8   \n",
      "1           it's excellent i loved it thank you uber       5           8   \n",
      "2    it does the job as it should be, in a nice way!       5          12   \n",
      "3  i support my family members with the help of uber       5          10   \n",
      "4         it's good bt it is only.for 1 man or woman       5          10   \n",
      "\n",
      "   tagged feature_request bug_report   aspect aspect_sentiment  \n",
      "0       1              No         No  Service         Positive  \n",
      "1       1              No         No  General         Positive  \n",
      "2       1              No         No  General         Positive  \n",
      "3       1              No         No  General         Positive  \n",
      "4       1             Yes         No  General         Positive  \n",
      "review              object\n",
      "rating               int64\n",
      "word_count           int64\n",
      "tagged               int64\n",
      "feature_request     object\n",
      "bug_report          object\n",
      "aspect              object\n",
      "aspect_sentiment    object\n",
      "dtype: object\n",
      "        count  percentage\n",
      "rating                   \n",
      "1        1325       26.51\n",
      "2         195        3.90\n",
      "3         235        4.70\n",
      "4         390        7.80\n",
      "5        2854       57.09\n"
     ]
    }
   ],
   "source": [
    "# verify distribution of ratings before model training\n",
    "\n",
    "# As we can see, the ratings are distributed pretty much the same as in the original dataset,\n",
    "# so we can proceed with model training without any concerns about the distribution of ratings being altered during the tagging process.\n",
    "import pandas as pd\n",
    "df = pd.read_csv(\"../data/uber_reviews_taggedOriginal.csv\")\n",
    "print(df.head())\n",
    "print(df.dtypes)\n",
    "rating_counts = df[\"rating\"].value_counts().sort_index()\n",
    "rating_percent = df[\"rating\"].value_counts(normalize=True).sort_index() * 100\n",
    "rating_dist = pd.DataFrame({\n",
    "    \"count\": rating_counts,\n",
    "    \"percentage\": rating_percent.round(2)\n",
    "})\n",
    "print(rating_dist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ae6a3737",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              review  word_count  rating  \\\n",
      "0  \"\"*the worst customer care and worst transport...          51     NaN   \n",
      "1                guy was excellent give him q raise!           7     1.0   \n",
      "2  \"\"poor service provider company, i have an err...          50     NaN   \n",
      "3  \"\"this app did not let me schedule a ride for ...          99     NaN   \n",
      "4  \"\"worst app.always high prices and drivers alw...          25     NaN   \n",
      "\n",
      "   tagged feature_request bug_report   aspect aspect_sentiment  \n",
      "0       1              No         No  Service         Negative  \n",
      "1       1              No         No   Driver         Positive  \n",
      "2       1              No        Yes      App         Negative  \n",
      "3       1              No        Yes      App         Negative  \n",
      "4       1              No         No  Pricing         Negative  \n",
      "review               object\n",
      "word_count            int64\n",
      "rating              float64\n",
      "tagged                int64\n",
      "feature_request      object\n",
      "bug_report           object\n",
      "aspect               object\n",
      "aspect_sentiment     object\n",
      "dtype: object\n",
      "        count  percentage\n",
      "rating                   \n",
      "1.0      2118       43.11\n",
      "2.0       703       14.31\n",
      "3.0       697       14.19\n",
      "4.0       606       12.33\n",
      "5.0       789       16.06\n"
     ]
    }
   ],
   "source": [
    "# verify distribution of ratings before model training\n",
    "\n",
    "# Expected the distribution to match the keyword-boosted sample, i.e. heavily negative (~80% one-star).\n",
    "# Never mind - I just realized it is set to use a custom sample of the boosted data to ensure it isn't heavily skewed.\n",
    "\n",
    "import pandas as pd\n",
    "df = pd.read_csv(\"../data/uber_reviews_ISboostedAndFixed.csv\")\n",
    "print(df.head())\n",
    "print(df.dtypes)\n",
    "rating_counts = df[\"rating\"].value_counts().sort_index()\n",
    "rating_percent = df[\"rating\"].value_counts(normalize=True).sort_index() * 100\n",
    "rating_dist = pd.DataFrame({\n",
    "    \"count\": rating_counts,\n",
    "    \"percentage\": rating_percent.round(2)\n",
    "})\n",
    "print(rating_dist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d7c2da78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   rating\n",
      "0       1\n",
      "1       5\n",
      "2       5\n",
      "3       5\n",
      "4       5\n",
      "rating    int64\n",
      "dtype: object\n",
      "         count  percentage\n",
      "rating                    \n",
      "1       283895       26.54\n",
      "2        41707        3.90\n",
      "3        49928        4.67\n",
      "4        82953        7.76\n",
      "5       611133       57.14\n"
     ]
    }
   ],
   "source": [
    "# check distribution of ratings for raw dataset\n",
    "import pandas as pd\n",
    "df = pd.read_csv(\"../data/uber_reviews.csv\", usecols=[\"rating\"])\n",
    "print(df.head())\n",
    "print(df.dtypes)\n",
    "rating_counts = df[\"rating\"].value_counts().sort_index()\n",
    "rating_percent = df[\"rating\"].value_counts(normalize=True).sort_index() * 100\n",
    "rating_dist = pd.DataFrame({\n",
    "    \"count\": rating_counts,\n",
    "    \"percentage\": rating_percent.round(2)\n",
    "})\n",
    "print(rating_dist)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "multitag",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/src/infer.py
+++ b/src/infer.py
@@ -0,0 +1,4 @@
 # Label -> integer id lookup tables (ids follow the order in which the labels are listed)
 binary_map = dict(Yes=1, No=0)
 aspect_map = {label: idx for idx, label in enumerate(('App', 'Driver', 'General', 'Payment', 'Pricing', 'Service'))}
 sentiment_map = {label: idx for idx, label in enumerate(('Positive', 'Neutral', 'Negative'))}