Preprocessed tagged datasets, fixed CSV formatting issues, and added integrity checks. Also saved mappings for later inference use.

This commit is contained in:
2026-02-18 22:36:58 +00:00
parent 94a9fa1f17
commit 608588f023
4 changed files with 766 additions and 8 deletions

10
.gitignore vendored
View File

@@ -1,4 +1,3 @@
<<<<<<< Updated upstream
data/*.csv data/*.csv
raw_data/ raw_data/
*.ipynb_checkpoints *.ipynb_checkpoints
@@ -7,10 +6,5 @@ models/
*.pt *.pt
*.pth *.pth
.ipynb_checkpoints/ .ipynb_checkpoints/
======= *.csv
multitag/data/*.csv backup/*.csv
multitag/raw_data/
multitag/.ipynb_checkpoints
multitag/.vscode
Uber Customer Reviews.csv
>>>>>>> Stashed changes

View File

@@ -0,0 +1,571 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "2b7cfa1a",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'sklearn'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'sklearn'"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "667df51d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"========TAGGED ORIGINAL========\n",
" review rating word_count \\\n",
"0 their have many problem but also best service 5 8 \n",
"1 it's excellent i loved it thank you uber 5 8 \n",
"2 it does the job as it should be, in a nice way! 5 12 \n",
"3 i support my family members with the help of uber 5 10 \n",
"4 it's good bt it is only.for 1 man or woman 5 10 \n",
"\n",
" tagged feature_request bug_report aspect aspect_sentiment \n",
"0 1 No No Service Positive \n",
"1 1 No No General Positive \n",
"2 1 No No General Positive \n",
"3 1 No No General Positive \n",
"4 1 Yes No General Positive \n",
"\n",
"========TAGGED BOOSTED========\n",
" review word_count rating \\\n",
"0 \"\"*the worst customer care and worst transport... 51 NaN \n",
"1 guy was excellent give him q raise! 7 1.0 \n",
"2 \"\"poor service provider company, i have an err... 50 NaN \n",
"3 \"\"this app did not let me schedule a ride for ... 99 NaN \n",
"4 \"\"worst app.always high prices and drivers alw... 25 NaN \n",
"\n",
" tagged feature_request bug_report aspect aspect_sentiment \n",
"0 1 No No Service Negative \n",
"1 1 No No Driver Positive \n",
"2 1 No Yes App Negative \n",
"3 1 No Yes App Negative \n",
"4 1 No No Pricing Negative \n"
]
}
],
"source": [
"\n",
"# Read both tagged datasets, then preview the first rows of each under a banner.\n",
"tagged_orignal_df = pd.read_csv('../data/uber_reviews_taggedOriginal.csv')\n",
"tagged_boosted_df = pd.read_csv('../data/uber_reviews_taggedBoosted.csv')\n",
"\n",
"previews = (\n",
"    (\"TAGGED ORIGINAL\", tagged_orignal_df),\n",
"    (\"TAGGED BOOSTED\", tagged_boosted_df),\n",
")\n",
"for banner, frame in previews:\n",
"    print(f\"\\n========{banner}========\")\n",
"    print(frame.head())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5cf6b62b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicate rows was tagged_orignal_df: 0\n",
"Number of duplicate rows was tagged_boosted_df: 3\n",
"(These were removed in the next cell)\n",
"\n",
"Current duplicates: \n",
"Number of duplicate rows in tagged_orignal_df: 0\n",
"Number of duplicate rows in tagged_boosted_df: 0\n"
]
}
],
"source": [
"# Hard-coded note with the duplicate counts that were seen before the de-duplication cell ran.\n",
"recorded_note = (\n",
"    \"Number of duplicate rows was tagged_orignal_df: 0\\n\"\n",
"    \"Number of duplicate rows was tagged_boosted_df: 3\\n\"\n",
"    \"(These were removed in the next cell)\\n\"\n",
"    \"\\n\"\n",
"    \"Current duplicates: \"\n",
")\n",
"print(recorded_note)\n",
"\n",
"# Recount rows that are identical across every column, on the frames as they are now.\n",
"for frame_name, frame in ((\"tagged_orignal_df\", tagged_orignal_df), (\"tagged_boosted_df\", tagged_boosted_df)):\n",
"    duplicates = frame.duplicated()\n",
"    print(f\"Number of duplicate rows in {frame_name}: {duplicates.sum()}\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a0ad690d",
"metadata": {},
"outputs": [],
"source": [
"# Remove rows that are identical across every column, for both datasets.\n",
"tagged_orignal_df, tagged_boosted_df = (\n",
"    frame.drop_duplicates() for frame in (tagged_orignal_df, tagged_boosted_df)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "a89755b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of exact duplicates in tagged_boosted_df: 0\n",
"Number of exact duplicates in tagged_orignal_df: 0\n"
]
}
],
"source": [
"# Count rows whose review text repeats an earlier row (the first occurrence is not counted).\n",
"for frame_name, frame in ((\"tagged_boosted_df\", tagged_boosted_df), (\"tagged_orignal_df\", tagged_orignal_df)):\n",
"    repeats_earlier_review = frame.duplicated(subset=['review'])\n",
"    print(f\"Number of exact duplicates in {frame_name}:\", frame[repeats_earlier_review].shape[0])\n",
"\n",
"# Only the boosted frame is de-duplicated on review text; the index is renumbered afterwards.\n",
"boosted_unique_reviews = tagged_boosted_df.drop_duplicates(subset=['review'])\n",
"tagged_boosted_df = boosted_unique_reviews.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "f7d58696",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unique reviews: 4989\n",
"Rows involved in exact duplication: 0\n",
"Distinct reviews that are duplicated: 0\n"
]
}
],
"source": [
"# How many unique reviews remain if you were to drop exact dupes\n",
"print(\"Unique reviews:\", tagged_boosted_df['review'].nunique())\n",
"\n",
"# Every row whose review text occurs more than once (keep=False flags all copies, not just the extras).\n",
"exact_dupe_rows = tagged_boosted_df[tagged_boosted_df.duplicated(subset=['review'], keep=False)]\n",
"print(\"Rows involved in exact duplication:\", exact_dupe_rows.shape[0])\n",
"\n",
"# Count each repeated review text once. duplicated(subset=['review']).sum() counts the surplus\n",
"# copies instead (a review present three times contributes 2), which does not match the label.\n",
"# dropna=False keeps a repeated missing review as its own group, as duplicated() does.\n",
"print(\"Distinct reviews that are duplicated:\", exact_dupe_rows['review'].nunique(dropna=False))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "596d0f5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted sample:\n",
"0 \"\"*the worst customer care and worst transport...\n",
"1 guy was excellent give him q raise!\n",
"2 \"\"poor service provider company, i have an err...\n",
"3 \"\"this app did not let me schedule a ride for ...\n",
"4 \"\"worst app.always high prices and drivers alw...\n",
"5 \"\"ac not worked, driver call the whole journey\"\"\n",
"6 i think ola is best\n",
"7 \"\"when it comes to rides, they're great, but w...\n",
"8 am using note 3 samsung. every time this app n...\n",
"9 حرامية زفت ضيع وقتى واخد الطريق الطويل عشان يض...\n",
"Name: review, dtype: object\n",
"\n",
"Original sample:\n",
"0 their have many problem but also best service\n",
"1 it's excellent i loved it thank you uber\n",
"2 it does the job as it should be, in a nice way!\n",
"3 i support my family members with the help of uber\n",
"4 it's good bt it is only.for 1 man or woman\n",
"5 easy to use and much more accurate\n",
"6 good and comfortable drive also save for us.\n",
"7 best ride for dhaka city.\n",
"8 friendly person thanks for your support\n",
"9 a very good conversationalist! good driving.\n",
"Name: review, dtype: object\n"
]
}
],
"source": [
"# Show the first ten review texts of each dataset, boosted first.\n",
"samples = (\n",
"    (\"Boosted sample:\", tagged_boosted_df),\n",
"    (\"\\nOriginal sample:\", tagged_orignal_df),\n",
")\n",
"for heading, frame in samples:\n",
"    print(heading)\n",
"    print(frame['review'].head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61e6b97a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted - leading quote artifacts: 29\n",
"Original - leading quote artifacts: 4\n",
"Boosted - doubled internal quotes: 29\n",
"Original - doubled internal quotes: 0\n"
]
}
],
"source": [
"# Count reviews whose text begins with a double quote character.\n",
"for label, frame in ((\"Boosted\", tagged_boosted_df), (\"Original\", tagged_orignal_df)):\n",
"    starts_with_quote = frame['review'].str.startswith('\"')\n",
"    print(f\"{label} - leading quote artifacts:\", frame[starts_with_quote].shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "926849ec",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted - leading quote artifacts: 29\n",
"Original - leading quote artifacts: 4\n",
"Boosted - doubled internal quotes: 0\n",
"Original - doubled internal quotes: 0\n"
]
}
],
"source": [
"frames_by_label = ((\"Boosted\", tagged_boosted_df), (\"Original\", tagged_orignal_df))\n",
"\n",
"# Leading-quote counts as they stand before the cleaning below is applied.\n",
"for label, frame in frames_by_label:\n",
"    starts_with_quote = frame['review'].str.startswith('\"')\n",
"    print(f\"{label} - leading quote artifacts:\", frame[starts_with_quote].shape[0])\n",
"\n",
"import re\n",
"\n",
"def clean_quote_artifacts(text):\n",
"    \"\"\"Remove stray double quotes from a review; non-string values are returned unchanged.\n",
"\n",
"    Steps: trim double quotes, spaces and tabs from both ends; collapse every run of\n",
"    two or more double quotes into a single one; trim double quotes and spaces from\n",
"    both ends once more.\n",
"    \"\"\"\n",
"    if isinstance(text, str):\n",
"        trimmed = text.strip('\" \\t')\n",
"        collapsed = re.sub(r'\"{2,}', '\"', trimmed)\n",
"        return collapsed.strip('\" ')\n",
"    return text\n",
"\n",
"# Clean the review column of both frames in place.\n",
"for _, frame in frames_by_label:\n",
"    frame['review'] = frame['review'].apply(clean_quote_artifacts)\n",
"\n",
"# After cleaning: how many reviews still contain two or more consecutive double quotes.\n",
"for label, frame in frames_by_label:\n",
"    has_doubled_quotes = frame['review'].str.contains('\"{2,}', regex=True)\n",
"    print(f\"{label} - doubled internal quotes:\", frame[has_doubled_quotes].shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "96c0520b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted: 4,989 rows\n",
"Original: 4,999 rows\n",
"Boosted nulls:\n",
" review 0\n",
"word_count 0\n",
"rating 84\n",
"tagged 0\n",
"feature_request 0\n",
"bug_report 0\n",
"aspect 0\n",
"aspect_sentiment 0\n",
"dtype: int64\n",
"\n",
"Original nulls:\n",
" review 0\n",
"rating 0\n",
"word_count 0\n",
"tagged 0\n",
"feature_request 0\n",
"bug_report 0\n",
"aspect 0\n",
"aspect_sentiment 0\n",
"dtype: int64\n",
"Boosted - empty reviews: 0\n",
"Original - empty reviews: 0\n",
"Boosted - leading quote artifacts: 0\n",
"Original - leading quote artifacts: 0\n",
"Boosted - doubled internal quotes: 0\n",
"Original - doubled internal quotes: 0\n",
"Boosted columns match: True\n",
"Original columns match: False\n",
"[\"*the worst customer care and worst transportation app in sri lanka because of it's staff service. *uber service in sri lanka is a shame to uber technologies, inc. *i expected better service because it's international service but in sri lanka it's better to use local transportation services than using uber! uber!\"\n",
" 'worst customer care service they app stole my 140rs and not even replying'\n",
" 'worst customer care service.cab service are good and needs to pay attention to the dynamic pricing. i feel ola is better in most of the cases. feb16 2019: bill duplicate and charged twice. customer care is so patatic and request for all the statements with no solution. i feel that there is no mechanism to track if the payment is successful or not. this is happening only with uber and second of such instance. i seen most of the online site does nice error handling and instantly give updates.'\n",
" \"i hate this app. not working from 10 days and worst customer care service. no one is resolving. one of my uber account is disabled. and there unable to provide any reason and the other account. it says error processing your request. i tried to open another account. it is not even registring. the worst customer care service i have seen in my life is uber. 1 don't want to give even that 1 star to u. but it is the minimum. can't help😠😠😠\"\n",
" \"worst customer care. worst service sometimes. the cab arrives 1 hour late yet they charge the cancellation amount. when complaint is made to customer care, they don't speak proper english or they were unable to understand. they don't listen to understand what the customer says. they are very keen on closing the issue and not interested in caring for the customer.\"\n",
" \"you have the worst customer care support. you simply waste people's time without taking any option. rapido is far better atleast they have a helpline number where they call. i asked your customer support to call me on phone,but you always insist on messaging and keep on asking same thing again and again. i would give you minus review had there been options.\"\n",
" \"worst customer care support possible. i lost my phone in uber and i couldn't contact anyone to help me out. only if uber was caring enough for their customers i would have not lost my phone and got it back!\"\n",
" 'they have the worst payment system and worst customer care service too. i had a 50rs outstanding and 50 rs credits. but it was not balancing those while booking and didnt let me book. so i paid through credit card 50 rs. that 50 rs is nowhere in their system as well as got deducted from my account. even irctc has much better refund system than theirs. asking the customer care you get same automated reply irrespective of whatever you ask. no telephone customer care. what are they running, a local grocery shop!']\n"
]
}
],
"source": [
"# Integrity checks on both frames: size, nulls, empty text, quote artifacts and column layout.\n",
"checked_frames = ((\"Boosted\", tagged_boosted_df), (\"Original\", tagged_orignal_df))\n",
"\n",
"for label, frame in checked_frames:\n",
"    print(f\"{label}: {len(frame):,} rows\")\n",
"\n",
"print(\"Boosted nulls:\\n\", tagged_boosted_df.isnull().sum())\n",
"print(\"\\nOriginal nulls:\\n\", tagged_orignal_df.isnull().sum())\n",
"\n",
"for label, frame in checked_frames:\n",
"    print(f\"{label} - empty reviews:\", frame['review'].str.strip().eq('').sum())\n",
"for label, frame in checked_frames:\n",
"    print(f\"{label} - leading quote artifacts:\", frame[frame['review'].str.startswith('\"')].shape[0])\n",
"for label, frame in checked_frames:\n",
"    print(f\"{label} - doubled internal quotes:\", frame[frame['review'].str.contains('\"{2,}', regex=True)].shape[0])\n",
"\n",
"expected_cols = ['review', 'word_count', 'rating', 'tagged', 'feature_request', 'bug_report', 'aspect', 'aspect_sentiment']\n",
"for label, frame in checked_frames:\n",
"    actual_cols = list(frame.columns)\n",
"    print(f\"{label} columns match:\", actual_cols == expected_cols)\n",
"    if actual_cols != expected_cols:\n",
"        # The comparison above is order-sensitive, so a bare False cannot tell a missing\n",
"        # column from a reordered one. Spell out which of the two it is.\n",
"        missing_cols = [col for col in expected_cols if col not in actual_cols]\n",
"        unexpected_cols = [col for col in actual_cols if col not in expected_cols]\n",
"        if missing_cols or unexpected_cols:\n",
"            print(f\"  {label} missing columns: {missing_cols}; unexpected columns: {unexpected_cols}\")\n",
"        else:\n",
"            print(f\"  {label} has the expected columns in a different order: {actual_cols}\")\n",
"\n",
"# Spot check: full text of every boosted review containing this phrase.\n",
"print(tagged_boosted_df[tagged_boosted_df['review'].str.contains('worst customer care', na=False)]['review'].values)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27362604",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 44,
"id": "8b76f032",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aspect_sentiment\n",
"Negative 75\n",
"Positive 9\n",
"Name: count, dtype: int64\n",
"aspect\n",
"Driver 23\n",
"App 21\n",
"Service 20\n",
"Payment 12\n",
"Pricing 7\n",
"General 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Are the 84 null ratings clustered in a particular aspect/sentiment or spread across?\n",
"rows_without_rating = tagged_boosted_df[tagged_boosted_df['rating'].isnull()]\n",
"for label_column in ('aspect_sentiment', 'aspect'):\n",
"    print(rows_without_rating[label_column].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0aade3fb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted saved: 4,989 rows\n",
"Original saved: 4,999 rows\n"
]
}
],
"source": [
"# Write the cleaned frames into ../data, because the label-encoding cell further down reads\n",
"# '../data/tagged_boosted_cleaned.csv' and '../data/tagged_original_cleaned.csv'. Writing to the\n",
"# notebook's working directory would leave that cell with missing or stale input on a fresh run.\n",
"tagged_boosted_df.to_csv('../data/tagged_boosted_cleaned.csv', index=False)\n",
"tagged_orignal_df.to_csv('../data/tagged_original_cleaned.csv', index=False)\n",
"# Author's note: the saved files were checked by hand for malformed rows and column count.\n",
"print(f\"Boosted saved: {len(tagged_boosted_df):,} rows\")\n",
"print(f\"Original saved: {len(tagged_orignal_df):,} rows\")"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "7e814adb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=========== Boosted ===========\n",
"bug_report 0\n",
"feature_request 0\n",
"aspect 0\n",
"aspect_sentiment 0\n",
"dtype: int64\n",
"bug_report int64\n",
"feature_request int64\n",
"aspect int64\n",
"aspect_sentiment int64\n",
"dtype: object\n",
" bug_report feature_request aspect aspect_sentiment\n",
"0 0 0 5 2\n",
"1 0 0 1 0\n",
"2 1 0 0 2\n",
"\n",
"=========== Original ===========\n",
"bug_report 0\n",
"feature_request 0\n",
"aspect 0\n",
"aspect_sentiment 0\n",
"dtype: int64\n",
"bug_report int64\n",
"feature_request int64\n",
"aspect int64\n",
"aspect_sentiment int64\n",
"dtype: object\n",
" bug_report feature_request aspect aspect_sentiment\n",
"0 0 0 5 0\n",
"1 0 0 2 0\n",
"2 0 0 2 0\n"
]
}
],
"source": [
"# Encode the label columns as integers; the same mappings are used for both datasets.\n",
"# Earlier cells call the original frame `tagged_orignal_df` (a typo, but used consistently,\n",
"# which is why they ran). From here on both frames are reloaded from the cleaned CSVs and the\n",
"# original one gets the corrected name `tagged_original_df`.\n",
"tagged_original_df = pd.read_csv('../data/tagged_original_cleaned.csv')\n",
"tagged_boosted_df = pd.read_csv('../data/tagged_boosted_cleaned.csv')\n",
"\n",
"# mappings\n",
"binary_map = {'Yes': 1, 'No': 0}\n",
"aspect_map = {'App': 0, 'Driver': 1, 'General': 2, 'Payment': 3, 'Pricing': 4, 'Service': 5}\n",
"sentiment_map = {'Positive': 0, 'Neutral': 1, 'Negative': 2}\n",
"\n",
"# Which mapping applies to which column; the key order is also the order used for reporting.\n",
"label_maps = {\n",
"    'bug_report': binary_map,\n",
"    'feature_request': binary_map,\n",
"    'aspect': aspect_map,\n",
"    'aspect_sentiment': sentiment_map,\n",
"}\n",
"label_cols = list(label_maps)\n",
"\n",
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
"    for col, mapping in label_maps.items():\n",
"        # A later cell writes the encoded frames back to these same CSV paths, so on a re-run\n",
"        # the columns are already numeric. Mapping them again would turn every value into NaN.\n",
"        if pd.api.types.is_numeric_dtype(df[col]):\n",
"            continue\n",
"        encoded = df[col].map(mapping)\n",
"        # Series.map yields NaN for anything that is not a key of the mapping; stop here\n",
"        # rather than carry silently missing labels forward.\n",
"        unmapped = sorted(df.loc[encoded.isnull(), col].astype(str).unique())\n",
"        if unmapped:\n",
"            raise ValueError(f\"{name}: values in '{col}' with no mapping: {unmapped}\")\n",
"        df[col] = encoded.astype('int64')\n",
"\n",
"for df, name in [(tagged_boosted_df, 'Boosted'), (tagged_original_df, 'Original')]:\n",
"\n",
"    # Verification after mapping\n",
"    print(f\"\\n=========== {name} ===========\")\n",
"    print(df[label_cols].isnull().sum())\n",
"    print(df[label_cols].dtypes)\n",
"    print(df[label_cols].head(3))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "093432db",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Boosted saved: 4,989 rows\n",
"Original saved: 4,999 rows\n"
]
}
],
"source": [
"# Write both frames back to the cleaned CSV paths in ../data (this replaces the files that\n",
"# the mapping cell above read), then report how many rows each frame holds.\n",
"outputs = (\n",
"    (\"Boosted\", tagged_boosted_df, '../data/tagged_boosted_cleaned.csv'),\n",
"    (\"Original\", tagged_original_df, '../data/tagged_original_cleaned.csv'),\n",
")\n",
"for _, frame, csv_path in outputs:\n",
"    frame.to_csv(csv_path, index=False)\n",
"\n",
"for label, frame, _ in outputs:\n",
"    print(f\"{label} saved: {len(frame):,} rows\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe8fb12c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1]\n",
"[0 1]\n",
"\n",
"../data/tagged_boosted_cleaned.csv\n",
" Expected columns: 8\n",
" Malformed rows: 0\n",
"\n",
"../data/tagged_original_cleaned.csv\n",
" Expected columns: 8\n",
" Malformed rows: 0\n"
]
}
],
"source": [
"import csv\n",
"\n",
"print(tagged_boosted_df['bug_report'].unique())\n",
"print(tagged_original_df['bug_report'].unique())\n",
"\n",
"# CSV structure checks\n",
"# The saved output of this cell lists, per file, an expected column count and a number of\n",
"# malformed rows, but the code that printed it was missing. It is reconstructed here from that\n",
"# output (TODO confirm against the author's version): the header row sets the expected field\n",
"# count, and a data row is malformed when its field count differs.\n",
"for csv_path in ('../data/tagged_boosted_cleaned.csv', '../data/tagged_original_cleaned.csv'):\n",
"    with open(csv_path, newline='', encoding='utf-8') as handle:\n",
"        reader = csv.reader(handle)\n",
"        header = next(reader, [])  # an empty file yields an empty header instead of raising\n",
"        malformed_rows = sum(1 for row in reader if len(row) != len(header))\n",
"    print(f\"\\n{csv_path}\")\n",
"    print(f\"  Expected columns: {len(header)}\")\n",
"    print(f\"  Malformed rows: {malformed_rows}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d914c7b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "multitag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,189 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "3203f7f9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review rating word_count \\\n",
"0 their have many problem but also best service 5 8 \n",
"1 it's excellent i loved it thank you uber 5 8 \n",
"2 it does the job as it should be, in a nice way! 5 12 \n",
"3 i support my family members with the help of uber 5 10 \n",
"4 it's good bt it is only.for 1 man or woman 5 10 \n",
"\n",
" tagged feature_request bug_report aspect aspect_sentiment \n",
"0 1 No No Service Positive \n",
"1 1 No No General Positive \n",
"2 1 No No General Positive \n",
"3 1 No No General Positive \n",
"4 1 Yes No General Positive \n",
"review object\n",
"rating int64\n",
"word_count int64\n",
"tagged int64\n",
"feature_request object\n",
"bug_report object\n",
"aspect object\n",
"aspect_sentiment object\n",
"dtype: object\n",
" count percentage\n",
"rating \n",
"1 1325 26.51\n",
"2 195 3.90\n",
"3 235 4.70\n",
"4 390 7.80\n",
"5 2854 57.09\n"
]
}
],
"source": [
"# verify distribution of ratings before model training\n",
"\n",
"# The ratings of the tagged sample are distributed pretty much like those of the original dataset,\n",
"# so model training can proceed without concern that the tagging process altered the rating distribution.\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"../data/uber_reviews_taggedOriginal.csv\")\n",
"for preview in (df.head(), df.dtypes):\n",
"    print(preview)\n",
"\n",
"# Per-rating row counts and their share in percent, both ordered by rating value.\n",
"rating_counts = df[\"rating\"].value_counts().sort_index()\n",
"rating_percent = df[\"rating\"].value_counts(normalize=True).sort_index().mul(100)\n",
"rating_dist = pd.concat(\n",
"    [rating_counts.rename(\"count\"), rating_percent.round(2).rename(\"percentage\")],\n",
"    axis=1,\n",
")\n",
"print(rating_dist)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ae6a3737",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" review word_count rating \\\n",
"0 \"\"*the worst customer care and worst transport... 51 NaN \n",
"1 guy was excellent give him q raise! 7 1.0 \n",
"2 \"\"poor service provider company, i have an err... 50 NaN \n",
"3 \"\"this app did not let me schedule a ride for ... 99 NaN \n",
"4 \"\"worst app.always high prices and drivers alw... 25 NaN \n",
"\n",
" tagged feature_request bug_report aspect aspect_sentiment \n",
"0 1 No No Service Negative \n",
"1 1 No No Driver Positive \n",
"2 1 No Yes App Negative \n",
"3 1 No Yes App Negative \n",
"4 1 No No Pricing Negative \n",
"review object\n",
"word_count int64\n",
"rating float64\n",
"tagged int64\n",
"feature_request object\n",
"bug_report object\n",
"aspect object\n",
"aspect_sentiment object\n",
"dtype: object\n",
" count percentage\n",
"rating \n",
"1.0 2118 43.11\n",
"2.0 703 14.31\n",
"3.0 697 14.19\n",
"4.0 606 12.33\n",
"5.0 789 16.06\n"
]
}
],
"source": [
"# verify distribution of ratings before model training\n",
"\n",
"# Initial expectation: same distribution as the keyword-boosted sample, i.e. heavily negative (~80% one star).\n",
"# Correction: this file is a custom sample of the boosted data, chosen so that it isn't heavily skewed.\n",
"\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"../data/uber_reviews_ISboostedAndFixed.csv\")\n",
"print(df.head())\n",
"print(df.dtypes)\n",
"\n",
"# Per-rating row counts and their share in percent, both ordered by rating value.\n",
"rating_column = df[\"rating\"]\n",
"rating_counts = rating_column.value_counts().sort_index()\n",
"rating_percent = rating_column.value_counts(normalize=True).sort_index() * 100\n",
"rating_dist = rating_counts.to_frame(\"count\").assign(percentage=rating_percent.round(2))\n",
"print(rating_dist)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d7c2da78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" rating\n",
"0 1\n",
"1 5\n",
"2 5\n",
"3 5\n",
"4 5\n",
"rating int64\n",
"dtype: object\n",
" count percentage\n",
"rating \n",
"1 283895 26.54\n",
"2 41707 3.90\n",
"3 49928 4.67\n",
"4 82953 7.76\n",
"5 611133 57.14\n"
]
}
],
"source": [
"# check distribution of ratings for raw dataset\n",
"import pandas as pd\n",
"\n",
"# Only the rating column is read from the raw file.\n",
"df = pd.read_csv(\"../data/uber_reviews.csv\", usecols=[\"rating\"])\n",
"print(df.head())\n",
"print(df.dtypes)\n",
"\n",
"# Per-rating row counts and their share in percent, both ordered by rating value.\n",
"rating_share = df[\"rating\"].value_counts(normalize=True)\n",
"rating_counts = df[\"rating\"].value_counts().sort_index()\n",
"rating_percent = (rating_share * 100).sort_index()\n",
"rating_dist = pd.DataFrame(dict(count=rating_counts, percentage=rating_percent.round(2)))\n",
"print(rating_dist)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "multitag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,4 @@
# Label-to-integer mappings for the tagged review columns.

# Yes/No flags.
binary_map = {
    'Yes': 1,
    'No': 0,
}

# Aspect labels.
aspect_map = {
    'App': 0,
    'Driver': 1,
    'General': 2,
    'Payment': 3,
    'Pricing': 4,
    'Service': 5,
}

# Sentiment labels.
sentiment_map = {
    'Positive': 0,
    'Neutral': 1,
    'Negative': 2,
}