House Cleaning

This commit is contained in:
charlie-rasberry
2026-01-28 16:41:27 +00:00
parent 6cf36faf64
commit 8d3dee6d30
10 changed files with 150 additions and 483 deletions

View File

@@ -0,0 +1,457 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"id": "470fe7c6-1614-4daf-879f-e6c399117c7b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "afe1168c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cwd: c:\\Users\\ch\\6013\\notebooks\n",
"exists data: True\n"
]
}
],
"source": [
"import os\n",
"print(\"cwd:\", os.getcwd())\n",
"print(\"exists data:\", os.path.exists(\"../data/\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee",
"metadata": {},
"outputs": [],
"source": [
"\n",
"df = pd.read_csv('../data/uber_reviews.csv', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>source</th>\n",
" <th>review_id</th>\n",
" <th>user_name</th>\n",
" <th>review_title</th>\n",
" <th>review_description</th>\n",
" <th>rating</th>\n",
" <th>thumbs_up</th>\n",
" <th>review_date</th>\n",
" <th>developer_response</th>\n",
" <th>developer_response_date</th>\n",
" <th>appVersion</th>\n",
" <th>laguage_code</th>\n",
" <th>country_code</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Google Play</td>\n",
" <td>18d6584c-d0e9-4833-a744-f607058aee97</td>\n",
" <td>Milky Way</td>\n",
" <td>NaN</td>\n",
" <td>Suddenly, the driver can't have my location an...</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>2023-08-10 17:48:51</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>en</td>\n",
" <td>in</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Google Play</td>\n",
" <td>50a08f18-cece-4ddf-b617-028844c8aa28</td>\n",
" <td>Bradlee Severa</td>\n",
" <td>NaN</td>\n",
" <td>Very cordial.. And helped with a quick turnaro...</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>2023-08-10 17:38:35</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.485.10000</td>\n",
" <td>en</td>\n",
" <td>in</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Google Play</td>\n",
" <td>b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7</td>\n",
" <td>Amit Aggarwal</td>\n",
" <td>NaN</td>\n",
" <td>Very good experience</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>2023-08-10 17:38:17</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.486.10002</td>\n",
" <td>en</td>\n",
" <td>in</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Google Play</td>\n",
" <td>502702a9-25ed-4373-a96c-7fa1f06caacd</td>\n",
" <td>Bryant Inman</td>\n",
" <td>NaN</td>\n",
" <td>All I use</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>2023-08-10 17:37:45</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.467.10008</td>\n",
" <td>en</td>\n",
" <td>in</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Google Play</td>\n",
" <td>f47a3fb6-23db-49bd-9e63-f33c8d724d07</td>\n",
" <td>Addie Whittaker</td>\n",
" <td>NaN</td>\n",
" <td>I have enjoyed traveling by Uber my drivers ha...</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>2023-08-10 17:36:56</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.486.10002</td>\n",
" <td>en</td>\n",
" <td>in</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" source review_id user_name \\\n",
"0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n",
"1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n",
"2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n",
"3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n",
"4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n",
"\n",
" review_title review_description rating \\\n",
"0 NaN Suddenly, the driver can't have my location an... 1 \n",
"1 NaN Very cordial.. And helped with a quick turnaro... 5 \n",
"2 NaN Very good experience 5 \n",
"3 NaN All I use 5 \n",
"4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n",
"\n",
" thumbs_up review_date developer_response developer_response_date \\\n",
"0 0.0 2023-08-10 17:48:51 NaN NaN \n",
"1 0.0 2023-08-10 17:38:35 NaN NaN \n",
"2 0.0 2023-08-10 17:38:17 NaN NaN \n",
"3 0.0 2023-08-10 17:37:45 NaN NaN \n",
"4 0.0 2023-08-10 17:36:56 NaN NaN \n",
"\n",
" appVersion laguage_code country_code \n",
"0 NaN en in \n",
"1 4.485.10000 en in \n",
"2 4.486.10002 en in \n",
"3 4.467.10008 en in \n",
"4 4.486.10002 en in "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "5c02ec54-4583-4720-88c6-1110b52c3f88",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rating\n",
"1 283895\n",
"2 41707\n",
"3 49928\n",
"4 82953\n",
"5 611133\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"print(df['rating'].value_counts().sort_index())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Review length stats:\n",
" Mean: 13.1 words\n",
" Median: 4.0 words\n",
" Min: 1.0 words\n",
" Max: 755.0 words\n"
]
}
],
"source": [
"df['word_count'] = df['review_description'].str.split().str.len()\n",
"print('Review length stats:')\n",
"print(f\" Mean: {df['word_count'].mean():.1f} words\")\n",
"print(f\" Median: {df['word_count'].median():.1f} words\")\n",
"print(f\" Min: {df['word_count'].min()} words\")\n",
"print(f\" Max: {df['word_count'].max()} words\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4",
"metadata": {},
"outputs": [],
"source": [
"short_reviews = df[df['word_count'] < 5]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "55324c94-4944-4844-b00e-dc08c8989f7b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Reviews < 5 words: 569632 (53.3%)\n"
]
}
],
"source": [
"print(f\"\\nReviews < 5 words: {len(short_reviews)} ({len(short_reviews)/len(df)*100:.1f}%)\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c45959fe-3e23-4831-a41a-94c89892247f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Missing values:\n",
"source 0\n",
"review_id 0\n",
"user_name 1\n",
"review_title 1067436\n",
"review_description 169\n",
"rating 0\n",
"thumbs_up 2180\n",
"review_date 0\n",
"developer_response 871352\n",
"developer_response_date 872338\n",
"appVersion 241548\n",
"laguage_code 0\n",
"country_code 0\n",
"word_count 169\n",
"dtype: int64\n"
]
}
],
"source": [
"print(f\"\\nMissing values:\")\n",
"print(df.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Duplicate reviews: 422458\n"
]
}
],
"source": [
"print(f\"Duplicate reviews: {df.duplicated(subset=['review_description']).sum()}\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8ccc07fa-9913-4047-ae17-35d2454eb059",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"==========================================\n",
"1 STAR REVIEWS:\n",
"==========================================\n",
"\n",
"Driver come very late than you can't do anthing pay and wait. Very bad service you can't cancil your when driver not come. Ubar charge money without service. This is very bad bad bad. So take local au...\n",
"(Length: 47.0 words)\n",
"\n",
"I have uninstalled and reinstalled the app around 5 times over the course of the past 3 days. Every time I try to use the app, I get stuck in and endless reCaptcha loop. (I enter my phone number, solv...\n",
"(Length: 66.0 words)\n",
"\n",
"Thieves. Sent an Uber to my house in the middle of the night and wouldn't refund.\n",
"(Length: 16.0 words)\n",
"\n",
"==========================================\n",
"2 STAR REVIEWS:\n",
"==========================================\n",
"\n",
"Your app is required to much space\n",
"(Length: 7.0 words)\n",
"\n",
"I'm very disappointed. At first,I used Uber because it was far better than regular taxi. But I stopped using it because the application is very heavy and the drivers rarely reached my pinned locatio...\n",
"(Length: 107.0 words)\n",
"\n",
"nowhere to leave a tip!\n",
"(Length: 5.0 words)\n",
"\n",
"==========================================\n",
"3 STAR REVIEWS:\n",
"==========================================\n",
"\n",
"اوبر المدينة احيانا كويس .. بس لما يكون السائق باخر ملك ربي و تنتظر 14 دقيقه و بعدين يلغي و تصير دخلت بوقت الذروة المفروض يكون في تعويض .. زي لما تلغي انت .\n",
"(Length: 34.0 words)\n",
"\n",
"Good application\n",
"(Length: 2.0 words)\n",
"\n",
"Toooslooow\n",
"(Length: 1.0 words)\n",
"\n",
"==========================================\n",
"4 STAR REVIEWS:\n",
"==========================================\n",
"\n",
"Help full\n",
"(Length: 2.0 words)\n",
"\n",
"Won't allow me to change my payment details. Update: Problem solved.\n",
"(Length: 11.0 words)\n",
"\n",
"Very good\n",
"(Length: 2.0 words)\n",
"\n",
"==========================================\n",
"5 STAR REVIEWS:\n",
"==========================================\n",
"\n",
"Good driving skills\n",
"(Length: 3.0 words)\n",
"\n",
"Lovery\n",
"(Length: 1.0 words)\n",
"\n",
"Excellent experience\n",
"(Length: 2.0 words)\n"
]
}
],
"source": [
"for rating in [1, 2, 3, 4, 5]:\n",
" samples = df[df['rating'] == rating].sample(min(3, len(df[df['rating'] == rating])))\n",
" print(f\"\\n{'='*42}\")\n",
" print(f\"{rating} STAR REVIEWS:\")\n",
" print(f\"{'='*42}\")\n",
" for idx, row in samples.iterrows():\n",
" review_text = row['review_description']\n",
" print(f\"\\n{review_text[:200]}{'...' if len(review_text) > 200 else ''}\")\n",
" print(f\"(Length: {row['word_count']} words)\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "multitag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,266 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "739e61bf",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"from langdetect import detect, LangDetectException\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d9da1b98",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>rating</th>\n",
" <th>word_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>suddenly, the driver can't have my location an...</td>\n",
" <td>1</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>very cordial. and helped with a quick turnarou...</td>\n",
" <td>5</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>i have enjoyed traveling by uber my drivers ha...</td>\n",
" <td>5</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>app is good but main problem is the drivers ca...</td>\n",
" <td>1</td>\n",
" <td>23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>very bad experience no customer service</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review rating word_count\n",
"0 suddenly, the driver can't have my location an... 1 23\n",
"1 very cordial. and helped with a quick turnarou... 5 11\n",
"2 i have enjoyed traveling by uber my drivers ha... 5 23\n",
"3 app is good but main problem is the drivers ca... 1 23\n",
"4 very bad experience no customer service 1 6"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('../data/uber_reviews_cleaned.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "91dc1d9a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.int64(6740)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(df['word_count'] > 100).sum()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "827b6435",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Max length review:\n",
"i've been using uber for a few years now and for the most part haven't had any problems the only problem i have ever had that was never resolved or explained to me was when i was refunded cancellation fee because the driver pulled into me and then literally looked in my face and drove away and canceled the ride.buy still charges me.a cancellation fee the problem wasn't being charged the fee because i was reimbursed a few minutes later the problem was that they gave me a $5 uber credit and i don't have a debit card so i use uber gift cards so i had a balance on the gift card of roughly $4.85 and is an uber credit of $5 but for some reason you can't combine them to use on a single gel ride and there is a minimum of i think $6 or $7 for a ride so i was never able to use the money i was reimbursed and the remaing balance on my gift card was not enough for a full ride and that was 4 months ago and still have not been able to get a response as to how i can use the funds remaining mn my uber account or the uber credits i was reimbursed. another time the driver took between 7-10 minutes to set the destination and begin driving and then when i was trying to help her with the directions because the gps was bringing us rather far out of the way to reach out destination she was very rude with me and then when we were approaching the turn that we needed to make and i warned her that it was coming up shortly and then said a few more times as we got closer to it she drove passed it the reason i was making sure she was aware of this was because we were in a rather busy highway and she would not be able to turn again for quite some time and i was already cutting it close to being late for work due to the delay in starting the trip and her failure to follow the direction i was giving her to get to.our destination so i said pull over into.the gas station and i will walk it'll be faster then driving at this point but she failed to pull into parking lot she just stopped in the middle.lf the highway granted we were in the right lane but she still put us both in a very dangerous situation and we were in a bad section of newark nj at around 10pm which for most people would put them in an uncomfortable position seeing how as the crime rate is extremely high in this area luckily i'm familiar with the residence of the neighborhood because i lived there for a few years in the past and then she notified uber that i was a disgruntled passenger and made her nervous from my reaction and in was warned to not let anything like this happen again. and the final problem.i had with uber was when my mother had ordered me.an uber through her account and during the ride we some.how started to discuss religion and i'm far from a religious person i do not even consider myself a member of any type of religion at all. but my driver was very dedicated to the religion he practiced so he got very upset and decided to call mother and tell her that i didn't not go the full distance of the original route he was hired to drive and that he had dropped me off a few blocks from where j was going to judge in some illegal activities (he knew i had some personal issues i was overcoming recently because i had mention it to him briefly in the beginning of the trip) and that he was not going to charge her the full ride amount because i did. ot cimplete the full distance and this caused my mother to become very upset with me and called me extremely untilni sent her a picture showing that i was in fact where i said i was going and the app showed that he did drop me off at the predetermined destination. sorry for the long post but out of the probably 100 or so ride i have used with uber those were the only 3 problems i have ever had and they all 3 were actually in the same week\n"
]
}
],
"source": [
"print(f\"Max length review:\\n{df.loc[df['word_count'].idxmax(), 'review']}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7a811e3d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.int64(2839)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def has_spam_repetition(text):\n",
" return bool(re.search(r'(.)\\1{4,}', str(text)))\n",
"df['review'].apply(has_spam_repetition).sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0a550434",
"metadata": {},
"outputs": [],
"source": [
"sample_check = df[df['word_count'] >= 10].sample(1000)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ec7b2ec5",
"metadata": {},
"outputs": [],
"source": [
"def detect_language(text):\n",
" \"\"\"Detect language of text\"\"\"\n",
" try:\n",
" if pd.isna(text) or len(str(text).strip()) < 10:\n",
" return 'unknown'\n",
" return detect(str(text))\n",
" except LangDetectException:\n",
" return 'unknown'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d68dac67",
"metadata": {},
"outputs": [],
"source": [
"sample_check['lang'] = sample_check['review'].apply(detect_language)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9a8a49b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Language distribution in 10+ word reviews:\n",
"lang\n",
"en 956\n",
"es 7\n",
"pt 7\n",
"ar 4\n",
"id 3\n",
"da 3\n",
"bn 3\n",
"af 3\n",
"it 2\n",
"tl 2\n",
"tr 2\n",
"fr 1\n",
"ro 1\n",
"et 1\n",
"sv 1\n",
"nl 1\n",
"hi 1\n",
"pl 1\n",
"ta 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"print(f\"\\nLanguage distribution in 10+ word reviews:\")\n",
"print(sample_check['lang'].value_counts())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "multitag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}