460 lines
14 KiB
Plaintext
460 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "470fe7c6-1614-4daf-879f-e6c399117c7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Read the raw review export. low_memory=False makes pandas parse the file in one pass\n",
"# instead of in chunks, which avoids mixed-dtype inference within a column.\n",
"df = pd.read_csv(\n",
"    'data/uber_reviews.csv',\n",
"    low_memory=False,\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>source</th>\n",
|
|
" <th>review_id</th>\n",
|
|
" <th>user_name</th>\n",
|
|
" <th>review_title</th>\n",
|
|
" <th>review_description</th>\n",
|
|
" <th>rating</th>\n",
|
|
" <th>thumbs_up</th>\n",
|
|
" <th>review_date</th>\n",
|
|
" <th>developer_response</th>\n",
|
|
" <th>developer_response_date</th>\n",
|
|
" <th>appVersion</th>\n",
|
|
" <th>laguage_code</th>\n",
|
|
" <th>country_code</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>18d6584c-d0e9-4833-a744-f607058aee97</td>\n",
|
|
" <td>Milky Way</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Suddenly, the driver can't have my location an...</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:48:51</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>50a08f18-cece-4ddf-b617-028844c8aa28</td>\n",
|
|
" <td>Bradlee Severa</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Very cordial.. And helped with a quick turnaro...</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:38:35</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.485.10000</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7</td>\n",
|
|
" <td>Amit Aggarwal</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Very good experience</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:38:17</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.486.10002</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>502702a9-25ed-4373-a96c-7fa1f06caacd</td>\n",
|
|
" <td>Bryant Inman</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>All I use</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:37:45</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.467.10008</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>f47a3fb6-23db-49bd-9e63-f33c8d724d07</td>\n",
|
|
" <td>Addie Whittaker</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>I have enjoyed traveling by Uber my drivers ha...</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:36:56</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.486.10002</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" source review_id user_name \\\n",
|
|
"0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n",
|
|
"1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n",
|
|
"2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n",
|
|
"3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n",
|
|
"4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n",
|
|
"\n",
|
|
" review_title review_description rating \\\n",
|
|
"0 NaN Suddenly, the driver can't have my location an... 1 \n",
|
|
"1 NaN Very cordial.. And helped with a quick turnaro... 5 \n",
|
|
"2 NaN Very good experience 5 \n",
|
|
"3 NaN All I use 5 \n",
|
|
"4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n",
|
|
"\n",
|
|
" thumbs_up review_date developer_response developer_response_date \\\n",
|
|
"0 0.0 2023-08-10 17:48:51 NaN NaN \n",
|
|
"1 0.0 2023-08-10 17:38:35 NaN NaN \n",
|
|
"2 0.0 2023-08-10 17:38:17 NaN NaN \n",
|
|
"3 0.0 2023-08-10 17:37:45 NaN NaN \n",
|
|
"4 0.0 2023-08-10 17:36:56 NaN NaN \n",
|
|
"\n",
|
|
" appVersion laguage_code country_code \n",
|
|
"0 NaN en in \n",
|
|
"1 4.485.10000 en in \n",
|
|
"2 4.486.10002 en in \n",
|
|
"3 4.467.10008 en in \n",
|
|
"4 4.486.10002 en in "
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Preview the first five rows of the loaded reviews.\n",
"df.head(n=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "5c02ec54-4583-4720-88c6-1110b52c3f88",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"rating\n",
|
|
"1 283895\n",
|
|
"2 41707\n",
|
|
"3 49928\n",
|
|
"4 82953\n",
|
|
"5 611133\n",
|
|
"Name: count, dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Number of reviews for each rating value, ordered by the rating itself.\n",
"rating_counts = df['rating'].value_counts()\n",
"print(rating_counts.sort_index())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Review length stats:\n",
|
|
" Mean: 13.1 words\n",
|
|
" Median: 4.0 words\n",
|
|
" Min: 1.0 words\n",
|
|
" Max: 755.0 words\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Words per review = number of whitespace-separated tokens in review_description.\n",
"review_tokens = df['review_description'].str.split()\n",
"df['word_count'] = review_tokens.str.len()\n",
"\n",
"# Summary statistics of the review length.\n",
"word_counts = df['word_count']\n",
"print('Review length stats:')\n",
"print(f\" Mean: {word_counts.mean():.1f} words\")\n",
"print(f\" Median: {word_counts.median():.1f} words\")\n",
"print(f\" Min: {word_counts.min()} words\")\n",
"print(f\" Max: {word_counts.max()} words\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Keep only the reviews whose word_count is below five.\n",
"is_short = df['word_count'].lt(5)\n",
"short_reviews = df.loc[is_short]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "55324c94-4944-4844-b00e-dc08c8989f7b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Reviews < 5 words: 569632 (53.3%)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Absolute number of short reviews and their share of all rows in df.\n",
"n_short = len(short_reviews)\n",
"short_share = n_short / len(df) * 100\n",
"print(f\"\\nReviews < 5 words: {n_short} ({short_share:.1f}%)\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "c45959fe-3e23-4831-a41a-94c89892247f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Missing values:\n",
|
|
"source 0\n",
|
|
"review_id 0\n",
|
|
"user_name 1\n",
|
|
"review_title 1067436\n",
|
|
"review_description 169\n",
|
|
"rating 0\n",
|
|
"thumbs_up 2180\n",
|
|
"review_date 0\n",
|
|
"developer_response 871352\n",
|
|
"developer_response_date 872338\n",
|
|
"appVersion 241548\n",
|
|
"laguage_code 0\n",
|
|
"country_code 0\n",
|
|
"word_count 169\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Per-column count of missing entries.\n",
"print(\"\\nMissing values:\")\n",
"print(df.isna().sum())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Duplicate reviews: 422458\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# A row counts as a duplicate when its review_description repeats an earlier row's text.\n",
"# NOTE(review): rows with a missing description appear to be counted too, since pandas\n",
"# treats NaN values as equal here - confirm whether that is intended.\n",
"is_repeat = df['review_description'].duplicated()\n",
"print(f\"Duplicate reviews: {is_repeat.sum()}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "8ccc07fa-9913-4047-ae17-35d2454eb059",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"==========================================\n",
|
|
"1 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"Once drivers confirm the rider its showing with in 2 minutes but they take 25 minutes more and if driver cancel the ride I pay for that in next ride it's redicules\n",
|
|
"(Length: 32.0 words)\n",
|
|
"\n",
|
|
"they charge very high as they show before the ride\n",
|
|
"(Length: 10.0 words)\n",
|
|
"\n",
|
|
"scam drivers, worst service\n",
|
|
"(Length: 4.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"2 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"Drivers always ask is destination and cancel if they dont want to go ? Cant they already see destination before accepting ride ?\n",
|
|
"(Length: 23.0 words)\n",
|
|
"\n",
|
|
"she hole her phone on her hand she driving 80\n",
|
|
"(Length: 10.0 words)\n",
|
|
"\n",
|
|
"I7u.6f هنه\n",
|
|
"(Length: 2.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"3 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"I think this app is very important to me\n",
|
|
"(Length: 9.0 words)\n",
|
|
"\n",
|
|
"Ok\n",
|
|
"(Length: 1.0 words)\n",
|
|
"\n",
|
|
"The rate will be one while booking and after the ride it changes if asked the reason is due to traffic, but it should be mentioned first only, the destination time is also calculated by uber only alon...\n",
|
|
"(Length: 53.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"4 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"Nice\n",
|
|
"(Length: 1.0 words)\n",
|
|
"\n",
|
|
"Good app but sometimes it take long time to get booking even the cabs are nearby and sometimes they even cancel the ride after confirming and making us wait for 30 mins and above\n",
|
|
"(Length: 34.0 words)\n",
|
|
"\n",
|
|
"its good and helpful.. Thank you\n",
|
|
"(Length: 6.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"5 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"good service\n",
|
|
"(Length: 2.0 words)\n",
|
|
"\n",
|
|
"Drivers have been getting us home quickly and effectively.\n",
|
|
"(Length: 9.0 words)\n",
|
|
"\n",
|
|
"Hbby\n",
|
|
"(Length: 1.0 words)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Print a few example reviews for every star rating, with a truncated preview and word count.\n",
"SAMPLES_PER_RATING = 3\n",
"PREVIEW_CHARS = 200\n",
"SAMPLE_SEED = 42  # fixed seed so Restart & Run All prints the same reviews every time\n",
"\n",
"# A missing review_description is NaN (a float); slicing it or calling len() on it raises\n",
"# TypeError, so only rows that actually have review text are eligible for sampling.\n",
"reviews_with_text = df.dropna(subset=['review_description'])\n",
"\n",
"for rating in [1, 2, 3, 4, 5]:\n",
"    # Filter once per rating and reuse the result for both the size check and the sample.\n",
"    rated = reviews_with_text[reviews_with_text['rating'] == rating]\n",
"    samples = rated.sample(min(SAMPLES_PER_RATING, len(rated)), random_state=SAMPLE_SEED)\n",
"    print(f\"\\n{'='*42}\")\n",
"    print(f\"{rating} STAR REVIEWS:\")\n",
"    print(f\"{'='*42}\")\n",
"    for _, row in samples.iterrows():\n",
"        review_text = row['review_description']\n",
"        print(f\"\\n{review_text[:PREVIEW_CHARS]}{'...' if len(review_text) > PREVIEW_CHARS else ''}\")\n",
"        # word_count is stored as a float column; format it without the trailing '.0'.\n",
"        print(f\"(Length: {row['word_count']:.0f} words)\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b12dcb89-d291-447a-98f3-02817dc0eb8e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "87a15e76-51c8-4586-acea-ca3176c18757",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "73c4bbb9-3f8e-4b4c-8538-539b140cf610",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|