474 lines
15 KiB
Plaintext
474 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f4474e0f",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Preprocessing Requirements\n",
|
|
"## RECLASS\n",
|
|
"\n",
|
|
"**Purpose**: Ensure samples are consistent with the original dataset and find issues with current sampling/preprocessing methods.\n",
|
|
"\n",
|
|
"**Dataset**: Uber Customer Reviews from Google Play (Kaggle)\n",
|
|
"\n",
|
|
"---"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "470fe7c6-1614-4daf-879f-e6c399117c7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "afe1168c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"cwd: c:\\Users\\ch\\6013\\notebooks\n",
|
|
"exists data: True\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
"\n",
"# Sanity check before loading anything: show the kernel's working directory\n",
"# and whether the relative ../data/ folder resolves from it.\n",
"data_folder = \"../data/\"\n",
"working_dir = os.getcwd()\n",
"print(\"cwd:\", working_dir)\n",
"print(\"exists data:\", os.path.exists(data_folder))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Read the whole review export at once; low_memory=False stops pandas from\n",
"# parsing in chunks, which can otherwise yield mixed-type columns.\n",
"reviews_csv = '../data/uber_reviews.csv'\n",
"df = pd.read_csv(reviews_csv, low_memory=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>source</th>\n",
|
|
" <th>review_id</th>\n",
|
|
" <th>user_name</th>\n",
|
|
" <th>review_title</th>\n",
|
|
" <th>review_description</th>\n",
|
|
" <th>rating</th>\n",
|
|
" <th>thumbs_up</th>\n",
|
|
" <th>review_date</th>\n",
|
|
" <th>developer_response</th>\n",
|
|
" <th>developer_response_date</th>\n",
|
|
" <th>appVersion</th>\n",
|
|
" <th>laguage_code</th>\n",
|
|
" <th>country_code</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>18d6584c-d0e9-4833-a744-f607058aee97</td>\n",
|
|
" <td>Milky Way</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Suddenly, the driver can't have my location an...</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:48:51</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>50a08f18-cece-4ddf-b617-028844c8aa28</td>\n",
|
|
" <td>Bradlee Severa</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Very cordial.. And helped with a quick turnaro...</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:38:35</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.485.10000</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7</td>\n",
|
|
" <td>Amit Aggarwal</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Very good experience</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:38:17</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.486.10002</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>502702a9-25ed-4373-a96c-7fa1f06caacd</td>\n",
|
|
" <td>Bryant Inman</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>All I use</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:37:45</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.467.10008</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Google Play</td>\n",
|
|
" <td>f47a3fb6-23db-49bd-9e63-f33c8d724d07</td>\n",
|
|
" <td>Addie Whittaker</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>I have enjoyed traveling by Uber my drivers ha...</td>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" <td>2023-08-10 17:36:56</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.486.10002</td>\n",
|
|
" <td>en</td>\n",
|
|
" <td>in</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" source review_id user_name \\\n",
|
|
"0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n",
|
|
"1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n",
|
|
"2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n",
|
|
"3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n",
|
|
"4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n",
|
|
"\n",
|
|
" review_title review_description rating \\\n",
|
|
"0 NaN Suddenly, the driver can't have my location an... 1 \n",
|
|
"1 NaN Very cordial.. And helped with a quick turnaro... 5 \n",
|
|
"2 NaN Very good experience 5 \n",
|
|
"3 NaN All I use 5 \n",
|
|
"4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n",
|
|
"\n",
|
|
" thumbs_up review_date developer_response developer_response_date \\\n",
|
|
"0 0.0 2023-08-10 17:48:51 NaN NaN \n",
|
|
"1 0.0 2023-08-10 17:38:35 NaN NaN \n",
|
|
"2 0.0 2023-08-10 17:38:17 NaN NaN \n",
|
|
"3 0.0 2023-08-10 17:37:45 NaN NaN \n",
|
|
"4 0.0 2023-08-10 17:36:56 NaN NaN \n",
|
|
"\n",
|
|
" appVersion laguage_code country_code \n",
|
|
"0 NaN en in \n",
|
|
"1 4.485.10000 en in \n",
|
|
"2 4.486.10002 en in \n",
|
|
"3 4.467.10008 en in \n",
|
|
"4 4.486.10002 en in "
|
|
]
|
|
},
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Preview the first five rows to check that the columns parsed as expected.\n",
"df.head(n=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "5c02ec54-4583-4720-88c6-1110b52c3f88",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"rating\n",
|
|
"1 283895\n",
|
|
"2 41707\n",
|
|
"3 49928\n",
|
|
"4 82953\n",
|
|
"5 611133\n",
|
|
"Name: count, dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Class balance: number of reviews per star rating, ordered by star value.\n",
"rating_counts = df['rating'].value_counts()\n",
"print(rating_counts.sort_index())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Review length stats:\n",
|
|
" Mean: 13.1 words\n",
|
|
" Median: 4.0 words\n",
|
|
" Min: 1.0 words\n",
|
|
" Max: 755.0 words\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Words per review, split on whitespace. Reviews with missing text stay NaN,\n",
"# which keeps the column float and leaves those rows out of the statistics.\n",
"df['word_count'] = df['review_description'].str.split().str.len()\n",
"\n",
"# One pass over the column instead of four separate reductions.\n",
"word_count_stats = df['word_count'].agg(['mean', 'median', 'min', 'max'])\n",
"\n",
"print('Review length stats:')\n",
"print(f\" Mean: {word_count_stats['mean']:.1f} words\")\n",
"print(f\" Median: {word_count_stats['median']:.1f} words\")\n",
"# Min/Max are whole word counts; format them without the float '.0' suffix\n",
"# that the NaN-induced float dtype would otherwise add.\n",
"print(f\" Min: {word_count_stats['min']:.0f} words\")\n",
"print(f\" Max: {word_count_stats['max']:.0f} words\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Keep only reviews shorter than five words; rows with no word count (NaN)\n",
"# compare as False and are therefore excluded.\n",
"is_short = df['word_count'].lt(5)\n",
"short_reviews = df[is_short]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"id": "55324c94-4944-4844-b00e-dc08c8989f7b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Reviews < 5 words: 569632 (53.3%)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Share of all rows in df that fall into short_reviews.\n",
"short_count = len(short_reviews)\n",
"short_share = short_count / len(df) * 100\n",
"print(f\"\\nReviews < 5 words: {short_count} ({short_share:.1f}%)\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"id": "c45959fe-3e23-4831-a41a-94c89892247f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Missing values:\n",
|
|
"source 0\n",
|
|
"review_id 0\n",
|
|
"user_name 1\n",
|
|
"review_title 1067436\n",
|
|
"review_description 169\n",
|
|
"rating 0\n",
|
|
"thumbs_up 2180\n",
|
|
"review_date 0\n",
|
|
"developer_response 871352\n",
|
|
"developer_response_date 872338\n",
|
|
"appVersion 241548\n",
|
|
"laguage_code 0\n",
|
|
"country_code 0\n",
|
|
"word_count 169\n",
|
|
"dtype: int64\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Number of missing entries in every column.\n",
"missing_per_column = df.isna().sum()\n",
"print(\"\\nMissing values:\")\n",
"print(missing_per_column)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Duplicate reviews: 422458\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Rows whose review text repeats an earlier row; the first occurrence of each\n",
"# text is not counted.\n",
"repeated_text = df.duplicated(subset='review_description', keep='first')\n",
"print(f\"Duplicate reviews: {repeated_text.sum()}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"id": "8ccc07fa-9913-4047-ae17-35d2454eb059",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"==========================================\n",
|
|
"1 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"Many times driver cancelled the ride only because he didn't wanted to go to my destination and I was supposed to pay the cancellation fees without any reason.\n",
|
|
"(Length: 28.0 words)\n",
|
|
"\n",
|
|
"Their drivers are always finding new ways to outsmart customers.When Uber started initally, it was a pleasure to use their services.Now either when you book it says a few minutes n the moment you conf...\n",
|
|
"(Length: 98.0 words)\n",
|
|
"\n",
|
|
"terrible GPS system. takes you the long way everywhere. seriously, Waze, google maps and pretty much every other GPS shows faster routes. please fix this.\n",
|
|
"(Length: 25.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"2 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"no helpline number, customer is unable to contect in case of emegrncy\n",
|
|
"(Length: 12.0 words)\n",
|
|
"\n",
|
|
"Ghaantaa tum threk nhi kr skte u r just lying\n",
|
|
"(Length: 10.0 words)\n",
|
|
"\n",
|
|
"Nice application 😘😘\n",
|
|
"(Length: 3.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"3 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"The app is good but I got charged for a cancelation because the driver was going to make me walk a block to go to him... what's the point in the app if I have to go to them\n",
|
|
"(Length: 39.0 words)\n",
|
|
"\n",
|
|
"Final amount to pay in cash doesn't always appear correct on app. You can't challenge the cost or question it. Example toll. They over charged by 60% of original cost and won't review it properly. Whe...\n",
|
|
"(Length: 59.0 words)\n",
|
|
"\n",
|
|
"Location of the driver's car is not updated properly . I'm using android, and the location is keep being update all the time . Please fix this problem .\n",
|
|
"(Length: 29.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"4 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"Good\n",
|
|
"(Length: 1.0 words)\n",
|
|
"\n",
|
|
"I like that app 😍🙃\n",
|
|
"(Length: 5.0 words)\n",
|
|
"\n",
|
|
"it is very difficult to contact the chief operator if there is any \n",
|
|
"problem...we are not clear as to whom to contact if problem with uber driver\n",
|
|
"(Length: 27.0 words)\n",
|
|
"\n",
|
|
"==========================================\n",
|
|
"5 STAR REVIEWS:\n",
|
|
"==========================================\n",
|
|
"\n",
|
|
"I had a great uber experience at kolkata good experience.\n",
|
|
"(Length: 10.0 words)\n",
|
|
"\n",
|
|
"Nice\n",
|
|
"(Length: 1.0 words)\n",
|
|
"\n",
|
|
"It's an awesome aap\n",
|
|
"(Length: 4.0 words)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Print a few random reviews per star rating for a manual sanity check.\n",
"SAMPLE_SEED = 42  # fixed seed: the same reviews are drawn on every run\n",
"SAMPLES_PER_RATING = 3\n",
"PREVIEW_CHARS = 200  # longer reviews are truncated and suffixed with '...'\n",
"DIVIDER = '=' * 42\n",
"\n",
"# Rows with missing text hold NaN (a float); slicing or len() on it raises\n",
"# TypeError, so those rows are removed before sampling.\n",
"reviews_with_text = df.dropna(subset=['review_description', 'word_count'])\n",
"\n",
"for rating in [1, 2, 3, 4, 5]:\n",
"    # Filter once per rating and reuse the result for both size and sampling.\n",
"    rating_rows = reviews_with_text[reviews_with_text['rating'] == rating]\n",
"    samples = rating_rows.sample(\n",
"        n=min(SAMPLES_PER_RATING, len(rating_rows)),\n",
"        random_state=SAMPLE_SEED,\n",
"    )\n",
"    print(f\"\\n{DIVIDER}\")\n",
"    print(f\"{rating} STAR REVIEWS:\")\n",
"    print(DIVIDER)\n",
"    for review_text, word_count in zip(samples['review_description'], samples['word_count']):\n",
"        preview = review_text[:PREVIEW_CHARS]\n",
"        suffix = '...' if len(review_text) > PREVIEW_CHARS else ''\n",
"        print(f\"\\n{preview}{suffix}\")\n",
"        # word_count is stored as float because of NaN rows; show it as an int.\n",
"        print(f\"(Length: {int(word_count)} words)\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "multitag",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|