{ "cells": [ { "cell_type": "markdown", "id": "f4474e0f", "metadata": {}, "source": [ "# Preprocessing Requirements\n", "## RECLASS\n", "\n", "**Purpose**: Ensure samples are consistent with the original dataset and find issues with current sampling/preprocessing methods.\n", "\n", "**Dataset**: Uber Customer Reviews from Google Play (Kaggle)\n", "\n", "---" ] }, { "cell_type": "code", "execution_count": 23, "id": "470fe7c6-1614-4daf-879f-e6c399117c7b", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 24, "id": "afe1168c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cwd: c:\\Users\\ch\\6013\\notebooks\n", "exists data: True\n" ] } ], "source": [ "# The data path is relative, so show the kernel's working directory and confirm\n", "# the data folder is reachable from it before loading anything.\n", "DATA_DIR = \"../data/\"\n", "print(\"cwd:\", os.getcwd())\n", "print(\"exists data:\", os.path.exists(DATA_DIR))\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee", "metadata": {}, "outputs": [], "source": [ "# low_memory=False makes pandas parse the file in one pass instead of in chunks,\n", "# so each column's dtype is inferred from all of its values.\n", "df = pd.read_csv(os.path.join(DATA_DIR, 'uber_reviews.csv'), low_memory=False)" ] }, { "cell_type": "code", "execution_count": 26, "id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Play18d6584c-d0e9-4833-a744-f607058aee97Milky WayNaNSuddenly, the driver can't have my location an...10.02023-08-10 17:48:51NaNNaNNaNenin
1Google Play50a08f18-cece-4ddf-b617-028844c8aa28Bradlee SeveraNaNVery cordial.. And helped with a quick turnaro...50.02023-08-10 17:38:35NaNNaN4.485.10000enin
2Google Playb0d8e75a-80a7-4dcd-abaf-72b046dbeeb7Amit AggarwalNaNVery good experience50.02023-08-10 17:38:17NaNNaN4.486.10002enin
3Google Play502702a9-25ed-4373-a96c-7fa1f06caacdBryant InmanNaNAll I use50.02023-08-10 17:37:45NaNNaN4.467.10008enin
4Google Playf47a3fb6-23db-49bd-9e63-f33c8d724d07Addie WhittakerNaNI have enjoyed traveling by Uber my drivers ha...50.02023-08-10 17:36:56NaNNaN4.486.10002enin
\n", "
" ], "text/plain": [ " source review_id user_name \\\n", "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n", "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n", "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n", "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n", "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n", "\n", " review_title review_description rating \\\n", "0 NaN Suddenly, the driver can't have my location an... 1 \n", "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n", "2 NaN Very good experience 5 \n", "3 NaN All I use 5 \n", "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n", "\n", " thumbs_up review_date developer_response developer_response_date \\\n", "0 0.0 2023-08-10 17:48:51 NaN NaN \n", "1 0.0 2023-08-10 17:38:35 NaN NaN \n", "2 0.0 2023-08-10 17:38:17 NaN NaN \n", "3 0.0 2023-08-10 17:37:45 NaN NaN \n", "4 0.0 2023-08-10 17:36:56 NaN NaN \n", "\n", " appVersion laguage_code country_code \n", "0 NaN en in \n", "1 4.485.10000 en in \n", "2 4.486.10002 en in \n", "3 4.467.10008 en in \n", "4 4.486.10002 en in " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First rows, to check column names and value formats.\n", "df.head(n=5)" ] }, { "cell_type": "code", "execution_count": 27, "id": "5c02ec54-4583-4720-88c6-1110b52c3f88", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rating\n", "1 283895\n", "2 41707\n", "3 49928\n", "4 82953\n", "5 611133\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# Class balance: number of reviews per star rating, ordered by rating.\n", "rating_counts = df['rating'].value_counts().sort_index()\n", "print(rating_counts)" ] }, { "cell_type": "code", "execution_count": 28, "id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Review length stats:\n", " Mean: 13.1 words\n", " Median: 4.0 words\n", " Min: 1.0 words\n", " Max: 755.0 words\n" ] } ], "source": [ "df['word_count'] = 
df['review_description'].str.split().str.len()\n", "word_counts = df['word_count']\n", "print('Review length stats:')\n", "print(f\" Mean: {word_counts.mean():.1f} words\")\n", "print(f\" Median: {word_counts.median():.1f} words\")\n", "print(f\" Min: {word_counts.min()} words\")\n", "print(f\" Max: {word_counts.max()} words\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4", "metadata": {}, "outputs": [], "source": [ "# Reviews with fewer words than this are treated as \"short\".\n", "SHORT_REVIEW_WORDS = 5\n", "short_reviews = df[df['word_count'] < SHORT_REVIEW_WORDS]" ] }, { "cell_type": "code", "execution_count": 30, "id": "55324c94-4944-4844-b00e-dc08c8989f7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Reviews < 5 words: 569632 (53.3%)\n" ] } ], "source": [ "# Share of all rows (including rows with missing text) that are short reviews.\n", "short_share = len(short_reviews) / len(df) * 100\n", "print(f\"\\nReviews < {SHORT_REVIEW_WORDS} words: {len(short_reviews)} ({short_share:.1f}%)\")" ] }, { "cell_type": "code", "execution_count": 31, "id": "c45959fe-3e23-4831-a41a-94c89892247f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Missing values:\n", "source 0\n", "review_id 0\n", "user_name 1\n", "review_title 1067436\n", "review_description 169\n", "rating 0\n", "thumbs_up 2180\n", "review_date 0\n", "developer_response 871352\n", "developer_response_date 872338\n", "appVersion 241548\n", "laguage_code 0\n", "country_code 0\n", "word_count 169\n", "dtype: int64\n" ] } ], "source": [ "print(\"\\nMissing values:\")\n", "print(df.isnull().sum())" ] }, { "cell_type": "code", "execution_count": null, "id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa", "metadata": {}, "outputs": [], "source": [ "# Count repeated review texts among rows that actually have text. pandas\n", "# treats missing values as equal to each other when flagging duplicates, so\n", "# leaving them in would report rows with no review text as duplicate reviews.\n", "review_texts = df['review_description'].dropna()\n", "print(f\"Duplicate reviews: {review_texts.duplicated().sum()}\")" ] }, { "cell_type": "code", "execution_count": 33, "id": "8ccc07fa-9913-4047-ae17-35d2454eb059", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", 
"==========================================\n", "1 STAR REVIEWS:\n", "==========================================\n", "\n", "Many times driver cancelled the ride only because he didn't wanted to go to my destination and I was supposed to pay the cancellation fees without any reason.\n", "(Length: 28.0 words)\n", "\n", "Their drivers are always finding new ways to outsmart customers.When Uber started initally, it was a pleasure to use their services.Now either when you book it says a few minutes n the moment you conf...\n", "(Length: 98.0 words)\n", "\n", "terrible GPS system. takes you the long way everywhere. seriously, Waze, google maps and pretty much every other GPS shows faster routes. please fix this.\n", "(Length: 25.0 words)\n", "\n", "==========================================\n", "2 STAR REVIEWS:\n", "==========================================\n", "\n", "no helpline number, customer is unable to contect in case of emegrncy\n", "(Length: 12.0 words)\n", "\n", "Ghaantaa tum threk nhi kr skte u r just lying\n", "(Length: 10.0 words)\n", "\n", "Nice application 😘😘\n", "(Length: 3.0 words)\n", "\n", "==========================================\n", "3 STAR REVIEWS:\n", "==========================================\n", "\n", "The app is good but I got charged for a cancelation because the driver was going to make me walk a block to go to him... what's the point in the app if I have to go to them\n", "(Length: 39.0 words)\n", "\n", "Final amount to pay in cash doesn't always appear correct on app. You can't challenge the cost or question it. Example toll. They over charged by 60% of original cost and won't review it properly. Whe...\n", "(Length: 59.0 words)\n", "\n", "Location of the driver's car is not updated properly . I'm using android, and the location is keep being update all the time . 
Please fix this problem .\n", "(Length: 29.0 words)\n", "\n", "==========================================\n", "4 STAR REVIEWS:\n", "==========================================\n", "\n", "Good\n", "(Length: 1.0 words)\n", "\n", "I like that app 😍🙃\n", "(Length: 5.0 words)\n", "\n", "it is very difficult to contact the chief operator if there is any \n", "problem...we are not clear as to whom to contact if problem with uber driver\n", "(Length: 27.0 words)\n", "\n", "==========================================\n", "5 STAR REVIEWS:\n", "==========================================\n", "\n", "I had a great uber experience at kolkata good experience.\n", "(Length: 10.0 words)\n", "\n", "Nice\n", "(Length: 1.0 words)\n", "\n", "It's an awesome aap\n", "(Length: 4.0 words)\n" ] } ], "source": [ "# Print a few example reviews per star rating to eyeball text quality.\n", "SAMPLES_PER_RATING = 3\n", "PREVIEW_CHARS = 200  # longer reviews are truncated and suffixed with '...'\n", "SAMPLE_SEED = 42  # fixed so a fresh Restart & Run All shows the same reviews\n", "\n", "# Missing review text is NaN (a float), which cannot be sliced or passed to\n", "# len(); drop those rows before sampling so the preview never raises TypeError.\n", "reviews_with_text = df.dropna(subset=['review_description'])\n", "\n", "for rating in [1, 2, 3, 4, 5]:\n", "    rating_reviews = reviews_with_text[reviews_with_text['rating'] == rating]\n", "    # Never ask for more rows than exist for this rating.\n", "    n_samples = min(SAMPLES_PER_RATING, len(rating_reviews))\n", "    samples = rating_reviews.sample(n_samples, random_state=SAMPLE_SEED)\n", "    print(f\"\\n{'='*42}\")\n", "    print(f\"{rating} STAR REVIEWS:\")\n", "    print(f\"{'='*42}\")\n", "    for _, row in samples.iterrows():\n", "        review_text = row['review_description']\n", "        suffix = '...' if len(review_text) > PREVIEW_CHARS else ''\n", "        print(f\"\\n{review_text[:PREVIEW_CHARS]}{suffix}\")\n", "        print(f\"(Length: {row['word_count']} words)\")" ] } ], "metadata": { "kernelspec": { "display_name": "multitag", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 5 }