{ "cells": [ { "cell_type": "code", "execution_count": 11, "id": "470fe7c6-1614-4daf-879f-e6c399117c7b", "metadata": {}, "outputs": [], "source": [
 "# All imports live in this first cell so a fresh kernel can run the notebook top to bottom.\n",
 "import os\n",
 "from pathlib import Path\n",
 "\n",
 "import numpy as np\n",
 "import pandas as pd"
 ] }, { "cell_type": "code", "execution_count": 12, "id": "afe1168c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cwd: c:\\Users\\ch\\6013\\notebooks\n", "exists data: True\n" ] } ], "source": [
 "# Single source of truth for the input location.\n",
 "DATA_DIR = Path(\"../data\")\n",
 "REVIEWS_CSV = DATA_DIR / \"uber_reviews.csv\"\n",
 "\n",
 "# Sanity check: '../data' is resolved against the kernel's working directory, so show both.\n",
 "print(\"cwd:\", os.getcwd())\n",
 "print(\"exists data:\", DATA_DIR.is_dir())\n"
 ] }, { "cell_type": "code", "execution_count": 13, "id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee", "metadata": {}, "outputs": [], "source": [
 "# low_memory=False makes pandas parse the file in one pass instead of chunk by chunk.\n",
 "df = pd.read_csv(REVIEWS_CSV, low_memory=False)"
 ] }, { "cell_type": "code", "execution_count": 14, "id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Play18d6584c-d0e9-4833-a744-f607058aee97Milky WayNaNSuddenly, the driver can't have my location an...10.02023-08-10 17:48:51NaNNaNNaNenin
1Google Play50a08f18-cece-4ddf-b617-028844c8aa28Bradlee SeveraNaNVery cordial.. And helped with a quick turnaro...50.02023-08-10 17:38:35NaNNaN4.485.10000enin
2Google Playb0d8e75a-80a7-4dcd-abaf-72b046dbeeb7Amit AggarwalNaNVery good experience50.02023-08-10 17:38:17NaNNaN4.486.10002enin
3Google Play502702a9-25ed-4373-a96c-7fa1f06caacdBryant InmanNaNAll I use50.02023-08-10 17:37:45NaNNaN4.467.10008enin
4Google Playf47a3fb6-23db-49bd-9e63-f33c8d724d07Addie WhittakerNaNI have enjoyed traveling by Uber my drivers ha...50.02023-08-10 17:36:56NaNNaN4.486.10002enin
\n", "
" ], "text/plain": [ " source review_id user_name \\\n", "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n", "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n", "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n", "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n", "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n", "\n", " review_title review_description rating \\\n", "0 NaN Suddenly, the driver can't have my location an... 1 \n", "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n", "2 NaN Very good experience 5 \n", "3 NaN All I use 5 \n", "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n", "\n", " thumbs_up review_date developer_response developer_response_date \\\n", "0 0.0 2023-08-10 17:48:51 NaN NaN \n", "1 0.0 2023-08-10 17:38:35 NaN NaN \n", "2 0.0 2023-08-10 17:38:17 NaN NaN \n", "3 0.0 2023-08-10 17:37:45 NaN NaN \n", "4 0.0 2023-08-10 17:36:56 NaN NaN \n", "\n", " appVersion laguage_code country_code \n", "0 NaN en in \n", "1 4.485.10000 en in \n", "2 4.486.10002 en in \n", "3 4.467.10008 en in \n", "4 4.486.10002 en in " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Peek at the first rows of the loaded frame.\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": 15, "id": "5c02ec54-4583-4720-88c6-1110b52c3f88", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rating\n", "1 283895\n", "2 41707\n", "3 49928\n", "4 82953\n", "5 611133\n", "Name: count, dtype: int64\n" ] } ], "source": [
 "# Number of reviews per star rating, ordered by rating value.\n",
 "rating_counts = df['rating'].value_counts().sort_index()\n",
 "print(rating_counts)"
 ] }, { "cell_type": "code", "execution_count": 16, "id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Review length stats:\n", " Mean: 13.1 words\n", " Median: 4.0 words\n", " Min: 1.0 words\n", " Max: 755.0 words\n" ] } ], "source": [
 "# Words per review, split on whitespace; rows without a description stay NaN.\n",
 "df['word_count'] = df['review_description'].str.split().str.len()\n",
 "\n",
 "word_counts = df['word_count']\n",
 "print('Review length stats:')\n",
 "print(f\" Mean: {word_counts.mean():.1f} words\")\n",
 "print(f\" Median: {word_counts.median():.1f} words\")\n",
 "print(f\" Min: {word_counts.min()} words\")\n",
 "print(f\" Max: {word_counts.max()} words\")"
 ] }, { "cell_type": "code", "execution_count": 17, "id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4", "metadata": {}, "outputs": [], "source": [
 "# Reviews with fewer than five words; NaN word counts compare False and are left out.\n",
 "is_short = df['word_count'] < 5\n",
 "short_reviews = df[is_short]"
 ] }, { "cell_type": "code", "execution_count": 18, "id": "55324c94-4944-4844-b00e-dc08c8989f7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Reviews < 5 words: 569632 (53.3%)\n" ] } ], "source": [
 "# Share of all rows (including rows with no description) that are short.\n",
 "n_short = len(short_reviews)\n",
 "short_share = n_short / len(df) * 100\n",
 "print(f\"\\nReviews < 5 words: {n_short} ({short_share:.1f}%)\")"
 ] }, { "cell_type": "code", "execution_count": 19, "id": "c45959fe-3e23-4831-a41a-94c89892247f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Missing values:\n", "source 0\n", "review_id 0\n", "user_name 1\n", "review_title 1067436\n", "review_description 169\n", "rating 0\n", "thumbs_up 2180\n", "review_date 0\n", "developer_response 871352\n", "developer_response_date 872338\n", "appVersion 241548\n", "laguage_code 0\n", "country_code 0\n", "word_count 169\n", "dtype: int64\n" ] } ], "source": [
 "# Null count for every column.\n",
 "missing_per_column = df.isna().sum()\n",
 "print(\"\\nMissing values:\")\n",
 "print(missing_per_column)"
 ] }, { "cell_type": "code", "execution_count": 20, "id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Duplicate reviews: 422458\n" ] } ], "source": [
 "# Rows whose description repeats the description of an earlier row.\n",
 "# NOTE(review): missing descriptions presumably count as repeats of one another here - confirm.\n",
 "is_repeat = df.duplicated(subset=['review_description'])\n",
 "print(f\"Duplicate reviews: {is_repeat.sum()}\")"
 ] }, { "cell_type": "code", "execution_count": 21, "id": "8ccc07fa-9913-4047-ae17-35d2454eb059", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n",
"==========================================\n", "1 STAR REVIEWS:\n", "==========================================\n", "\n", "Driver come very late than you can't do anthing pay and wait. Very bad service you can't cancil your when driver not come. Ubar charge money without service. This is very bad bad bad. So take local au...\n", "(Length: 47.0 words)\n", "\n", "I have uninstalled and reinstalled the app around 5 times over the course of the past 3 days. Every time I try to use the app, I get stuck in and endless reCaptcha loop. (I enter my phone number, solv...\n", "(Length: 66.0 words)\n", "\n", "Thieves. Sent an Uber to my house in the middle of the night and wouldn't refund.\n", "(Length: 16.0 words)\n", "\n", "==========================================\n", "2 STAR REVIEWS:\n", "==========================================\n", "\n", "Your app is required to much space\n", "(Length: 7.0 words)\n", "\n", "I'm very disappointed. At first,I used Uber because it was far better than regular taxi. But I stopped using it because the application is very heavy and the drivers rarely reached my pinned locatio...\n", "(Length: 107.0 words)\n", "\n", "nowhere to leave a tip!\n", "(Length: 5.0 words)\n", "\n", "==========================================\n", "3 STAR REVIEWS:\n", "==========================================\n", "\n", "اوبر المدينة احيانا كويس .. بس لما يكون السائق باخر ملك ربي و تنتظر 14 دقيقه و بعدين يلغي و تصير دخلت بوقت الذروة المفروض يكون في تعويض .. زي لما تلغي انت .\n", "(Length: 34.0 words)\n", "\n", "Good application\n", "(Length: 2.0 words)\n", "\n", "Toooslooow\n", "(Length: 1.0 words)\n", "\n", "==========================================\n", "4 STAR REVIEWS:\n", "==========================================\n", "\n", "Help full\n", "(Length: 2.0 words)\n", "\n", "Won't allow me to change my payment details. 
Update: Problem solved.\n", "(Length: 11.0 words)\n", "\n", "Very good\n", "(Length: 2.0 words)\n", "\n", "==========================================\n", "5 STAR REVIEWS:\n", "==========================================\n", "\n", "Good driving skills\n", "(Length: 3.0 words)\n", "\n", "Lovery\n", "(Length: 1.0 words)\n", "\n", "Excellent experience\n", "(Length: 2.0 words)\n" ] } ], "source": [
 "# Print up to three sample reviews per star rating, truncated to a short preview.\n",
 "SAMPLES_PER_RATING = 3\n",
 "PREVIEW_CHARS = 200\n",
 "SEED = 42  # fixed so a fresh Run All prints the same samples\n",
 "\n",
 "# A missing description is NaN (a float), which cannot be sliced or passed to len(),\n",
 "# so only rows that actually have text are eligible for sampling.\n",
 "reviews_with_text = df.dropna(subset=['review_description'])\n",
 "\n",
 "for rating in [1, 2, 3, 4, 5]:\n",
 "    # Filter once per rating and reuse it for both the size check and the draw.\n",
 "    rating_rows = reviews_with_text[reviews_with_text['rating'] == rating]\n",
 "    samples = rating_rows.sample(min(SAMPLES_PER_RATING, len(rating_rows)), random_state=SEED)\n",
 "    print(f\"\\n{'='*42}\")\n",
 "    print(f\"{rating} STAR REVIEWS:\")\n",
 "    print(f\"{'='*42}\")\n",
 "    for _, row in samples.iterrows():\n",
 "        review_text = row['review_description']\n",
 "        suffix = '...' if len(review_text) > PREVIEW_CHARS else ''\n",
 "        print(f\"\\n{review_text[:PREVIEW_CHARS]}{suffix}\")\n",
 "        # word_count is stored as a float; print it as a whole number.\n",
 "        print(f\"(Length: {row['word_count']:.0f} words)\")"
 ] } ], "metadata": { "kernelspec": { "display_name": "multitag", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 5 }