diff --git a/.gitignore b/.gitignore index aa3e679..8054551 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ multitag/data/*.csv multitag/raw_data/ +multitag/.ipynb_checkpoints +multitag/.vscode diff --git a/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb b/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb deleted file mode 100644 index 910a184..0000000 --- a/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb +++ /dev/null @@ -1,1471 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f3da59fb-eb6b-449f-b8d5-95ddacd456f2", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7c97ff6e-05a0-4ed1-945a-04f024b3045a", - "metadata": {}, - "outputs": [], - "source": [ - "csv0 = pd.read_csv(\"spotify.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c0631560-c1be-4bbf-b050-b6a552e74d63", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Time_submittedReviewRatingTotal_thumbsupReply
02022-07-09 15:00:00Great music service, the audio is high quality...52NaN
12022-07-09 14:21:22Please ignore previous negative rating. This a...51NaN
22022-07-09 13:27:32This pop-up \"Get the best Spotify experience o...40NaN
32022-07-09 13:26:45Really buggy and terrible to use as of recently11NaN
42022-07-09 13:20:49Dear Spotify why do I get songs that I didn't ...11NaN
\n", - "
" - ], - "text/plain": [ - " Time_submitted Review \\\n", - "0 2022-07-09 15:00:00 Great music service, the audio is high quality... \n", - "1 2022-07-09 14:21:22 Please ignore previous negative rating. This a... \n", - "2 2022-07-09 13:27:32 This pop-up \"Get the best Spotify experience o... \n", - "3 2022-07-09 13:26:45 Really buggy and terrible to use as of recently \n", - "4 2022-07-09 13:20:49 Dear Spotify why do I get songs that I didn't ... \n", - "\n", - " Rating Total_thumbsup Reply \n", - "0 5 2 NaN \n", - "1 5 1 NaN \n", - "2 4 0 NaN \n", - "3 1 1 NaN \n", - "4 1 1 NaN " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv0.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "bd769aee-cbe3-4237-b420-4c3bcd8eec73", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Time_submittedReviewRatingTotal_thumbsupReply
615892022-01-01 03:01:29Even though it was communicated that lyrics fe...16NaN
615902022-01-01 02:13:40Use to be sooo good back when I had it, and wh...10NaN
615912022-01-01 01:02:29This app would be good if not for it taking ov...210NaN
615922022-01-01 00:49:23The app is good hard to navigate and won't jus...21NaN
615932022-01-01 00:19:09Its good but sometimes it doesnt load the musi...40NaN
\n", - "
" - ], - "text/plain": [ - " Time_submitted Review \\\n", - "61589 2022-01-01 03:01:29 Even though it was communicated that lyrics fe... \n", - "61590 2022-01-01 02:13:40 Use to be sooo good back when I had it, and wh... \n", - "61591 2022-01-01 01:02:29 This app would be good if not for it taking ov... \n", - "61592 2022-01-01 00:49:23 The app is good hard to navigate and won't jus... \n", - "61593 2022-01-01 00:19:09 Its good but sometimes it doesnt load the musi... \n", - "\n", - " Rating Total_thumbsup Reply \n", - "61589 1 6 NaN \n", - "61590 1 0 NaN \n", - "61591 2 10 NaN \n", - "61592 2 1 NaN \n", - "61593 4 0 NaN " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv0.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4e1fd6d9-df1e-4615-aae2-203559d51cd6", - "metadata": {}, - "outputs": [], - "source": [ - "csv1 = pd.read_csv(\"Airbnb_Open_Data.csv\", low_memory=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "1efba903-5004-4d7b-a1ee-42f333111055", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNAMEhost idhost_identity_verifiedhost nameneighbourhood groupneighbourhoodlatlongcountry...service feeminimum nightsnumber of reviewslast reviewreviews per monthreview rate numbercalculated host listings countavailability 365house_ruleslicense
01001254Clean & quiet apt home by the park80014485718unconfirmedMadalineBrooklynKensington40.64749-73.97237United States...$19310.09.010/19/20210.214.06.0286.0Clean up and treat the home the way you'd like...NaN
11002102Skylit Midtown Castle52335172823verifiedJennaManhattanMidtown40.75362-73.98377United States...$2830.045.05/21/20220.384.02.0228.0Pet friendly but please confirm with me if the...NaN
21002403THE VILLAGE OF HARLEM....NEW YORK !78829239556NaNEliseManhattanHarlem40.80902-73.94190United States...$1243.00.0NaNNaN5.01.0352.0I encourage you to use my kitchen, cooking and...NaN
31002755NaN85098326012unconfirmedGarryBrooklynClinton Hill40.68514-73.95976United States...$7430.0270.07/5/20194.644.01.0322.0NaNNaN
41003689Entire Apt: Spacious Studio/Loft by central park92037596077verifiedLyndonManhattanEast Harlem40.79851-73.94399United States...$4110.09.011/19/20180.103.01.0289.0Please no smoking in the house, porch or on th...NaN
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " id NAME host id \\\n", - "0 1001254 Clean & quiet apt home by the park 80014485718 \n", - "1 1002102 Skylit Midtown Castle 52335172823 \n", - "2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! 78829239556 \n", - "3 1002755 NaN 85098326012 \n", - "4 1003689 Entire Apt: Spacious Studio/Loft by central park 92037596077 \n", - "\n", - " host_identity_verified host name neighbourhood group neighbourhood \\\n", - "0 unconfirmed Madaline Brooklyn Kensington \n", - "1 verified Jenna Manhattan Midtown \n", - "2 NaN Elise Manhattan Harlem \n", - "3 unconfirmed Garry Brooklyn Clinton Hill \n", - "4 verified Lyndon Manhattan East Harlem \n", - "\n", - " lat long country ... service fee minimum nights \\\n", - "0 40.64749 -73.97237 United States ... $193 10.0 \n", - "1 40.75362 -73.98377 United States ... $28 30.0 \n", - "2 40.80902 -73.94190 United States ... $124 3.0 \n", - "3 40.68514 -73.95976 United States ... $74 30.0 \n", - "4 40.79851 -73.94399 United States ... $41 10.0 \n", - "\n", - " number of reviews last review reviews per month review rate number \\\n", - "0 9.0 10/19/2021 0.21 4.0 \n", - "1 45.0 5/21/2022 0.38 4.0 \n", - "2 0.0 NaN NaN 5.0 \n", - "3 270.0 7/5/2019 4.64 4.0 \n", - "4 9.0 11/19/2018 0.10 3.0 \n", - "\n", - " calculated host listings count availability 365 \\\n", - "0 6.0 286.0 \n", - "1 2.0 228.0 \n", - "2 1.0 352.0 \n", - "3 1.0 322.0 \n", - "4 1.0 289.0 \n", - "\n", - " house_rules license \n", - "0 Clean up and treat the home the way you'd like... NaN \n", - "1 Pet friendly but please confirm with me if the... NaN \n", - "2 I encourage you to use my kitchen, cooking and... NaN \n", - "3 NaN NaN \n", - "4 Please no smoking in the house, porch or on th... NaN \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv1.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "39d543be-013a-4976-942d-f9884274c7be", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNAMEhost idhost_identity_verifiedhost nameneighbourhood groupneighbourhoodlatlongcountry...service feeminimum nightsnumber of reviewslast reviewreviews per monthreview rate numbercalculated host listings countavailability 365house_ruleslicense
01001254Clean & quiet apt home by the park80014485718unconfirmedMadalineBrooklynKensington40.64749-73.97237United States...$19310.09.010/19/20210.214.06.0286.0Clean up and treat the home the way you'd like...NaN
11002102Skylit Midtown Castle52335172823verifiedJennaManhattanMidtown40.75362-73.98377United States...$2830.045.05/21/20220.384.02.0228.0Pet friendly but please confirm with me if the...NaN
21002403THE VILLAGE OF HARLEM....NEW YORK !78829239556NaNEliseManhattanHarlem40.80902-73.94190United States...$1243.00.0NaNNaN5.01.0352.0I encourage you to use my kitchen, cooking and...NaN
31002755NaN85098326012unconfirmedGarryBrooklynClinton Hill40.68514-73.95976United States...$7430.0270.07/5/20194.644.01.0322.0NaNNaN
41003689Entire Apt: Spacious Studio/Loft by central park92037596077verifiedLyndonManhattanEast Harlem40.79851-73.94399United States...$4110.09.011/19/20180.103.01.0289.0Please no smoking in the house, porch or on th...NaN
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " id NAME host id \\\n", - "0 1001254 Clean & quiet apt home by the park 80014485718 \n", - "1 1002102 Skylit Midtown Castle 52335172823 \n", - "2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! 78829239556 \n", - "3 1002755 NaN 85098326012 \n", - "4 1003689 Entire Apt: Spacious Studio/Loft by central park 92037596077 \n", - "\n", - " host_identity_verified host name neighbourhood group neighbourhood \\\n", - "0 unconfirmed Madaline Brooklyn Kensington \n", - "1 verified Jenna Manhattan Midtown \n", - "2 NaN Elise Manhattan Harlem \n", - "3 unconfirmed Garry Brooklyn Clinton Hill \n", - "4 verified Lyndon Manhattan East Harlem \n", - "\n", - " lat long country ... service fee minimum nights \\\n", - "0 40.64749 -73.97237 United States ... $193 10.0 \n", - "1 40.75362 -73.98377 United States ... $28 30.0 \n", - "2 40.80902 -73.94190 United States ... $124 3.0 \n", - "3 40.68514 -73.95976 United States ... $74 30.0 \n", - "4 40.79851 -73.94399 United States ... $41 10.0 \n", - "\n", - " number of reviews last review reviews per month review rate number \\\n", - "0 9.0 10/19/2021 0.21 4.0 \n", - "1 45.0 5/21/2022 0.38 4.0 \n", - "2 0.0 NaN NaN 5.0 \n", - "3 270.0 7/5/2019 4.64 4.0 \n", - "4 9.0 11/19/2018 0.10 3.0 \n", - "\n", - " calculated host listings count availability 365 \\\n", - "0 6.0 286.0 \n", - "1 2.0 228.0 \n", - "2 1.0 352.0 \n", - "3 1.0 322.0 \n", - "4 1.0 289.0 \n", - "\n", - " house_rules license \n", - "0 Clean up and treat the home the way you'd like... NaN \n", - "1 Pet friendly but please confirm with me if the... NaN \n", - "2 I encourage you to use my kitchen, cooking and... NaN \n", - "3 NaN NaN \n", - "4 Please no smoking in the house, porch or on th... NaN \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv1.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "95f93b29-94be-4c93-9793-cf51c2ba2442", - "metadata": {}, - "outputs": [], - "source": [ - "csv02 = pd.read_csv(\"WAZE_REVIEWS.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7f8b10d2-6225-47d8-82b5-b8041ee6412b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0review_idpseudo_author_idauthor_namereview_textreview_ratingreview_likesauthor_app_versionreview_timestamp
006caba53d-789d-4733-bad5-c7491daf80f2152618553977019693742A Google userNice app need to add red light cam.500.99.2.32009-06-30 16:48:15
1130c15838-8b02-4dae-8f51-25905cb40b68234382942865437071667A Google userReally cool social app. Lots of potential to b...500.99.2.32009-06-30 16:58:43
22c090400e-f88f-4129-930d-a650f3163a11174473604608358796368A Google userI was all excited about this app (ehat a great...100.99.2.32009-06-30 17:08:33
33f6f37456-793b-4786-af6e-454a811361bf286593453219054880269A Google userI love this app! Lol500.99.2.32009-06-30 17:37:22
448ae5d962-7c0c-476d-82fa-79f6e5484acc167276875678680630145A Google userGreat app i like the idea of your car being pa...400.99.2.32009-06-30 23:58:43
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 review_id pseudo_author_id \\\n", - "0 0 6caba53d-789d-4733-bad5-c7491daf80f2 152618553977019693742 \n", - "1 1 30c15838-8b02-4dae-8f51-25905cb40b68 234382942865437071667 \n", - "2 2 c090400e-f88f-4129-930d-a650f3163a11 174473604608358796368 \n", - "3 3 f6f37456-793b-4786-af6e-454a811361bf 286593453219054880269 \n", - "4 4 8ae5d962-7c0c-476d-82fa-79f6e5484acc 167276875678680630145 \n", - "\n", - " author_name review_text \\\n", - "0 A Google user Nice app need to add red light cam. \n", - "1 A Google user Really cool social app. Lots of potential to b... \n", - "2 A Google user I was all excited about this app (ehat a great... \n", - "3 A Google user I love this app! Lol \n", - "4 A Google user Great app i like the idea of your car being pa... \n", - "\n", - " review_rating review_likes author_app_version review_timestamp \n", - "0 5 0 0.99.2.3 2009-06-30 16:48:15 \n", - "1 5 0 0.99.2.3 2009-06-30 16:58:43 \n", - "2 1 0 0.99.2.3 2009-06-30 17:08:33 \n", - "3 5 0 0.99.2.3 2009-06-30 17:37:22 \n", - "4 4 0 0.99.2.3 2009-06-30 23:58:43 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv02.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "39b1151a-655a-4191-8fcb-2ff1b40e5edf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0review_idpseudo_author_idauthor_namereview_textreview_ratingreview_likesauthor_app_versionreview_timestamp
78006878006801655504-5a51-4c19-b313-2bd5fa3f253a680743620884748258838Ma********llFreezes30NaN2023-11-17 03:18:26
780069780069f04306cb-af60-4a44-aebc-c37122620319266638684561117704682Zu******elTo stuck10NaN2023-11-17 03:18:38
780070780070894e3c41-ca20-4781-9308-70eeb060a865154572309081670894420br**********jiracist made app104.99.0.22023-11-17 03:23:20
7800717800714fafb0b1-485e-473e-9bcd-d5c9848424d2154995071911163107981Mo***********daبهترین مثل همیشه.با آی پی ثابت های کانال تلگرا...504.99.1.12023-11-17 04:05:02
780072780072d1570ba0-ffc5-4fc6-8d34-12daba4b38e2200574835524973617311Re***********izBest app ever used.504.99.0.22023-11-17 04:06:44
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 review_id \\\n", - "780068 780068 01655504-5a51-4c19-b313-2bd5fa3f253a \n", - "780069 780069 f04306cb-af60-4a44-aebc-c37122620319 \n", - "780070 780070 894e3c41-ca20-4781-9308-70eeb060a865 \n", - "780071 780071 4fafb0b1-485e-473e-9bcd-d5c9848424d2 \n", - "780072 780072 d1570ba0-ffc5-4fc6-8d34-12daba4b38e2 \n", - "\n", - " pseudo_author_id author_name \\\n", - "780068 680743620884748258838 Ma********ll \n", - "780069 266638684561117704682 Zu******el \n", - "780070 154572309081670894420 br**********ji \n", - "780071 154995071911163107981 Mo***********da \n", - "780072 200574835524973617311 Re***********iz \n", - "\n", - " review_text review_rating \\\n", - "780068 Freezes 3 \n", - "780069 To stuck 1 \n", - "780070 racist made app 1 \n", - "780071 بهترین مثل همیشه.با آی پی ثابت های کانال تلگرا... 5 \n", - "780072 Best app ever used. 5 \n", - "\n", - " review_likes author_app_version review_timestamp \n", - "780068 0 NaN 2023-11-17 03:18:26 \n", - "780069 0 NaN 2023-11-17 03:18:38 \n", - "780070 0 4.99.0.2 2023-11-17 03:23:20 \n", - "780071 0 4.99.1.1 2023-11-17 04:05:02 \n", - "780072 0 4.99.0.2 2023-11-17 04:06:44 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv02.tail()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "2fc95472-e0ae-45f2-86fd-4aa023239c0d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Play18d6584c-d0e9-4833-a744-f607058aee97Milky WayNaNSuddenly, the driver can't have my location an...10.02023-08-10 17:48:51NaNNaNNaNenin
1Google Play50a08f18-cece-4ddf-b617-028844c8aa28Bradlee SeveraNaNVery cordial.. And helped with a quick turnaro...50.02023-08-10 17:38:35NaNNaN4.485.10000enin
2Google Playb0d8e75a-80a7-4dcd-abaf-72b046dbeeb7Amit AggarwalNaNVery good experience50.02023-08-10 17:38:17NaNNaN4.486.10002enin
3Google Play502702a9-25ed-4373-a96c-7fa1f06caacdBryant InmanNaNAll I use50.02023-08-10 17:37:45NaNNaN4.467.10008enin
4Google Playf47a3fb6-23db-49bd-9e63-f33c8d724d07Addie WhittakerNaNI have enjoyed traveling by Uber my drivers ha...50.02023-08-10 17:36:56NaNNaN4.486.10002enin
\n", - "
" - ], - "text/plain": [ - " source review_id user_name \\\n", - "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n", - "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n", - "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n", - "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n", - "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n", - "\n", - " review_title review_description rating \\\n", - "0 NaN Suddenly, the driver can't have my location an... 1 \n", - "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n", - "2 NaN Very good experience 5 \n", - "3 NaN All I use 5 \n", - "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n", - "\n", - " thumbs_up review_date developer_response developer_response_date \\\n", - "0 0.0 2023-08-10 17:48:51 NaN NaN \n", - "1 0.0 2023-08-10 17:38:35 NaN NaN \n", - "2 0.0 2023-08-10 17:38:17 NaN NaN \n", - "3 0.0 2023-08-10 17:37:45 NaN NaN \n", - "4 0.0 2023-08-10 17:36:56 NaN NaN \n", - "\n", - " appVersion laguage_code country_code \n", - "0 NaN en in \n", - "1 4.485.10000 en in \n", - "2 4.486.10002 en in \n", - "3 4.467.10008 en in \n", - "4 4.486.10002 en in " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uber = pd.read_csv(\"Uber Customer Reviews.csv\", low_memory=False)\n", - "uber.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "d4ace5a2-346a-4099-9854-1cac2749a216", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1069616, 13)\n" - ] - } - ], - "source": [ - "print(np.shape(uber))" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "ad7ac03d-a9df-4688-ad3c-8e354996f52c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Playfbc7ffc9-5a89-446e-87fd-d69bf4a7f984Puipuii RalteNaNThe map in Ola is so messed up, i have to pay ...10.02023-08-10 16:40:50NaNNaN6.3.2enin
1Google Play5a0051fb-220a-45b2-ba94-a15a2949218fDeepak KumarNaNDeepak Kumar.... 🙏🙏🙏🙏🙏]50.02023-08-10 16:36:14NaNNaNNaNenin
2Google Play71ebf933-b734-474d-bb65-a18c90906ed2Ahamed AzarudeenNaNSuch aa irresponsible app more then I waiting ...10.02023-08-10 16:29:31NaNNaN6.3.1enin
3Google Playe1cc0010-60b3-4126-99c2-e8549088566aRahil SyedNaNWorst10.02023-08-10 15:52:06NaNNaN5.0.4enin
4Google Play77cf1be1-b428-4493-ae25-e0f288f79b8fvin 007NaNToo much expensive .. try UBer... They are pro...10.02023-08-10 15:51:10NaNNaNNaNenin
\n", - "
" - ], - "text/plain": [ - " source review_id user_name \\\n", - "0 Google Play fbc7ffc9-5a89-446e-87fd-d69bf4a7f984 Puipuii Ralte \n", - "1 Google Play 5a0051fb-220a-45b2-ba94-a15a2949218f Deepak Kumar \n", - "2 Google Play 71ebf933-b734-474d-bb65-a18c90906ed2 Ahamed Azarudeen \n", - "3 Google Play e1cc0010-60b3-4126-99c2-e8549088566a Rahil Syed \n", - "4 Google Play 77cf1be1-b428-4493-ae25-e0f288f79b8f vin 007 \n", - "\n", - " review_title review_description rating \\\n", - "0 NaN The map in Ola is so messed up, i have to pay ... 1 \n", - "1 NaN Deepak Kumar.... 🙏🙏🙏🙏🙏] 5 \n", - "2 NaN Such aa irresponsible app more then I waiting ... 1 \n", - "3 NaN Worst 1 \n", - "4 NaN Too much expensive .. try UBer... They are pro... 1 \n", - "\n", - " thumbs_up review_date developer_response developer_response_date \\\n", - "0 0.0 2023-08-10 16:40:50 NaN NaN \n", - "1 0.0 2023-08-10 16:36:14 NaN NaN \n", - "2 0.0 2023-08-10 16:29:31 NaN NaN \n", - "3 0.0 2023-08-10 15:52:06 NaN NaN \n", - "4 0.0 2023-08-10 15:51:10 NaN NaN \n", - "\n", - " appVersion laguage_code country_code \n", - "0 6.3.2 en in \n", - "1 NaN en in \n", - "2 6.3.1 en in \n", - "3 5.0.4 en in \n", - "4 NaN en in " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ola_df = pd.read_csv(\"Ola Customer Reviews.csv\", low_memory=False)\n", - "ola_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "878a39c4-45d5-41d6-82b0-9c373c28e280", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 357678.000000\n", - "mean 92.402697\n", - "std 125.489169\n", - "min 1.000000\n", - "25% 8.000000\n", - "50% 33.000000\n", - "75% 131.000000\n", - "max 2877.000000\n", - "Name: review_length, dtype: float64\n" - ] - } - ], - "source": [ - "# Check average review length\n", - "ola_df['review_length'] = ola_df['review_description'].str.len()\n", - "print(ola_df['review_length'].describe())\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "1dd032ba-343b-4402-9d96-ee5e0432ab07", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Substantive reviews: 204715\n" - ] - } - ], - "source": [ - "# Filter out very short reviews\n", - "substantive_reviews = ola_df[ola_df['review_length'] > 20]\n", - "print(f\"Substantive reviews: {len(substantive_reviews)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "2e58bf99-c08e-4e41-9b98-124b3f9e6145", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 1.069447e+06\n", - "mean 7.023987e+01\n", - "std 1.158196e+02\n", - "min 1.000000e+00\n", - "25% 8.000000e+00\n", - "50% 2.100000e+01\n", - "75% 7.800000e+01\n", - "max 3.792000e+03\n", - "Name: review_length, dtype: float64\n" - ] - } - ], - "source": [ - "# Check average review length\n", - "uber['review_length'] = uber['review_description'].str.len()\n", - "print(uber['review_length'].describe())\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "2dd05939-e87c-443d-9012-e5f45cf64ff5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Substantive reviews: 542110\n" - ] - } - ], - "source": [ - "# Filter out very short reviews\n", - "substantive_reviews = uber[uber['review_length'] > 20]\n", - "print(f\"Substantive reviews: {len(substantive_reviews)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75ad8e81-3f11-4152-9494-b95bbba6fa01", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/multitag/multitag.py b/multitag/multitag.py index a728777..32aca56 100644 --- a/multitag/multitag.py +++ b/multitag/multitag.py @@ -1,3 +1,14 @@ +# TODO: Refactor,especially change expected names as I jumped the gun when first making this without sampling properly +# TODO: Add button labels and finalise the categories of aspects +# TODO: Ensure there is persistent progress tracking implentation before labelling +# TODO: Finalise keybinds +# TODO: Display progress e.g. review 1020 of 5000 +# TODO: Validate saving progres +# TODO: Loop instead of pressing enter +# TODO: Autosave ? / confirm quit at least +# TODO: More visual q's + + import tkinter as tk from tkinter import ttk import pandas as pd diff --git a/multitag/preprocess.py b/multitag/preprocess.py index 9167b1a..646378b 100644 --- a/multitag/preprocess.py +++ b/multitag/preprocess.py @@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path): df['word_count'] = df['review_clean'].str.split().str.len() # 5. Remove short reviews - review_length_limit = 5 - print(f"\n4. Removing short reviews (< {review_length_limit})...") - print(" Rationale: Insufficient context for classification") + review_length_limit = 5 ### limit review length ### + print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...") + # 1 word reviews provide little to draw conclusions from and bloat the + # dataset a lot, nearly 50% of reviews! + + # display changes before = len(df) df = df[df['word_count'] >= review_length_limit] removed = before - len(df) @@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path): print("PREPROCESSING COMPLETE") print("="*50) print(f"\nFinal dataset: {len(df_clean):,} reviews") - print(f"Data source: Indian Uber market (predominantly English)") - print(f"Quality filters: word_count >= 5, duplicates removed") + print(f"Quality filters: word_count >= 5, duplicates removed") + # while this does remove a some legitimate reviews which would provide use in classification + # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5 + # it showed the most amount of formative reviews without seeming excessive in data removal print("\nRating distribution:") rating_dist = df_clean['rating'].value_counts().sort_index() @@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path): print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}") print(f" Null values: {df_clean.isnull().sum().to_dict()}") print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}") - # lang detection takes 5+ mins + # lang detection takes 5+ mins so leaving it commented for now #df_clean['detected_lang'] = df_clean['review'].apply(detect_language) #print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}") @@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path): if len(df_clean[df_clean['rating'] == rating]) > 0: sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating]))) print(f"\n{rating} {"✭" * rating} REVIEWS:") - for idx, row in sample.iterrows(): + for index, row in sample.iterrows(): print(f" • ({row['word_count']} words) {row['review'][:100]}") # Note about language print("Language detection not applied due to unreliability on short") - print("informal text. Dataset is from the Indian market, labeled as English.") - print("Manual annotation phase will identify any non-English reviews. And put aside.") + print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.") + print(" ...Manual annotation phase will identify any non-English reviews") return df_clean diff --git a/multitag/sampler.py b/multitag/sampler.py index a90a9a4..25a16ae 100644 --- a/multitag/sampler.py +++ b/multitag/sampler.py @@ -1,22 +1,45 @@ +# TODO: Fix get_stratified_sample() replace broken x() with actual working logic +# TODO: Add verification comparison between ratings +# TODO: implement sample_with_keywords() add to lists, and implement logic +# TODO: Clean up the logging print statements + + import pandas as pd import numpy as np print(pd.__version__) print(np.__version__) -path = "data/uber_reviews.csv" -sampled_path = "data/uber_reviews_sampled.csv" +path = "multitag/data/uber_reviews_cleaned.csv" +sampled_path = "multitag/data/uber_reviews_sampled.csv" +original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison class Sampler: - def __init__(self, data_path): + def __init__(self, data_path, target_samples): self.data_path = data_path + self.target_samples = 5000 # target number of samples + self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers) + + self.original_data = pd.read_csv(original_path, low_memory=False) self.data = pd.read_csv(self.data_path, low_memory=False) self.total = len(self.data) # total number of records in the dataset - self.target_samples = 5000 # target number of samples - self.stratify_column = "rating" # column to stratify by + print("="*50) + print("SAMPLER INITIALIZED") + print("="*50,"\n") + + + print(f"Total records in dataset: {self.total}") print(f"Data loaded from {self.data_path}, total records: {len(self.data)}") - print(self.data.head()) + #print(self.data.head()) + #print(f"\nCurrent distribution:") + #print(self.data[self.stratify_column].value_counts().sort_index()) + #print(f"\nColumns: {self.data.columns.tolist()}") + print(f"Percentage distribution (working data):") + print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n") + _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index() + print(f"Original Distribution from {original_path}:") + print((_origdist*100).round(1),"\n") self.data.info() @@ -31,36 +54,128 @@ class Sampler: 2 3.9% (41707) Name: proportion, dtype: object """ - - def get_stratified_sample(self): - stratified_sample = self.data.groupby(self.stratify_column).apply( - lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)), - # include_groups=False - ) - return stratified_sample -sampler = Sampler("data/uber_reviews.csv") - - - -to_sample = input("Do you want to create a stratified sample of the data? (y/n): ") - -if to_sample == 'y': - sampled = sampler.get_stratified_sample() - sampled.to_csv("data/uber_reviews_sampled.csv", index=False) - print("Original columns:", sampler.data.columns.tolist()) - print("Sampled columns:", sampled.columns.tolist()) - print("Stratified sample saved to data/uber_reviews_sampled.csv") -elif to_sample == 'n': - sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False) - """ - debug to check sampled data matches original columns - print("Original columns:", sampler.data.columns.tolist()) - print("Sampled columns:", sampled_data.columns.tolist()) """ - print("Original data distribution:") - print(sampler.data["rating"].value_counts()) - print("Sampled data distribution:") - print(sampled_data["rating"].value_counts()) -else: - print("Invalid input, please enter 'y' or 'n'") + Sample size by rating + Redundant calculation, kept for clarity + Doesn't factor that the distribution changed greatly after preprocessing + + """ + def get_stratified_sample(self) -> pd.Series: + stratified_sample = self.data.groupby(self.stratify_column).apply(self.x) + return stratified_sample + + + # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX= + def x(self, ): + return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)) + """ + get_proportional_sample() + + """ + + """ + original_distribution_sample() + The main sampling method for our labelling as it + keeps composition of the original uber dataset + which is a fairer comparison, may also work better in general + + inputs: + + outputs: + + """ + def original_distribution_sample(self): + original_dist = { + 5: int(0.571 * self.target_samples), + 1: int(0.265 * self.target_samples), + 4: int(0.078 * self.target_samples), + 3: int(0.047 * self.target_samples), + 2: int(0.039 * self.target_samples) + } + print("Target Distribution =", original_dist) + samples = [] + for rating, num_samples in original_dist.items(): + rating_data = self.data[self.data[self.stratify_column] == rating] + if len(rating_data) < num_samples: + print("Missing samples available for rating") + num_samples = len(rating_data) + sample = rating_data.sample(n = num_samples,random_state=33) + samples.append(sample) + original_sample = pd.concat(samples, ignore_index=True) + return original_sample + + """ + sample_with_keywords() + + In order to train on more bugs and features data in + future this method was created + - 2000 balanced by rating (400 per) + - 1500 likely bugs using bug_keywords list + - 1500 likely features using feature_keywords list + + inputs: + outputs: + + """ + + def sample_with_keywords(): + #TODO add keywords for feature classification + print(f"\n{"="*50}") + print("Keyword influenced / rating stratified set") + print(f"\n{"="*50}") + + bug_keywords = ["crash","crashes", "freeze", "freezes", "error", + "stops", "doesnt work", "doesn't work","loading", + "blank", "stuck", "load", "loads", "broken", "breaks", + "glitch", "glitches", "issue", "could you", "fix", + "failed"] + + + return + + def save_sample(self, sample_df,output_path): + """Save sample and display statistics""" + sample_df.to_csv(output_path, index=False) + + print(f"\n{'='*50}") + print("SAMPLE SAVED") + print(f"{'='*50}") + print(f"Location: {output_path}") + print(f"Total samples: {len(sample_df):,}") + print(f"\nDistribution:") + for rating in sorted(sample_df[self.stratify_column].unique()): + count = (sample_df[self.stratify_column] == rating).sum() + pct = count / len(sample_df) * 100 + print(f" {rating}★: {count:,} ({pct:.1f}%)") + +def main(): + + sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000) + + # Choose sampling strategy + print(f"\n{'='*50}") + print("SAMPLING STRATEGY OPTIONS") + print(f"{'='*50}") + print("1. get_stratified_sample() stratified by current distribution") + print("2. original_distribution_sample() stratified by the original data distribution") + print("3. get_keyword_boosted_sample() stratified using original distribution but also using a keyword dictionary") + + choice = input("\nEnter choice (1-3): ").strip() + + if choice == '1': + sample = sampler.get_stratified_sample() + sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + + elif choice == '2': + sample = sampler.original_distribution_sample() + sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + + elif choice == '3': + sample = sampler.get_keyword_boosted_sample() + sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + + + +if __name__ == "__main__": + main() \ No newline at end of file