diff --git a/.gitignore b/.gitignore
index aa3e679..8054551 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
multitag/data/*.csv
multitag/raw_data/
+multitag/.ipynb_checkpoints
+multitag/.vscode
diff --git a/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb b/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb
deleted file mode 100644
index 910a184..0000000
--- a/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb
+++ /dev/null
@@ -1,1471 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "f3da59fb-eb6b-449f-b8d5-95ddacd456f2",
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "7c97ff6e-05a0-4ed1-945a-04f024b3045a",
- "metadata": {},
- "outputs": [],
- "source": [
- "csv0 = pd.read_csv(\"spotify.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "c0631560-c1be-4bbf-b050-b6a552e74d63",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Time_submitted | \n",
- " Review | \n",
- " Rating | \n",
- " Total_thumbsup | \n",
- " Reply | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 2022-07-09 15:00:00 | \n",
- " Great music service, the audio is high quality... | \n",
- " 5 | \n",
- " 2 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2022-07-09 14:21:22 | \n",
- " Please ignore previous negative rating. This a... | \n",
- " 5 | \n",
- " 1 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2022-07-09 13:27:32 | \n",
- " This pop-up \"Get the best Spotify experience o... | \n",
- " 4 | \n",
- " 0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2022-07-09 13:26:45 | \n",
- " Really buggy and terrible to use as of recently | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 2022-07-09 13:20:49 | \n",
- " Dear Spotify why do I get songs that I didn't ... | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Time_submitted Review \\\n",
- "0 2022-07-09 15:00:00 Great music service, the audio is high quality... \n",
- "1 2022-07-09 14:21:22 Please ignore previous negative rating. This a... \n",
- "2 2022-07-09 13:27:32 This pop-up \"Get the best Spotify experience o... \n",
- "3 2022-07-09 13:26:45 Really buggy and terrible to use as of recently \n",
- "4 2022-07-09 13:20:49 Dear Spotify why do I get songs that I didn't ... \n",
- "\n",
- " Rating Total_thumbsup Reply \n",
- "0 5 2 NaN \n",
- "1 5 1 NaN \n",
- "2 4 0 NaN \n",
- "3 1 1 NaN \n",
- "4 1 1 NaN "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv0.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "bd769aee-cbe3-4237-b420-4c3bcd8eec73",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Time_submitted | \n",
- " Review | \n",
- " Rating | \n",
- " Total_thumbsup | \n",
- " Reply | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 61589 | \n",
- " 2022-01-01 03:01:29 | \n",
- " Even though it was communicated that lyrics fe... | \n",
- " 1 | \n",
- " 6 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 61590 | \n",
- " 2022-01-01 02:13:40 | \n",
- " Use to be sooo good back when I had it, and wh... | \n",
- " 1 | \n",
- " 0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 61591 | \n",
- " 2022-01-01 01:02:29 | \n",
- " This app would be good if not for it taking ov... | \n",
- " 2 | \n",
- " 10 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 61592 | \n",
- " 2022-01-01 00:49:23 | \n",
- " The app is good hard to navigate and won't jus... | \n",
- " 2 | \n",
- " 1 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 61593 | \n",
- " 2022-01-01 00:19:09 | \n",
- " Its good but sometimes it doesnt load the musi... | \n",
- " 4 | \n",
- " 0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Time_submitted Review \\\n",
- "61589 2022-01-01 03:01:29 Even though it was communicated that lyrics fe... \n",
- "61590 2022-01-01 02:13:40 Use to be sooo good back when I had it, and wh... \n",
- "61591 2022-01-01 01:02:29 This app would be good if not for it taking ov... \n",
- "61592 2022-01-01 00:49:23 The app is good hard to navigate and won't jus... \n",
- "61593 2022-01-01 00:19:09 Its good but sometimes it doesnt load the musi... \n",
- "\n",
- " Rating Total_thumbsup Reply \n",
- "61589 1 6 NaN \n",
- "61590 1 0 NaN \n",
- "61591 2 10 NaN \n",
- "61592 2 1 NaN \n",
- "61593 4 0 NaN "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv0.tail()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "4e1fd6d9-df1e-4615-aae2-203559d51cd6",
- "metadata": {},
- "outputs": [],
- "source": [
- "csv1 = pd.read_csv(\"Airbnb_Open_Data.csv\", low_memory=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "1efba903-5004-4d7b-a1ee-42f333111055",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " NAME | \n",
- " host id | \n",
- " host_identity_verified | \n",
- " host name | \n",
- " neighbourhood group | \n",
- " neighbourhood | \n",
- " lat | \n",
- " long | \n",
- " country | \n",
- " ... | \n",
- " service fee | \n",
- " minimum nights | \n",
- " number of reviews | \n",
- " last review | \n",
- " reviews per month | \n",
- " review rate number | \n",
- " calculated host listings count | \n",
- " availability 365 | \n",
- " house_rules | \n",
- " license | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1001254 | \n",
- " Clean & quiet apt home by the park | \n",
- " 80014485718 | \n",
- " unconfirmed | \n",
- " Madaline | \n",
- " Brooklyn | \n",
- " Kensington | \n",
- " 40.64749 | \n",
- " -73.97237 | \n",
- " United States | \n",
- " ... | \n",
- " $193 | \n",
- " 10.0 | \n",
- " 9.0 | \n",
- " 10/19/2021 | \n",
- " 0.21 | \n",
- " 4.0 | \n",
- " 6.0 | \n",
- " 286.0 | \n",
- " Clean up and treat the home the way you'd like... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1002102 | \n",
- " Skylit Midtown Castle | \n",
- " 52335172823 | \n",
- " verified | \n",
- " Jenna | \n",
- " Manhattan | \n",
- " Midtown | \n",
- " 40.75362 | \n",
- " -73.98377 | \n",
- " United States | \n",
- " ... | \n",
- " $28 | \n",
- " 30.0 | \n",
- " 45.0 | \n",
- " 5/21/2022 | \n",
- " 0.38 | \n",
- " 4.0 | \n",
- " 2.0 | \n",
- " 228.0 | \n",
- " Pet friendly but please confirm with me if the... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1002403 | \n",
- " THE VILLAGE OF HARLEM....NEW YORK ! | \n",
- " 78829239556 | \n",
- " NaN | \n",
- " Elise | \n",
- " Manhattan | \n",
- " Harlem | \n",
- " 40.80902 | \n",
- " -73.94190 | \n",
- " United States | \n",
- " ... | \n",
- " $124 | \n",
- " 3.0 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " 5.0 | \n",
- " 1.0 | \n",
- " 352.0 | \n",
- " I encourage you to use my kitchen, cooking and... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1002755 | \n",
- " NaN | \n",
- " 85098326012 | \n",
- " unconfirmed | \n",
- " Garry | \n",
- " Brooklyn | \n",
- " Clinton Hill | \n",
- " 40.68514 | \n",
- " -73.95976 | \n",
- " United States | \n",
- " ... | \n",
- " $74 | \n",
- " 30.0 | \n",
- " 270.0 | \n",
- " 7/5/2019 | \n",
- " 4.64 | \n",
- " 4.0 | \n",
- " 1.0 | \n",
- " 322.0 | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1003689 | \n",
- " Entire Apt: Spacious Studio/Loft by central park | \n",
- " 92037596077 | \n",
- " verified | \n",
- " Lyndon | \n",
- " Manhattan | \n",
- " East Harlem | \n",
- " 40.79851 | \n",
- " -73.94399 | \n",
- " United States | \n",
- " ... | \n",
- " $41 | \n",
- " 10.0 | \n",
- " 9.0 | \n",
- " 11/19/2018 | \n",
- " 0.10 | \n",
- " 3.0 | \n",
- " 1.0 | \n",
- " 289.0 | \n",
- " Please no smoking in the house, porch or on th... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 26 columns
\n",
- "
"
- ],
- "text/plain": [
- " id NAME host id \\\n",
- "0 1001254 Clean & quiet apt home by the park 80014485718 \n",
- "1 1002102 Skylit Midtown Castle 52335172823 \n",
- "2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! 78829239556 \n",
- "3 1002755 NaN 85098326012 \n",
- "4 1003689 Entire Apt: Spacious Studio/Loft by central park 92037596077 \n",
- "\n",
- " host_identity_verified host name neighbourhood group neighbourhood \\\n",
- "0 unconfirmed Madaline Brooklyn Kensington \n",
- "1 verified Jenna Manhattan Midtown \n",
- "2 NaN Elise Manhattan Harlem \n",
- "3 unconfirmed Garry Brooklyn Clinton Hill \n",
- "4 verified Lyndon Manhattan East Harlem \n",
- "\n",
- " lat long country ... service fee minimum nights \\\n",
- "0 40.64749 -73.97237 United States ... $193 10.0 \n",
- "1 40.75362 -73.98377 United States ... $28 30.0 \n",
- "2 40.80902 -73.94190 United States ... $124 3.0 \n",
- "3 40.68514 -73.95976 United States ... $74 30.0 \n",
- "4 40.79851 -73.94399 United States ... $41 10.0 \n",
- "\n",
- " number of reviews last review reviews per month review rate number \\\n",
- "0 9.0 10/19/2021 0.21 4.0 \n",
- "1 45.0 5/21/2022 0.38 4.0 \n",
- "2 0.0 NaN NaN 5.0 \n",
- "3 270.0 7/5/2019 4.64 4.0 \n",
- "4 9.0 11/19/2018 0.10 3.0 \n",
- "\n",
- " calculated host listings count availability 365 \\\n",
- "0 6.0 286.0 \n",
- "1 2.0 228.0 \n",
- "2 1.0 352.0 \n",
- "3 1.0 322.0 \n",
- "4 1.0 289.0 \n",
- "\n",
- " house_rules license \n",
- "0 Clean up and treat the home the way you'd like... NaN \n",
- "1 Pet friendly but please confirm with me if the... NaN \n",
- "2 I encourage you to use my kitchen, cooking and... NaN \n",
- "3 NaN NaN \n",
- "4 Please no smoking in the house, porch or on th... NaN \n",
- "\n",
- "[5 rows x 26 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv1.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "39d543be-013a-4976-942d-f9884274c7be",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " NAME | \n",
- " host id | \n",
- " host_identity_verified | \n",
- " host name | \n",
- " neighbourhood group | \n",
- " neighbourhood | \n",
- " lat | \n",
- " long | \n",
- " country | \n",
- " ... | \n",
- " service fee | \n",
- " minimum nights | \n",
- " number of reviews | \n",
- " last review | \n",
- " reviews per month | \n",
- " review rate number | \n",
- " calculated host listings count | \n",
- " availability 365 | \n",
- " house_rules | \n",
- " license | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1001254 | \n",
- " Clean & quiet apt home by the park | \n",
- " 80014485718 | \n",
- " unconfirmed | \n",
- " Madaline | \n",
- " Brooklyn | \n",
- " Kensington | \n",
- " 40.64749 | \n",
- " -73.97237 | \n",
- " United States | \n",
- " ... | \n",
- " $193 | \n",
- " 10.0 | \n",
- " 9.0 | \n",
- " 10/19/2021 | \n",
- " 0.21 | \n",
- " 4.0 | \n",
- " 6.0 | \n",
- " 286.0 | \n",
- " Clean up and treat the home the way you'd like... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1002102 | \n",
- " Skylit Midtown Castle | \n",
- " 52335172823 | \n",
- " verified | \n",
- " Jenna | \n",
- " Manhattan | \n",
- " Midtown | \n",
- " 40.75362 | \n",
- " -73.98377 | \n",
- " United States | \n",
- " ... | \n",
- " $28 | \n",
- " 30.0 | \n",
- " 45.0 | \n",
- " 5/21/2022 | \n",
- " 0.38 | \n",
- " 4.0 | \n",
- " 2.0 | \n",
- " 228.0 | \n",
- " Pet friendly but please confirm with me if the... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1002403 | \n",
- " THE VILLAGE OF HARLEM....NEW YORK ! | \n",
- " 78829239556 | \n",
- " NaN | \n",
- " Elise | \n",
- " Manhattan | \n",
- " Harlem | \n",
- " 40.80902 | \n",
- " -73.94190 | \n",
- " United States | \n",
- " ... | \n",
- " $124 | \n",
- " 3.0 | \n",
- " 0.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " 5.0 | \n",
- " 1.0 | \n",
- " 352.0 | \n",
- " I encourage you to use my kitchen, cooking and... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1002755 | \n",
- " NaN | \n",
- " 85098326012 | \n",
- " unconfirmed | \n",
- " Garry | \n",
- " Brooklyn | \n",
- " Clinton Hill | \n",
- " 40.68514 | \n",
- " -73.95976 | \n",
- " United States | \n",
- " ... | \n",
- " $74 | \n",
- " 30.0 | \n",
- " 270.0 | \n",
- " 7/5/2019 | \n",
- " 4.64 | \n",
- " 4.0 | \n",
- " 1.0 | \n",
- " 322.0 | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1003689 | \n",
- " Entire Apt: Spacious Studio/Loft by central park | \n",
- " 92037596077 | \n",
- " verified | \n",
- " Lyndon | \n",
- " Manhattan | \n",
- " East Harlem | \n",
- " 40.79851 | \n",
- " -73.94399 | \n",
- " United States | \n",
- " ... | \n",
- " $41 | \n",
- " 10.0 | \n",
- " 9.0 | \n",
- " 11/19/2018 | \n",
- " 0.10 | \n",
- " 3.0 | \n",
- " 1.0 | \n",
- " 289.0 | \n",
- " Please no smoking in the house, porch or on th... | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 26 columns
\n",
- "
"
- ],
- "text/plain": [
- " id NAME host id \\\n",
- "0 1001254 Clean & quiet apt home by the park 80014485718 \n",
- "1 1002102 Skylit Midtown Castle 52335172823 \n",
- "2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! 78829239556 \n",
- "3 1002755 NaN 85098326012 \n",
- "4 1003689 Entire Apt: Spacious Studio/Loft by central park 92037596077 \n",
- "\n",
- " host_identity_verified host name neighbourhood group neighbourhood \\\n",
- "0 unconfirmed Madaline Brooklyn Kensington \n",
- "1 verified Jenna Manhattan Midtown \n",
- "2 NaN Elise Manhattan Harlem \n",
- "3 unconfirmed Garry Brooklyn Clinton Hill \n",
- "4 verified Lyndon Manhattan East Harlem \n",
- "\n",
- " lat long country ... service fee minimum nights \\\n",
- "0 40.64749 -73.97237 United States ... $193 10.0 \n",
- "1 40.75362 -73.98377 United States ... $28 30.0 \n",
- "2 40.80902 -73.94190 United States ... $124 3.0 \n",
- "3 40.68514 -73.95976 United States ... $74 30.0 \n",
- "4 40.79851 -73.94399 United States ... $41 10.0 \n",
- "\n",
- " number of reviews last review reviews per month review rate number \\\n",
- "0 9.0 10/19/2021 0.21 4.0 \n",
- "1 45.0 5/21/2022 0.38 4.0 \n",
- "2 0.0 NaN NaN 5.0 \n",
- "3 270.0 7/5/2019 4.64 4.0 \n",
- "4 9.0 11/19/2018 0.10 3.0 \n",
- "\n",
- " calculated host listings count availability 365 \\\n",
- "0 6.0 286.0 \n",
- "1 2.0 228.0 \n",
- "2 1.0 352.0 \n",
- "3 1.0 322.0 \n",
- "4 1.0 289.0 \n",
- "\n",
- " house_rules license \n",
- "0 Clean up and treat the home the way you'd like... NaN \n",
- "1 Pet friendly but please confirm with me if the... NaN \n",
- "2 I encourage you to use my kitchen, cooking and... NaN \n",
- "3 NaN NaN \n",
- "4 Please no smoking in the house, porch or on th... NaN \n",
- "\n",
- "[5 rows x 26 columns]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv1.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "95f93b29-94be-4c93-9793-cf51c2ba2442",
- "metadata": {},
- "outputs": [],
- "source": [
- "csv02 = pd.read_csv(\"WAZE_REVIEWS.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "7f8b10d2-6225-47d8-82b5-b8041ee6412b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " review_id | \n",
- " pseudo_author_id | \n",
- " author_name | \n",
- " review_text | \n",
- " review_rating | \n",
- " review_likes | \n",
- " author_app_version | \n",
- " review_timestamp | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " 6caba53d-789d-4733-bad5-c7491daf80f2 | \n",
- " 152618553977019693742 | \n",
- " A Google user | \n",
- " Nice app need to add red light cam. | \n",
- " 5 | \n",
- " 0 | \n",
- " 0.99.2.3 | \n",
- " 2009-06-30 16:48:15 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " 30c15838-8b02-4dae-8f51-25905cb40b68 | \n",
- " 234382942865437071667 | \n",
- " A Google user | \n",
- " Really cool social app. Lots of potential to b... | \n",
- " 5 | \n",
- " 0 | \n",
- " 0.99.2.3 | \n",
- " 2009-06-30 16:58:43 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2 | \n",
- " c090400e-f88f-4129-930d-a650f3163a11 | \n",
- " 174473604608358796368 | \n",
- " A Google user | \n",
- " I was all excited about this app (ehat a great... | \n",
- " 1 | \n",
- " 0 | \n",
- " 0.99.2.3 | \n",
- " 2009-06-30 17:08:33 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 3 | \n",
- " f6f37456-793b-4786-af6e-454a811361bf | \n",
- " 286593453219054880269 | \n",
- " A Google user | \n",
- " I love this app! Lol | \n",
- " 5 | \n",
- " 0 | \n",
- " 0.99.2.3 | \n",
- " 2009-06-30 17:37:22 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 4 | \n",
- " 8ae5d962-7c0c-476d-82fa-79f6e5484acc | \n",
- " 167276875678680630145 | \n",
- " A Google user | \n",
- " Great app i like the idea of your car being pa... | \n",
- " 4 | \n",
- " 0 | \n",
- " 0.99.2.3 | \n",
- " 2009-06-30 23:58:43 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 review_id pseudo_author_id \\\n",
- "0 0 6caba53d-789d-4733-bad5-c7491daf80f2 152618553977019693742 \n",
- "1 1 30c15838-8b02-4dae-8f51-25905cb40b68 234382942865437071667 \n",
- "2 2 c090400e-f88f-4129-930d-a650f3163a11 174473604608358796368 \n",
- "3 3 f6f37456-793b-4786-af6e-454a811361bf 286593453219054880269 \n",
- "4 4 8ae5d962-7c0c-476d-82fa-79f6e5484acc 167276875678680630145 \n",
- "\n",
- " author_name review_text \\\n",
- "0 A Google user Nice app need to add red light cam. \n",
- "1 A Google user Really cool social app. Lots of potential to b... \n",
- "2 A Google user I was all excited about this app (ehat a great... \n",
- "3 A Google user I love this app! Lol \n",
- "4 A Google user Great app i like the idea of your car being pa... \n",
- "\n",
- " review_rating review_likes author_app_version review_timestamp \n",
- "0 5 0 0.99.2.3 2009-06-30 16:48:15 \n",
- "1 5 0 0.99.2.3 2009-06-30 16:58:43 \n",
- "2 1 0 0.99.2.3 2009-06-30 17:08:33 \n",
- "3 5 0 0.99.2.3 2009-06-30 17:37:22 \n",
- "4 4 0 0.99.2.3 2009-06-30 23:58:43 "
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv02.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "39b1151a-655a-4191-8fcb-2ff1b40e5edf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " review_id | \n",
- " pseudo_author_id | \n",
- " author_name | \n",
- " review_text | \n",
- " review_rating | \n",
- " review_likes | \n",
- " author_app_version | \n",
- " review_timestamp | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 780068 | \n",
- " 780068 | \n",
- " 01655504-5a51-4c19-b313-2bd5fa3f253a | \n",
- " 680743620884748258838 | \n",
- " Ma********ll | \n",
- " Freezes | \n",
- " 3 | \n",
- " 0 | \n",
- " NaN | \n",
- " 2023-11-17 03:18:26 | \n",
- "
\n",
- " \n",
- " | 780069 | \n",
- " 780069 | \n",
- " f04306cb-af60-4a44-aebc-c37122620319 | \n",
- " 266638684561117704682 | \n",
- " Zu******el | \n",
- " To stuck | \n",
- " 1 | \n",
- " 0 | \n",
- " NaN | \n",
- " 2023-11-17 03:18:38 | \n",
- "
\n",
- " \n",
- " | 780070 | \n",
- " 780070 | \n",
- " 894e3c41-ca20-4781-9308-70eeb060a865 | \n",
- " 154572309081670894420 | \n",
- " br**********ji | \n",
- " racist made app | \n",
- " 1 | \n",
- " 0 | \n",
- " 4.99.0.2 | \n",
- " 2023-11-17 03:23:20 | \n",
- "
\n",
- " \n",
- " | 780071 | \n",
- " 780071 | \n",
- " 4fafb0b1-485e-473e-9bcd-d5c9848424d2 | \n",
- " 154995071911163107981 | \n",
- " Mo***********da | \n",
- " بهترین مثل همیشه.با آی پی ثابت های کانال تلگرا... | \n",
- " 5 | \n",
- " 0 | \n",
- " 4.99.1.1 | \n",
- " 2023-11-17 04:05:02 | \n",
- "
\n",
- " \n",
- " | 780072 | \n",
- " 780072 | \n",
- " d1570ba0-ffc5-4fc6-8d34-12daba4b38e2 | \n",
- " 200574835524973617311 | \n",
- " Re***********iz | \n",
- " Best app ever used. | \n",
- " 5 | \n",
- " 0 | \n",
- " 4.99.0.2 | \n",
- " 2023-11-17 04:06:44 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Unnamed: 0 review_id \\\n",
- "780068 780068 01655504-5a51-4c19-b313-2bd5fa3f253a \n",
- "780069 780069 f04306cb-af60-4a44-aebc-c37122620319 \n",
- "780070 780070 894e3c41-ca20-4781-9308-70eeb060a865 \n",
- "780071 780071 4fafb0b1-485e-473e-9bcd-d5c9848424d2 \n",
- "780072 780072 d1570ba0-ffc5-4fc6-8d34-12daba4b38e2 \n",
- "\n",
- " pseudo_author_id author_name \\\n",
- "780068 680743620884748258838 Ma********ll \n",
- "780069 266638684561117704682 Zu******el \n",
- "780070 154572309081670894420 br**********ji \n",
- "780071 154995071911163107981 Mo***********da \n",
- "780072 200574835524973617311 Re***********iz \n",
- "\n",
- " review_text review_rating \\\n",
- "780068 Freezes 3 \n",
- "780069 To stuck 1 \n",
- "780070 racist made app 1 \n",
- "780071 بهترین مثل همیشه.با آی پی ثابت های کانال تلگرا... 5 \n",
- "780072 Best app ever used. 5 \n",
- "\n",
- " review_likes author_app_version review_timestamp \n",
- "780068 0 NaN 2023-11-17 03:18:26 \n",
- "780069 0 NaN 2023-11-17 03:18:38 \n",
- "780070 0 4.99.0.2 2023-11-17 03:23:20 \n",
- "780071 0 4.99.1.1 2023-11-17 04:05:02 \n",
- "780072 0 4.99.0.2 2023-11-17 04:06:44 "
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "csv02.tail()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "2fc95472-e0ae-45f2-86fd-4aa023239c0d",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " source | \n",
- " review_id | \n",
- " user_name | \n",
- " review_title | \n",
- " review_description | \n",
- " rating | \n",
- " thumbs_up | \n",
- " review_date | \n",
- " developer_response | \n",
- " developer_response_date | \n",
- " appVersion | \n",
- " laguage_code | \n",
- " country_code | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Google Play | \n",
- " 18d6584c-d0e9-4833-a744-f607058aee97 | \n",
- " Milky Way | \n",
- " NaN | \n",
- " Suddenly, the driver can't have my location an... | \n",
- " 1 | \n",
- " 0.0 | \n",
- " 2023-08-10 17:48:51 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Google Play | \n",
- " 50a08f18-cece-4ddf-b617-028844c8aa28 | \n",
- " Bradlee Severa | \n",
- " NaN | \n",
- " Very cordial.. And helped with a quick turnaro... | \n",
- " 5 | \n",
- " 0.0 | \n",
- " 2023-08-10 17:38:35 | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.485.10000 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Google Play | \n",
- " b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 | \n",
- " Amit Aggarwal | \n",
- " NaN | \n",
- " Very good experience | \n",
- " 5 | \n",
- " 0.0 | \n",
- " 2023-08-10 17:38:17 | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.486.10002 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Google Play | \n",
- " 502702a9-25ed-4373-a96c-7fa1f06caacd | \n",
- " Bryant Inman | \n",
- " NaN | \n",
- " All I use | \n",
- " 5 | \n",
- " 0.0 | \n",
- " 2023-08-10 17:37:45 | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.467.10008 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Google Play | \n",
- " f47a3fb6-23db-49bd-9e63-f33c8d724d07 | \n",
- " Addie Whittaker | \n",
- " NaN | \n",
- " I have enjoyed traveling by Uber my drivers ha... | \n",
- " 5 | \n",
- " 0.0 | \n",
- " 2023-08-10 17:36:56 | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.486.10002 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " source review_id user_name \\\n",
- "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n",
- "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n",
- "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n",
- "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n",
- "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n",
- "\n",
- " review_title review_description rating \\\n",
- "0 NaN Suddenly, the driver can't have my location an... 1 \n",
- "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n",
- "2 NaN Very good experience 5 \n",
- "3 NaN All I use 5 \n",
- "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n",
- "\n",
- " thumbs_up review_date developer_response developer_response_date \\\n",
- "0 0.0 2023-08-10 17:48:51 NaN NaN \n",
- "1 0.0 2023-08-10 17:38:35 NaN NaN \n",
- "2 0.0 2023-08-10 17:38:17 NaN NaN \n",
- "3 0.0 2023-08-10 17:37:45 NaN NaN \n",
- "4 0.0 2023-08-10 17:36:56 NaN NaN \n",
- "\n",
- " appVersion laguage_code country_code \n",
- "0 NaN en in \n",
- "1 4.485.10000 en in \n",
- "2 4.486.10002 en in \n",
- "3 4.467.10008 en in \n",
- "4 4.486.10002 en in "
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "uber = pd.read_csv(\"Uber Customer Reviews.csv\", low_memory=False)\n",
- "uber.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "d4ace5a2-346a-4099-9854-1cac2749a216",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(1069616, 13)\n"
- ]
- }
- ],
- "source": [
- "print(np.shape(uber))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "ad7ac03d-a9df-4688-ad3c-8e354996f52c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " source | \n",
- " review_id | \n",
- " user_name | \n",
- " review_title | \n",
- " review_description | \n",
- " rating | \n",
- " thumbs_up | \n",
- " review_date | \n",
- " developer_response | \n",
- " developer_response_date | \n",
- " appVersion | \n",
- " laguage_code | \n",
- " country_code | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " Google Play | \n",
- " fbc7ffc9-5a89-446e-87fd-d69bf4a7f984 | \n",
- " Puipuii Ralte | \n",
- " NaN | \n",
- " The map in Ola is so messed up, i have to pay ... | \n",
- " 1 | \n",
- " 0.0 | \n",
- " 2023-08-10 16:40:50 | \n",
- " NaN | \n",
- " NaN | \n",
- " 6.3.2 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " Google Play | \n",
- " 5a0051fb-220a-45b2-ba94-a15a2949218f | \n",
- " Deepak Kumar | \n",
- " NaN | \n",
- " Deepak Kumar.... 🙏🙏🙏🙏🙏] | \n",
- " 5 | \n",
- " 0.0 | \n",
- " 2023-08-10 16:36:14 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " Google Play | \n",
- " 71ebf933-b734-474d-bb65-a18c90906ed2 | \n",
- " Ahamed Azarudeen | \n",
- " NaN | \n",
- " Such aa irresponsible app more then I waiting ... | \n",
- " 1 | \n",
- " 0.0 | \n",
- " 2023-08-10 16:29:31 | \n",
- " NaN | \n",
- " NaN | \n",
- " 6.3.1 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " Google Play | \n",
- " e1cc0010-60b3-4126-99c2-e8549088566a | \n",
- " Rahil Syed | \n",
- " NaN | \n",
- " Worst | \n",
- " 1 | \n",
- " 0.0 | \n",
- " 2023-08-10 15:52:06 | \n",
- " NaN | \n",
- " NaN | \n",
- " 5.0.4 | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " Google Play | \n",
- " 77cf1be1-b428-4493-ae25-e0f288f79b8f | \n",
- " vin 007 | \n",
- " NaN | \n",
- " Too much expensive .. try UBer... They are pro... | \n",
- " 1 | \n",
- " 0.0 | \n",
- " 2023-08-10 15:51:10 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " en | \n",
- " in | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " source review_id user_name \\\n",
- "0 Google Play fbc7ffc9-5a89-446e-87fd-d69bf4a7f984 Puipuii Ralte \n",
- "1 Google Play 5a0051fb-220a-45b2-ba94-a15a2949218f Deepak Kumar \n",
- "2 Google Play 71ebf933-b734-474d-bb65-a18c90906ed2 Ahamed Azarudeen \n",
- "3 Google Play e1cc0010-60b3-4126-99c2-e8549088566a Rahil Syed \n",
- "4 Google Play 77cf1be1-b428-4493-ae25-e0f288f79b8f vin 007 \n",
- "\n",
- " review_title review_description rating \\\n",
- "0 NaN The map in Ola is so messed up, i have to pay ... 1 \n",
- "1 NaN Deepak Kumar.... 🙏🙏🙏🙏🙏] 5 \n",
- "2 NaN Such aa irresponsible app more then I waiting ... 1 \n",
- "3 NaN Worst 1 \n",
- "4 NaN Too much expensive .. try UBer... They are pro... 1 \n",
- "\n",
- " thumbs_up review_date developer_response developer_response_date \\\n",
- "0 0.0 2023-08-10 16:40:50 NaN NaN \n",
- "1 0.0 2023-08-10 16:36:14 NaN NaN \n",
- "2 0.0 2023-08-10 16:29:31 NaN NaN \n",
- "3 0.0 2023-08-10 15:52:06 NaN NaN \n",
- "4 0.0 2023-08-10 15:51:10 NaN NaN \n",
- "\n",
- " appVersion laguage_code country_code \n",
- "0 6.3.2 en in \n",
- "1 NaN en in \n",
- "2 6.3.1 en in \n",
- "3 5.0.4 en in \n",
- "4 NaN en in "
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ola_df = pd.read_csv(\"Ola Customer Reviews.csv\", low_memory=False)\n",
- "ola_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "878a39c4-45d5-41d6-82b0-9c373c28e280",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "count 357678.000000\n",
- "mean 92.402697\n",
- "std 125.489169\n",
- "min 1.000000\n",
- "25% 8.000000\n",
- "50% 33.000000\n",
- "75% 131.000000\n",
- "max 2877.000000\n",
- "Name: review_length, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Check average review length\n",
- "ola_df['review_length'] = ola_df['review_description'].str.len()\n",
- "print(ola_df['review_length'].describe())\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "1dd032ba-343b-4402-9d96-ee5e0432ab07",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Substantive reviews: 204715\n"
- ]
- }
- ],
- "source": [
- "# Filter out very short reviews\n",
- "substantive_reviews = ola_df[ola_df['review_length'] > 20]\n",
- "print(f\"Substantive reviews: {len(substantive_reviews)}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "2e58bf99-c08e-4e41-9b98-124b3f9e6145",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "count 1.069447e+06\n",
- "mean 7.023987e+01\n",
- "std 1.158196e+02\n",
- "min 1.000000e+00\n",
- "25% 8.000000e+00\n",
- "50% 2.100000e+01\n",
- "75% 7.800000e+01\n",
- "max 3.792000e+03\n",
- "Name: review_length, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Check average review length\n",
- "uber['review_length'] = uber['review_description'].str.len()\n",
- "print(uber['review_length'].describe())\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "2dd05939-e87c-443d-9012-e5f45cf64ff5",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Substantive reviews: 542110\n"
- ]
- }
- ],
- "source": [
- "# Filter out very short reviews\n",
- "substantive_reviews = uber[uber['review_length'] > 20]\n",
- "print(f\"Substantive reviews: {len(substantive_reviews)}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "75ad8e81-3f11-4152-9494-b95bbba6fa01",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/multitag/multitag.py b/multitag/multitag.py
index a728777..32aca56 100644
--- a/multitag/multitag.py
+++ b/multitag/multitag.py
@@ -1,3 +1,14 @@
+# TODO: Refactor, especially change the expected names, as I jumped the gun when first making this without sampling properly
+# TODO: Add button labels and finalise the categories of aspects
+# TODO: Ensure there is a persistent progress tracking implementation before labelling
+# TODO: Finalise keybinds
+# TODO: Display progress e.g. review 1020 of 5000
+# TODO: Validate saving progress
+# TODO: Loop instead of pressing enter
+# TODO: Autosave ? / confirm quit at least
+# TODO: More visual q's
+
+
import tkinter as tk
from tkinter import ttk
import pandas as pd
diff --git a/multitag/preprocess.py b/multitag/preprocess.py
index 9167b1a..646378b 100644
--- a/multitag/preprocess.py
+++ b/multitag/preprocess.py
@@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path):
df['word_count'] = df['review_clean'].str.split().str.len()
# 5. Remove short reviews
- review_length_limit = 5
- print(f"\n4. Removing short reviews (< {review_length_limit})...")
- print(" Rationale: Insufficient context for classification")
+ review_length_limit = 5 ### limit review length ###
+ print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...")
+ # 1 word reviews provide little to draw conclusions from and bloat the
+ # dataset a lot, nearly 50% of reviews!
+
+ # display changes
before = len(df)
df = df[df['word_count'] >= review_length_limit]
removed = before - len(df)
@@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path):
print("PREPROCESSING COMPLETE")
print("="*50)
print(f"\nFinal dataset: {len(df_clean):,} reviews")
- print(f"Data source: Indian Uber market (predominantly English)")
- print(f"Quality filters: word_count >= 5, duplicates removed")
+ print(f"Quality filters: word_count >= 5, duplicates removed")
+ # While this does remove some legitimate reviews that would be useful for classification,
+ # it also allows us to find a higher total amount of useful reviews. After seeing the results for limits of 1, 2, 3, 4 and 5,
+ # a limit of 5 showed the largest number of informative reviews without seeming excessive in data removal.
print("\nRating distribution:")
rating_dist = df_clean['rating'].value_counts().sort_index()
@@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path):
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
- # lang detection takes 5+ mins
+ # lang detection takes 5+ mins so leaving it commented for now
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
@@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path):
if len(df_clean[df_clean['rating'] == rating]) > 0:
sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
print(f"\n{rating} {"✭" * rating} REVIEWS:")
- for idx, row in sample.iterrows():
+ for index, row in sample.iterrows():
print(f" • ({row['word_count']} words) {row['review'][:100]}")
# Note about language
print("Language detection not applied due to unreliability on short")
- print("informal text. Dataset is from the Indian market, labeled as English.")
- print("Manual annotation phase will identify any non-English reviews. And put aside.")
+ print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
+ print(" ...Manual annotation phase will identify any non-English reviews")
return df_clean
diff --git a/multitag/sampler.py b/multitag/sampler.py
index a90a9a4..25a16ae 100644
--- a/multitag/sampler.py
+++ b/multitag/sampler.py
@@ -1,22 +1,45 @@
+# TODO: Fix get_stratified_sample(): replace broken x() with actual working logic
+# TODO: Add verification comparison between ratings
+# TODO: Implement sample_with_keywords(): add to the keyword lists and implement the logic
+# TODO: Clean up the logging print statements
+
+
import pandas as pd
import numpy as np
print(pd.__version__)
print(np.__version__)
-path = "data/uber_reviews.csv"
-sampled_path = "data/uber_reviews_sampled.csv"
+path = "multitag/data/uber_reviews_cleaned.csv"
+sampled_path = "multitag/data/uber_reviews_sampled.csv"
+original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
class Sampler:
- def __init__(self, data_path):
+ def __init__(self, data_path, target_samples):
self.data_path = data_path
+ self.target_samples = 5000 # target number of samples
+ self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
+
+ self.original_data = pd.read_csv(original_path, low_memory=False)
self.data = pd.read_csv(self.data_path, low_memory=False)
self.total = len(self.data) # total number of records in the dataset
- self.target_samples = 5000 # target number of samples
- self.stratify_column = "rating" # column to stratify by
+ print("="*50)
+ print("SAMPLER INITIALIZED")
+ print("="*50,"\n")
+
+
+ print(f"Total records in dataset: {self.total}")
print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
- print(self.data.head())
+ #print(self.data.head())
+ #print(f"\nCurrent distribution:")
+ #print(self.data[self.stratify_column].value_counts().sort_index())
+ #print(f"\nColumns: {self.data.columns.tolist()}")
+ print(f"Percentage distribution (working data):")
+ print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
+ _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
+ print(f"Original Distribution from {original_path}:")
+ print((_origdist*100).round(1),"\n")
self.data.info()
@@ -31,36 +54,128 @@ class Sampler:
2 3.9% (41707)
Name: proportion, dtype: object
"""
-
- def get_stratified_sample(self):
- stratified_sample = self.data.groupby(self.stratify_column).apply(
- lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)),
- # include_groups=False
- )
- return stratified_sample
-sampler = Sampler("data/uber_reviews.csv")
-
-
-
-to_sample = input("Do you want to create a stratified sample of the data? (y/n): ")
-
-if to_sample == 'y':
- sampled = sampler.get_stratified_sample()
- sampled.to_csv("data/uber_reviews_sampled.csv", index=False)
- print("Original columns:", sampler.data.columns.tolist())
- print("Sampled columns:", sampled.columns.tolist())
- print("Stratified sample saved to data/uber_reviews_sampled.csv")
-elif to_sample == 'n':
- sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False)
- """
- debug to check sampled data matches original columns
- print("Original columns:", sampler.data.columns.tolist())
- print("Sampled columns:", sampled_data.columns.tolist())
"""
- print("Original data distribution:")
- print(sampler.data["rating"].value_counts())
- print("Sampled data distribution:")
- print(sampled_data["rating"].value_counts())
-else:
- print("Invalid input, please enter 'y' or 'n'")
+ Sample size by rating
+ Redundant calculation, kept for clarity
+ Doesn't factor that the distribution changed greatly after preprocessing
+
+ """
+ def get_stratified_sample(self) -> pd.Series:
+ stratified_sample = self.data.groupby(self.stratify_column).apply(self.x)
+ return stratified_sample
+
+
+ # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
+ def x(self, ):
+ return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples))
+ """
+ get_proportional_sample()
+
+ """
+
+ """
+ original_distribution_sample()
+ The main sampling method for our labelling as it
+ keeps composition of the original uber dataset
+ which is a fairer comparison, may also work better in general
+
+ inputs:
+
+ outputs:
+
+ """
+ def original_distribution_sample(self):
+ original_dist = {
+ 5: int(0.571 * self.target_samples),
+ 1: int(0.265 * self.target_samples),
+ 4: int(0.078 * self.target_samples),
+ 3: int(0.047 * self.target_samples),
+ 2: int(0.039 * self.target_samples)
+ }
+ print("Target Distribution =", original_dist)
+ samples = []
+ for rating, num_samples in original_dist.items():
+ rating_data = self.data[self.data[self.stratify_column] == rating]
+ if len(rating_data) < num_samples:
+ print("Missing samples available for rating")
+ num_samples = len(rating_data)
+ sample = rating_data.sample(n = num_samples,random_state=33)
+ samples.append(sample)
+ original_sample = pd.concat(samples, ignore_index=True)
+ return original_sample
+
+ """
+ sample_with_keywords()
+
+ In order to train on more bugs and features data in
+ future this method was created
+ - 2000 balanced by rating (400 per)
+ - 1500 likely bugs using bug_keywords list
+ - 1500 likely features using feature_keywords list
+
+ inputs:
+ outputs:
+
+ """
+
+ def sample_with_keywords():
+ #TODO add keywords for feature classification
+ print(f"\n{"="*50}")
+ print("Keyword influenced / rating stratified set")
+ print(f"\n{"="*50}")
+
+ bug_keywords = ["crash","crashes", "freeze", "freezes", "error",
+ "stops", "doesnt work", "doesn't work","loading",
+ "blank", "stuck", "load", "loads", "broken", "breaks",
+ "glitch", "glitches", "issue", "could you", "fix",
+ "failed"]
+
+
+ return
+
+ def save_sample(self, sample_df,output_path):
+ """Save sample and display statistics"""
+ sample_df.to_csv(output_path, index=False)
+
+ print(f"\n{'='*50}")
+ print("SAMPLE SAVED")
+ print(f"{'='*50}")
+ print(f"Location: {output_path}")
+ print(f"Total samples: {len(sample_df):,}")
+ print(f"\nDistribution:")
+ for rating in sorted(sample_df[self.stratify_column].unique()):
+ count = (sample_df[self.stratify_column] == rating).sum()
+ pct = count / len(sample_df) * 100
+ print(f" {rating}★: {count:,} ({pct:.1f}%)")
+
+def main():
+
+ sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
+
+ # Choose sampling strategy
+ print(f"\n{'='*50}")
+ print("SAMPLING STRATEGY OPTIONS")
+ print(f"{'='*50}")
+ print("1. get_stratified_sample() stratified by current distribution")
+ print("2. original_distribution_sample() stratified by the original data distribution")
+ print("3. get_keyword_boosted_sample() stratified using original distribution but also using a keyword dictionary")
+
+ choice = input("\nEnter choice (1-3): ").strip()
+
+ if choice == '1':
+ sample = sampler.get_stratified_sample()
+ sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+
+ elif choice == '2':
+ sample = sampler.original_distribution_sample()
+ sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+
+ elif choice == '3':
+ sample = sampler.get_keyword_boosted_sample()
+ sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file