diff --git a/datasets_reviews.ipynb b/datasets_reviews.ipynb index 14f23a8..d71bf3a 100644 --- a/datasets_reviews.ipynb +++ b/datasets_reviews.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "f3da59fb-eb6b-449f-b8d5-95ddacd456f2", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "0c897ead-dfb5-4d18-bcfc-949824a0868f", "metadata": {}, "outputs": [], @@ -24,17 +24,34 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "75ad8e81-3f11-4152-9494-b95bbba6fa01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\ch\\\\google-drive\\\\Charlie_6013_RECLASS\\\\Data\\\\Raw\\\\Uber Customer Reviews.csv'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m uber_df = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43muber\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlow_memory\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[39m, in \u001b[36mread_csv\u001b[39m\u001b[34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[39m\n\u001b[32m 1013\u001b[39m kwds_defaults = _refine_defaults_read(\n\u001b[32m 1014\u001b[39m dialect,\n\u001b[32m 1015\u001b[39m delimiter,\n\u001b[32m (...)\u001b[39m\u001b[32m 1022\u001b[39m dtype_backend=dtype_backend,\n\u001b[32m 1023\u001b[39m )\n\u001b[32m 1024\u001b[39m kwds.update(kwds_defaults)\n\u001b[32m-> \u001b[39m\u001b[32m1026\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[39m, in \u001b[36m_read\u001b[39m\u001b[34m(filepath_or_buffer, kwds)\u001b[39m\n\u001b[32m 617\u001b[39m _validate_names(kwds.get(\u001b[33m\"\u001b[39m\u001b[33mnames\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[32m 619\u001b[39m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m620\u001b[39m parser = \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 622\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[39m, in \u001b[36mTextFileReader.__init__\u001b[39m\u001b[34m(self, f, engine, **kwds)\u001b[39m\n\u001b[32m 1617\u001b[39m \u001b[38;5;28mself\u001b[39m.options[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m] = kwds[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1619\u001b[39m \u001b[38;5;28mself\u001b[39m.handles: IOHandles | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1620\u001b[39m \u001b[38;5;28mself\u001b[39m._engine = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[39m, in \u001b[36mTextFileReader._make_engine\u001b[39m\u001b[34m(self, f, engine)\u001b[39m\n\u001b[32m 1878\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[32m 1879\u001b[39m mode += \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1880\u001b[39m \u001b[38;5;28mself\u001b[39m.handles = \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1881\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1882\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1883\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1884\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcompression\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1885\u001b[39m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmemory_map\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1886\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1887\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding_errors\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstrict\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1888\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstorage_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1889\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1890\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1891\u001b[39m f = \u001b[38;5;28mself\u001b[39m.handles.handle\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[39m, in \u001b[36mget_handle\u001b[39m\u001b[34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[39m\n\u001b[32m 868\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m 869\u001b[39m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[32m 870\u001b[39m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[32m 871\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m ioargs.encoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs.mode:\n\u001b[32m 872\u001b[39m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m873\u001b[39m handle = \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 874\u001b[39m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 875\u001b[39m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 876\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 877\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 878\u001b[39m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 880\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 881\u001b[39m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[32m 882\u001b[39m handle = \u001b[38;5;28mopen\u001b[39m(handle, ioargs.mode)\n", + "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'C:\\\\Users\\\\ch\\\\google-drive\\\\Charlie_6013_RECLASS\\\\Data\\\\Raw\\\\Uber Customer Reviews.csv'" + ] + } + ], "source": [ "uber_df = pd.read_csv(uber, low_memory=False)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "9b8469b3-c606-461f-aaef-9619b7dc1ffd", "metadata": {}, "outputs": [ @@ -200,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "1709a2cc-4f7a-4e77-994e-68668612caff", "metadata": {}, "outputs": [ @@ -221,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "06c0c03c-14ba-4451-a6ea-44d36e85327c", "metadata": {}, "outputs": [ @@ -254,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "d22d3bce-eac0-4d02-a4ef-38343f4958ff", "metadata": {}, "outputs": [ @@ -288,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "e08f5eae-7921-4526-b8fd-29038c55e1bb", "metadata": {}, "outputs": [ @@ -345,7 +362,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.14.0" } }, "nbformat": 4, diff --git a/multitag/preprocessing_uber.ipynb b/multitag/preprocessing_uber.ipynb index 68d6796..5f5e245 100644 --- a/multitag/preprocessing_uber.ipynb +++ b/multitag/preprocessing_uber.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "id": "470fe7c6-1614-4daf-879f-e6c399117c7b", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 13, "id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5", "metadata": {}, "outputs": [ @@ -178,7 +178,7 @@ "4 4.486.10002 en in " ] }, - "execution_count": 24, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -189,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 14, "id": "5c02ec54-4583-4720-88c6-1110b52c3f88", "metadata": {}, "outputs": [ @@ -213,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 15, "id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a", "metadata": {}, "outputs": [ @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 16, "id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4", "metadata": {}, "outputs": [], @@ -250,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 17, "id": "55324c94-4944-4844-b00e-dc08c8989f7b", "metadata": {}, "outputs": [ @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "id": "c45959fe-3e23-4831-a41a-94c89892247f", "metadata": {}, "outputs": [ @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 19, "id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa", "metadata": {}, "outputs": [ @@ -322,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 20, "id": "8ccc07fa-9913-4047-ae17-35d2454eb059", "metadata": {}, "outputs": [ @@ -335,67 +335,65 @@ "1 STAR REVIEWS:\n", "==========================================\n", "\n", - "cant put gift card on dont like app\n", - "(Length: 8.0 words)\n", + "Once drivers confirm the rider its showing with in 2 minutes but they take 25 minutes more and if driver cancel the ride I pay for that in next ride it's redicules\n", + "(Length: 32.0 words)\n", "\n", - "Zapłaciłem za przejazd, uber pobral środki z mojego konta. Potem byla aktualizacja ceny na niższą i znowu kazał płacić. Teraz aplikacja zablokowała się na ekranie potwierdzenia płatności.\n", - "(Length: 27.0 words)\n", + "they charge very high as they show before the ride\n", + "(Length: 10.0 words)\n", "\n", - "The app hasn't been able to process any payment. Takes forever to find a ride. I don't even know why this app still exists. Absolutely useless!\n", - "(Length: 26.0 words)\n", + "scam drivers, worst service\n", + "(Length: 4.0 words)\n", "\n", "==========================================\n", "2 STAR REVIEWS:\n", "==========================================\n", "\n", - "In spite of receiving payment and acknowledging by email the app shows \n", - "payment due and disallowed booking and service not available to me. 4 days \n", - "have lapsed no solution to my problem. Problem solvi...\n", - "(Length: 37.0 words)\n", + "Drivers always ask is destination and cancel if they dont want to go ? Cant they already see destination before accepting ride ?\n", + "(Length: 23.0 words)\n", "\n", - "Poor\n", - "(Length: 1.0 words)\n", + "she hole her phone on her hand she driving 80\n", + "(Length: 10.0 words)\n", "\n", - "I had to reset my password and now I cant get in. Its telling me that my phone number is already in use. I need this fixed\n", - "(Length: 27.0 words)\n", + "I7u.6f هنه\n", + "(Length: 2.0 words)\n", "\n", "==========================================\n", "3 STAR REVIEWS:\n", "==========================================\n", "\n", - "Nice\n", + "I think this app is very important to me\n", + "(Length: 9.0 words)\n", + "\n", + "Ok\n", "(Length: 1.0 words)\n", "\n", - "Good rides\n", - "(Length: 2.0 words)\n", - "\n", - "Nice\n", - "(Length: 1.0 words)\n", + "The rate will be one while booking and after the ride it changes if asked the reason is due to traffic, but it should be mentioned first only, the destination time is also calculated by uber only alon...\n", + "(Length: 53.0 words)\n", "\n", "==========================================\n", "4 STAR REVIEWS:\n", "==========================================\n", "\n", - "Good service\n", - "(Length: 2.0 words)\n", + "Nice\n", + "(Length: 1.0 words)\n", "\n", - "A mobile number of the car driver should be an icon if Uber book for any other person, then it can be given the number.\n", - "(Length: 25.0 words)\n", + "Good app but sometimes it take long time to get booking even the cabs are nearby and sometimes they even cancel the ride after confirming and making us wait for 30 mins and above\n", + "(Length: 34.0 words)\n", "\n", - "many times pick up locations is shifted automatically . overall good much better\n", - "(Length: 13.0 words)\n", + "its good and helpful.. Thank you\n", + "(Length: 6.0 words)\n", "\n", "==========================================\n", "5 STAR REVIEWS:\n", "==========================================\n", "\n", - "So friendly. Thank you\n", - "(Length: 4.0 words)\n", + "good service\n", + "(Length: 2.0 words)\n", "\n", - "comfortable journey with effodable price\n", - "(Length: 5.0 words)\n", + "Drivers have been getting us home quickly and effectively.\n", + "(Length: 9.0 words)\n", "\n", - "Good\n", + "Hbby\n", "(Length: 1.0 words)\n" ] } @@ -419,6 +417,22 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87a15e76-51c8-4586-acea-ca3176c18757", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73c4bbb9-3f8e-4b4c-8538-539b140cf610", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -426,6 +440,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.0" } }, "nbformat": 4, diff --git a/multitag/sampler.py b/multitag/sampler.py index cff9ab2..35eb5f6 100644 --- a/multitag/sampler.py +++ b/multitag/sampler.py @@ -1,6 +1,4 @@ -# TODO: Fix get_stratified_sample() replace broken x() with actual working logic # TODO: Add verification comparison between ratings -# TODO: implement sample_with_keywords() add to lists, and implement logic # TODO: Clean up the logging print statements @@ -177,10 +175,15 @@ class Sampler: # Drop helper columns keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature']) + + print(f"\n Total samples: {len(keyword_sample):,}") return keyword_sample + def sample_tiny_size(self): + mini_sample = self.data.sample(200) # reading some samples manually + return mini_sample @@ -211,7 +214,7 @@ def main(): print("2. original_distribution_sample() stratified by the original data distribution") print("3. get_keyword_boosted_sample() stratified using original distribution but also using a keyword dictionary") - choice = input("\nEnter choice (1-3): ").strip() + choice = input("\nEnter choice (1-4): ").strip() if choice == '1': sample = sampler.get_stratified_sample() @@ -224,6 +227,10 @@ def main(): elif choice == '3': sample = sampler.sample_with_keywords() sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + + elif choice == '4': + sample = sampler.sample_tiny_size() + sampler.save_sample(sample,"multitag/data/uber_review_temp.csv") diff --git a/multitag/uber_cleaned.ipynb b/multitag/uber_cleaned.ipynb index 4bc9287..66224fd 100644 --- a/multitag/uber_cleaned.ipynb +++ b/multitag/uber_cleaned.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 23, + "execution_count": 1, "id": "739e61bf", "metadata": {}, "outputs": [], @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "d9da1b98", "metadata": {}, "outputs": [ @@ -55,7 +55,7 @@ " \n", "