diff --git a/.gitignore b/.gitignore index 8054551..3ae4918 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ -multitag/data/*.csv -multitag/raw_data/ -multitag/.ipynb_checkpoints -multitag/.vscode +data/*.csv +raw_data/ +*.ipynb_checkpoints +*.vscode +models/ +*.pt +*.pth +.ipynb_checkpoints/ diff --git a/README.md b/README.md index 38e47e6..a5f112e 100644 --- a/README.md +++ b/README.md @@ -1 +1,75 @@ -# 6013 +# RECLASS: Multi-Task Deep Learning for App Review Classification + +**COMP6013 | Oxford Brookes University | 2025-26** + +--- + +## Project Overview + +RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads. + +| Task | Output | Classes | +|------|--------|---------| +| Bug Report Detection | Binary | Yes / No | +| Feature Request Detection | Binary | Yes / No | +| Aspect Classification | Multi-class | Driver, App, Pricing, Service, Payment, General | +| Aspect Sentiment | Multi-class | Positive, Neutral, Negative | + +## Dataset + +- **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews) +- **Original size**: 1,069,616 reviews +- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews) +- **Annotation target**: 5,000 manually labelled reviews + +## Repository Structure + +``` +6013/ +README.md +requirements.txt + multitag/ + data/ + uber_reviews.csv # Raw dataset + uber_reviews_cleaned.csv # Preprocessed reviews + uber_reviews_sampled.csv # Stratified sample for annotation + uber_reviews_tagged.csv # Annotated reviews (in progress) + notebooks/ + datasets_reviews.ipynb # Initial data exploration + preprocessing_uber.ipynb # Preprocessing analysis + uber_cleaned.ipynb # Cleaned data verification + src/ + preprocess.py # Text cleaning and filtering pipeline + sampler.py # Stratified sampling strategies + multitag.py # GUI annotation tool + train.py # Model training (in progress) + infer.py # Inference 
pipeline (in progress) +``` + +## Current Progress + +- Manual annotation of 5,000 reviews +- BERT baseline implementation +- Multi-task model architecture +- Training and evaluation +- Comparative analysis (MTL vs single-task) +- Final report and presentation + +## Installation + +``` +# Clone repository +... +# Create conda environment +... +# Install dependencies +...requirements.txt +``` + +## Usage +## References +## Licenses + +--- + +*Last updated: January 2026* \ No newline at end of file diff --git a/multitag/notebooks/datasets_reviews.ipynb b/multitag/notebooks/datasets_reviews.ipynb deleted file mode 100644 index 3668e1a..0000000 --- a/multitag/notebooks/datasets_reviews.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "f3da59fb-eb6b-449f-b8d5-95ddacd456f2", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0c897ead-dfb5-4d18-bcfc-949824a0868f", - "metadata": {}, - "outputs": [], - "source": [ - "uber = Path.home() / 'google-drive' / 'Charlie_6013_RECLASS' / 'Data' / 'Raw' / 'Uber Customer Reviews.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "75ad8e81-3f11-4152-9494-b95bbba6fa01", - "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\ch\\\\google-drive\\\\Charlie_6013_RECLASS\\\\Data\\\\Raw\\\\Uber Customer Reviews.csv'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m uber_df = 
\u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43muber\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlow_memory\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1026\u001b[39m, in \u001b[36mread_csv\u001b[39m\u001b[34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[39m\n\u001b[32m 1013\u001b[39m kwds_defaults = _refine_defaults_read(\n\u001b[32m 1014\u001b[39m dialect,\n\u001b[32m 1015\u001b[39m delimiter,\n\u001b[32m (...)\u001b[39m\u001b[32m 1022\u001b[39m dtype_backend=dtype_backend,\n\u001b[32m 1023\u001b[39m )\n\u001b[32m 1024\u001b[39m kwds.update(kwds_defaults)\n\u001b[32m-> \u001b[39m\u001b[32m1026\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:620\u001b[39m, in \u001b[36m_read\u001b[39m\u001b[34m(filepath_or_buffer, kwds)\u001b[39m\n\u001b[32m 617\u001b[39m _validate_names(kwds.get(\u001b[33m\"\u001b[39m\u001b[33mnames\u001b[39m\u001b[33m\"\u001b[39m, 
\u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[32m 619\u001b[39m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m620\u001b[39m parser = \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 622\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1620\u001b[39m, in \u001b[36mTextFileReader.__init__\u001b[39m\u001b[34m(self, f, engine, **kwds)\u001b[39m\n\u001b[32m 1617\u001b[39m \u001b[38;5;28mself\u001b[39m.options[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m] = kwds[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 1619\u001b[39m \u001b[38;5;28mself\u001b[39m.handles: IOHandles | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1620\u001b[39m \u001b[38;5;28mself\u001b[39m._engine = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1880\u001b[39m, in \u001b[36mTextFileReader._make_engine\u001b[39m\u001b[34m(self, f, engine)\u001b[39m\n\u001b[32m 1878\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m 
mode:\n\u001b[32m 1879\u001b[39m mode += \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1880\u001b[39m \u001b[38;5;28mself\u001b[39m.handles = \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1881\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1882\u001b[39m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1883\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1884\u001b[39m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcompression\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1885\u001b[39m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmemory_map\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1886\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1887\u001b[39m 
\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding_errors\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstrict\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1888\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstorage_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1889\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1890\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1891\u001b[39m f = \u001b[38;5;28mself\u001b[39m.handles.handle\n", - "\u001b[36mFile \u001b[39m\u001b[32m~\\anaconda3\\envs\\multitag\\Lib\\site-packages\\pandas\\io\\common.py:873\u001b[39m, in \u001b[36mget_handle\u001b[39m\u001b[34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[39m\n\u001b[32m 868\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m 869\u001b[39m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[32m 870\u001b[39m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[32m 871\u001b[39m 
\u001b[38;5;28;01mif\u001b[39;00m ioargs.encoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs.mode:\n\u001b[32m 872\u001b[39m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m873\u001b[39m handle = \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m 874\u001b[39m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 875\u001b[39m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 876\u001b[39m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 877\u001b[39m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 878\u001b[39m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 880\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 881\u001b[39m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[32m 882\u001b[39m handle = \u001b[38;5;28mopen\u001b[39m(handle, ioargs.mode)\n", - "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'C:\\\\Users\\\\ch\\\\google-drive\\\\Charlie_6013_RECLASS\\\\Data\\\\Raw\\\\Uber Customer Reviews.csv'" - ] - } - ], - "source": [ - "uber_df = pd.read_csv(uber, low_memory=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b8469b3-c606-461f-aaef-9619b7dc1ffd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Play18d6584c-d0e9-4833-a744-f607058aee97Milky WayNaNSuddenly, the driver can't have my location an...10.02023-08-10 17:48:51NaNNaNNaNenin
1Google Play50a08f18-cece-4ddf-b617-028844c8aa28Bradlee SeveraNaNVery cordial.. And helped with a quick turnaro...50.02023-08-10 17:38:35NaNNaN4.485.10000enin
2Google Playb0d8e75a-80a7-4dcd-abaf-72b046dbeeb7Amit AggarwalNaNVery good experience50.02023-08-10 17:38:17NaNNaN4.486.10002enin
3Google Play502702a9-25ed-4373-a96c-7fa1f06caacdBryant InmanNaNAll I use50.02023-08-10 17:37:45NaNNaN4.467.10008enin
4Google Playf47a3fb6-23db-49bd-9e63-f33c8d724d07Addie WhittakerNaNI have enjoyed traveling by Uber my drivers ha...50.02023-08-10 17:36:56NaNNaN4.486.10002enin
\n", - "
" - ], - "text/plain": [ - " source review_id user_name \\\n", - "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n", - "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n", - "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n", - "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n", - "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n", - "\n", - " review_title review_description rating \\\n", - "0 NaN Suddenly, the driver can't have my location an... 1 \n", - "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n", - "2 NaN Very good experience 5 \n", - "3 NaN All I use 5 \n", - "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n", - "\n", - " thumbs_up review_date developer_response developer_response_date \\\n", - "0 0.0 2023-08-10 17:48:51 NaN NaN \n", - "1 0.0 2023-08-10 17:38:35 NaN NaN \n", - "2 0.0 2023-08-10 17:38:17 NaN NaN \n", - "3 0.0 2023-08-10 17:37:45 NaN NaN \n", - "4 0.0 2023-08-10 17:36:56 NaN NaN \n", - "\n", - " appVersion laguage_code country_code \n", - "0 NaN en in \n", - "1 4.485.10000 en in \n", - "2 4.486.10002 en in \n", - "3 4.467.10008 en in \n", - "4 4.486.10002 en in " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uber_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1709a2cc-4f7a-4e77-994e-68668612caff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1069616, 13)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.shape(uber_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "06c0c03c-14ba-4451-a6ea-44d36e85327c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['source',\n", - " 'review_id',\n", - " 'user_name',\n", - " 'review_title',\n", - " 'review_description',\n", - " 'rating',\n", - " 
'thumbs_up',\n", - " 'review_date',\n", - " 'developer_response',\n", - " 'developer_response_date',\n", - " 'appVersion',\n", - " 'laguage_code',\n", - " 'country_code']" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uber_df.columns.tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d22d3bce-eac0-4d02-a4ef-38343f4958ff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "source object\n", - "review_id object\n", - "user_name object\n", - "review_title object\n", - "review_description object\n", - "rating int64\n", - "thumbs_up float64\n", - "review_date object\n", - "developer_response object\n", - "developer_response_date object\n", - "appVersion object\n", - "laguage_code object\n", - "country_code object\n", - "dtype: object" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uber_df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e08f5eae-7921-4526-b8fd-29038c55e1bb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "source 0\n", - "review_id 0\n", - "user_name 1\n", - "review_title 1067436\n", - "review_description 169\n", - "rating 0\n", - "thumbs_up 2180\n", - "review_date 0\n", - "developer_response 871352\n", - "developer_response_date 872338\n", - "appVersion 241548\n", - "laguage_code 0\n", - "country_code 0\n", - "dtype: int64" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "uber_df.isnull().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea59d211-9958-46f6-bf76-65d8d36c50e4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "reclass", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - 
"file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.14.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/multitag/notebooks/preprocessing_uber.ipynb b/notebooks/preprocessing_uber.ipynb similarity index 84% rename from multitag/notebooks/preprocessing_uber.ipynb rename to notebooks/preprocessing_uber.ipynb index 95e36d7..69838e6 100644 --- a/multitag/notebooks/preprocessing_uber.ipynb +++ b/notebooks/preprocessing_uber.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "id": "470fe7c6-1614-4daf-879f-e6c399117c7b", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "id": "afe1168c", "metadata": {}, "outputs": [ @@ -21,20 +21,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "cwd: /mnt/c/Users/ch/6013/multitag/notebooks\n", - "exists data: False\n" + "cwd: c:\\Users\\ch\\6013\\notebooks\n", + "exists data: True\n" ] } ], "source": [ "import os\n", "print(\"cwd:\", os.getcwd())\n", - "print(\"exists data:\", os.path.exists(\"mullitag\"))\n" + "print(\"exists data:\", os.path.exists(\"../data/\"))\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee", "metadata": {}, "outputs": [], @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5", "metadata": {}, "outputs": [ @@ -200,7 +200,7 @@ "4 4.486.10002 en in " ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -211,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "5c02ec54-4583-4720-88c6-1110b52c3f88", "metadata": {}, "outputs": [ @@ -235,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 15, + 
"execution_count": 16, "id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a", "metadata": {}, "outputs": [ @@ -262,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4", "metadata": {}, "outputs": [], @@ -272,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "55324c94-4944-4844-b00e-dc08c8989f7b", "metadata": {}, "outputs": [ @@ -291,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "c45959fe-3e23-4831-a41a-94c89892247f", "metadata": {}, "outputs": [ @@ -326,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa", "metadata": {}, "outputs": [ @@ -344,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "8ccc07fa-9913-4047-ae17-35d2454eb059", "metadata": {}, "outputs": [ @@ -357,66 +357,66 @@ "1 STAR REVIEWS:\n", "==========================================\n", "\n", - "Once drivers confirm the rider its showing with in 2 minutes but they take 25 minutes more and if driver cancel the ride I pay for that in next ride it's redicules\n", - "(Length: 32.0 words)\n", + "Driver come very late than you can't do anthing pay and wait. Very bad service you can't cancil your when driver not come. Ubar charge money without service. This is very bad bad bad. So take local au...\n", + "(Length: 47.0 words)\n", "\n", - "they charge very high as they show before the ride\n", - "(Length: 10.0 words)\n", + "I have uninstalled and reinstalled the app around 5 times over the course of the past 3 days. Every time I try to use the app, I get stuck in and endless reCaptcha loop. (I enter my phone number, solv...\n", + "(Length: 66.0 words)\n", "\n", - "scam drivers, worst service\n", - "(Length: 4.0 words)\n", + "Thieves. 
Sent an Uber to my house in the middle of the night and wouldn't refund.\n", + "(Length: 16.0 words)\n", "\n", "==========================================\n", "2 STAR REVIEWS:\n", "==========================================\n", "\n", - "Drivers always ask is destination and cancel if they dont want to go ? Cant they already see destination before accepting ride ?\n", - "(Length: 23.0 words)\n", + "Your app is required to much space\n", + "(Length: 7.0 words)\n", "\n", - "she hole her phone on her hand she driving 80\n", - "(Length: 10.0 words)\n", + "I'm very disappointed. At first,I used Uber because it was far better than regular taxi. But I stopped using it because the application is very heavy and the drivers rarely reached my pinned locatio...\n", + "(Length: 107.0 words)\n", "\n", - "I7u.6f هنه\n", - "(Length: 2.0 words)\n", + "nowhere to leave a tip!\n", + "(Length: 5.0 words)\n", "\n", "==========================================\n", "3 STAR REVIEWS:\n", "==========================================\n", "\n", - "I think this app is very important to me\n", - "(Length: 9.0 words)\n", + "اوبر المدينة احيانا كويس .. بس لما يكون السائق باخر ملك ربي و تنتظر 14 دقيقه و بعدين يلغي و تصير دخلت بوقت الذروة المفروض يكون في تعويض .. 
زي لما تلغي انت .\n", + "(Length: 34.0 words)\n", "\n", - "Ok\n", + "Good application\n", + "(Length: 2.0 words)\n", + "\n", + "Toooslooow\n", "(Length: 1.0 words)\n", "\n", - "The rate will be one while booking and after the ride it changes if asked the reason is due to traffic, but it should be mentioned first only, the destination time is also calculated by uber only alon...\n", - "(Length: 53.0 words)\n", - "\n", "==========================================\n", "4 STAR REVIEWS:\n", "==========================================\n", "\n", - "Nice\n", - "(Length: 1.0 words)\n", + "Help full\n", + "(Length: 2.0 words)\n", "\n", - "Good app but sometimes it take long time to get booking even the cabs are nearby and sometimes they even cancel the ride after confirming and making us wait for 30 mins and above\n", - "(Length: 34.0 words)\n", + "Won't allow me to change my payment details. Update: Problem solved.\n", + "(Length: 11.0 words)\n", "\n", - "its good and helpful.. Thank you\n", - "(Length: 6.0 words)\n", + "Very good\n", + "(Length: 2.0 words)\n", "\n", "==========================================\n", "5 STAR REVIEWS:\n", "==========================================\n", "\n", - "good service\n", - "(Length: 2.0 words)\n", + "Good driving skills\n", + "(Length: 3.0 words)\n", "\n", - "Drivers have been getting us home quickly and effectively.\n", - "(Length: 9.0 words)\n", + "Lovery\n", + "(Length: 1.0 words)\n", "\n", - "Hbby\n", - "(Length: 1.0 words)\n" + "Excellent experience\n", + "(Length: 2.0 words)\n" ] } ], @@ -431,35 +431,11 @@ " print(f\"\\n{review_text[:200]}{'...' 
if len(review_text) > 200 else ''}\")\n", " print(f\"(Length: {row['word_count']} words)\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b12dcb89-d291-447a-98f3-02817dc0eb8e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87a15e76-51c8-4586-acea-ca3176c18757", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73c4bbb9-3f8e-4b4c-8538-539b140cf610", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "reclass", + "display_name": "multitag", "language": "python", "name": "python3" }, @@ -473,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.0" } }, "nbformat": 4, diff --git a/multitag/notebooks/uber_cleaned.ipynb b/notebooks/uber_cleaned.ipynb similarity index 92% rename from multitag/notebooks/uber_cleaned.ipynb rename to notebooks/uber_cleaned.ipynb index 66224fd..2430469 100644 --- a/multitag/notebooks/uber_cleaned.ipynb +++ b/notebooks/uber_cleaned.ipynb @@ -96,7 +96,7 @@ } ], "source": [ - "df = pd.read_csv('data/uber_reviews_cleaned.csv')\n", + "df = pd.read_csv('../data/uber_reviews_cleaned.csv')\n", "df.head()" ] }, @@ -213,26 +213,25 @@ "\n", "Language distribution in 10+ word reviews:\n", "lang\n", - "en 939\n", - "id 12\n", - "es 10\n", - "pt 9\n", - "ar 7\n", - "ru 5\n", - "fr 3\n", - "af 2\n", - "no 2\n", - "et 1\n", - "ca 1\n", - "hi 1\n", - "tr 1\n", - "cs 1\n", - "bn 1\n", - "sv 1\n", - "it 1\n", - "so 1\n", + "en 956\n", + "es 7\n", + "pt 7\n", + "ar 4\n", + "id 3\n", + "da 3\n", + "bn 3\n", + "af 3\n", + "it 2\n", + "tl 2\n", + "tr 2\n", + "fr 1\n", "ro 1\n", - "da 1\n", + "et 1\n", + "sv 1\n", + "nl 1\n", + "hi 1\n", + "pl 1\n", + "ta 1\n", "Name: count, dtype: int64\n" ] } @@ -241,27 +240,11 @@ "print(f\"\\nLanguage distribution in 10+ word 
reviews:\")\n", "print(sample_check['lang'].value_counts())" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cf91607", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd7576df-ce92-4c30-8466-34274290a934", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "multitag", "language": "python", "name": "python3" }, diff --git a/multitag/src/infer.py b/src/infer.py similarity index 100% rename from multitag/src/infer.py rename to src/infer.py diff --git a/multitag/src/multitag.py b/src/multitag.py similarity index 100% rename from multitag/src/multitag.py rename to src/multitag.py diff --git a/multitag/src/preprocess.py b/src/preprocess.py similarity index 100% rename from multitag/src/preprocess.py rename to src/preprocess.py diff --git a/multitag/src/sampler.py b/src/sampler.py similarity index 100% rename from multitag/src/sampler.py rename to src/sampler.py diff --git a/multitag/src/train.py b/src/train.py similarity index 100% rename from multitag/src/train.py rename to src/train.py