diff --git a/notebooks/getting_csv_for_inference.ipynb b/notebooks/getting_csv_for_inference.ipynb new file mode 100644 index 0000000..b1bdbef --- /dev/null +++ b/notebooks/getting_csv_for_inference.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 33, + "id": "79ac71dd", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "aa9117f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcereview_iduser_namereview_titlereview_descriptionratingthumbs_upreview_datedeveloper_responsedeveloper_response_dateappVersionlaguage_codecountry_code
0Google Play18d6584c-d0e9-4833-a744-f607058aee97Milky WayNaNSuddenly, the driver can't have my location an...10.02023-08-10 17:48:51NaNNaNNaNenin
1Google Play50a08f18-cece-4ddf-b617-028844c8aa28Bradlee SeveraNaNVery cordial.. And helped with a quick turnaro...50.02023-08-10 17:38:35NaNNaN4.485.10000enin
2Google Playb0d8e75a-80a7-4dcd-abaf-72b046dbeeb7Amit AggarwalNaNVery good experience50.02023-08-10 17:38:17NaNNaN4.486.10002enin
3Google Play502702a9-25ed-4373-a96c-7fa1f06caacdBryant InmanNaNAll I use50.02023-08-10 17:37:45NaNNaN4.467.10008enin
4Google Playf47a3fb6-23db-49bd-9e63-f33c8d724d07Addie WhittakerNaNI have enjoyed traveling by Uber my drivers ha...50.02023-08-10 17:36:56NaNNaN4.486.10002enin
\n", + "
" + ], + "text/plain": [ + " source review_id user_name \\\n", + "0 Google Play 18d6584c-d0e9-4833-a744-f607058aee97 Milky Way \n", + "1 Google Play 50a08f18-cece-4ddf-b617-028844c8aa28 Bradlee Severa \n", + "2 Google Play b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7 Amit Aggarwal \n", + "3 Google Play 502702a9-25ed-4373-a96c-7fa1f06caacd Bryant Inman \n", + "4 Google Play f47a3fb6-23db-49bd-9e63-f33c8d724d07 Addie Whittaker \n", + "\n", + " review_title review_description rating \\\n", + "0 NaN Suddenly, the driver can't have my location an... 1 \n", + "1 NaN Very cordial.. And helped with a quick turnaro... 5 \n", + "2 NaN Very good experience 5 \n", + "3 NaN All I use 5 \n", + "4 NaN I have enjoyed traveling by Uber my drivers ha... 5 \n", + "\n", + " thumbs_up review_date developer_response developer_response_date \\\n", + "0 0.0 2023-08-10 17:48:51 NaN NaN \n", + "1 0.0 2023-08-10 17:38:35 NaN NaN \n", + "2 0.0 2023-08-10 17:38:17 NaN NaN \n", + "3 0.0 2023-08-10 17:37:45 NaN NaN \n", + "4 0.0 2023-08-10 17:36:56 NaN NaN \n", + "\n", + " appVersion laguage_code country_code \n", + "0 NaN en in \n", + "1 4.485.10000 en in \n", + "2 4.486.10002 en in \n", + "3 4.467.10008 en in \n", + "4 4.486.10002 en in " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"../data/raw/uber_reviews.csv\", low_memory=False)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "36683790", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1069616 entries, 0 to 1069615\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 source 1069616 non-null str \n", + " 1 review_id 1069616 non-null str \n", + " 2 user_name 1069615 non-null str \n", + " 3 review_title 2180 non-null str \n", + " 4 review_description 1069447 non-null str \n", + " 5 
rating 1069616 non-null int64 \n", + " 6 thumbs_up 1067436 non-null float64\n", + " 7 review_date 1069616 non-null str \n", + " 8 developer_response 198264 non-null str \n", + " 9 developer_response_date 197278 non-null str \n", + " 10 appVersion 828068 non-null str \n", + " 11 laguage_code 1069616 non-null str \n", + " 12 country_code 1069616 non-null str \n", + "dtypes: float64(1), int64(1), str(11)\n", + "memory usage: 106.1 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "0b1c7f73", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[['review_description']].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "90e6d653", + "metadata": {}, + "outputs": [], + "source": [ + "df.head(5)\n", + "df.to_csv(\"../data/raw/review_description.csv\", index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "multitag", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/preprocessing_tagged.ipynb b/notebooks/preprocessing_tagged.ipynb index a1675ee..e863cf9 100644 --- a/notebooks/preprocessing_tagged.ipynb +++ b/notebooks/preprocessing_tagged.ipynb @@ -5,19 +5,7 @@ "execution_count": 1, "id": "2b7cfa1a", "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'sklearn'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 
2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n", - "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'sklearn'" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", @@ -1209,7 +1197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.11" + "version": "3.11.15" } }, "nbformat": 4, diff --git a/src/infer.py b/src/infer.py index f324e53..2f48f41 100644 --- a/src/infer.py +++ b/src/infer.py @@ -2,14 +2,21 @@ import pandas as pd import numpy as np import torch import argparse -from transformers import AutoTokenizer from torch.utils.tensorboard import SummaryWriter +from transformers import AutoTokenizer +from transformers import AutoModelForSequenceClassification # mappings binary_map = {1:'Yes', 0:'No'} aspect_map = {0:'App', 1:'Driver', 2:'General', 3:'Payment', 4:'Pricing', 5:'Service'} sentiment_map = {0:'Positive', 1:'Neutral', 2:'Negative'} +label_names = { + 'bug_report': ['No', 'Yes'], + 'feature_request': ['No', 'Yes'], + 'aspect': ['App', 'Driver', 'General', 'Payment', 'Pricing', 'Service'], + 'aspect_sentiment': ['Positive', 'Neutral', 'Negative'] +} SEED = 4321 torch.manual_seed(SEED) @@ -17,9 +24,31 @@ 
np.random.seed(SEED) def parse_args(): parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") - parser.add_argument("--model_path", type=str, help="Enter the models path / the desired .pt file") - parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"], help="Specific task to train for stl usage only" ) - parser.add_argument("--interactive", help="Loops reading input until exit") + parser.add_argument("--model_path", type=str, required=True, help=".pt file path") + parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"]) + parser.add_argument("--interactive", help="Loops reading input until exit()") parser.add_argument("--text", help="Use command line text for input") + parser.add_argument("--dataset", type=str, required=True, help="Enter a file for inference") + return parser.parse_args() + + +def main(): + args = parse_args() + print(f'='*50) + print(f' '*15 + "Starting inference") + if torch.cuda.is_available(): + print(f' '*15 + "GPU:", torch.cuda.get_device_name(0)) + torch.cuda.manual_seed_all(SEED) + torch.cuda.manual_seed(SEED) + else: + print(f' '*15 + "No GPUs available") + print(f'='*50 + "\n") + print(f"Running inference on: {args.model_path.upper()} using {args.dataset}") + + tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") + infer_data = f"data/processed/{args.dataset}_infer.csv" + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/model.py b/src/model.py index e6cbe70..23b5e2f 100644 --- a/src/model.py +++ b/src/model.py @@ -52,7 +52,7 @@ class Model(nn.Module): # Applied across shared cls token, before all task heads self.dropout = nn.Dropout(dropout_rate) - + # get logits for each head self.bug_head = nn.Linear(hidden_size, 2) self.feature_head = nn.Linear(hidden_size, 2) self.aspect_head = 
nn.Linear(hidden_size, 6) diff --git a/src/preprocess.py b/src/preprocess.py index dd8465a..88452e7 100644 --- a/src/preprocess.py +++ b/src/preprocess.py @@ -160,8 +160,8 @@ def preprocess_uber_reviews(input_path, output_path): return df_clean if __name__ == "__main__": - input_file = "multitag/data/uber_reviews.csv" - output_file = "multitag/data/uber_reviews_cleaned.csv" + input_file = "data/raw/uber_reviews.csv" + output_file = "data/raw/uber_reviews_cleaned.csv" df_clean = preprocess_uber_reviews(input_file, output_file) print("\nPreprocessing complete!") diff --git a/src/sampler.py b/src/sampler.py index 7a9b444..bf03d73 100644 --- a/src/sampler.py +++ b/src/sampler.py @@ -190,7 +190,6 @@ class Sampler: mini_sample = self.data.sample(200) # reading some samples manually return mini_sample - def save_sample(self, sample_df,output_path): """Save sample and display statistics""" diff --git a/src/train.py b/src/train.py index f14b98d..7e00aea 100644 --- a/src/train.py +++ b/src/train.py @@ -126,6 +126,7 @@ def main(): print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy()) # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity + # CrossEntropyLoss = LogSoftmax + NLLLoss (negative log likelihood) criterions = { 'bug_report': nn.CrossEntropyLoss(weight=bug_weights), 'feature_request': nn.CrossEntropyLoss(weight=feature_weights), @@ -134,6 +135,7 @@ def main(): } # -------------------- Optimizer and scheduler ------------------- + # AdamW: adaptive moment estimation keeps running averages of past gradients to scale each update, and decoupled weight decay shrinks the weights toward 0 on every step so they don't grow too large optimizer = torch.optim.AdamW( model.parameters(), lr=args.lr,