Added comments and made a start on infer.py

2026-03-28 22:31:10 +00:00
parent 753723694b
commit 0af8bff4a8
7 changed files with 301 additions and 22 deletions
--- a/notebooks/getting_csv_for_inference.ipynb
+++ b/notebooks/getting_csv_for_inference.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "79ac71dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "aa9117f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>review_id</th>\n",
+       "      <th>user_name</th>\n",
+       "      <th>review_title</th>\n",
+       "      <th>review_description</th>\n",
+       "      <th>rating</th>\n",
+       "      <th>thumbs_up</th>\n",
+       "      <th>review_date</th>\n",
+       "      <th>developer_response</th>\n",
+       "      <th>developer_response_date</th>\n",
+       "      <th>appVersion</th>\n",
+       "      <th>laguage_code</th>\n",
+       "      <th>country_code</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>18d6584c-d0e9-4833-a744-f607058aee97</td>\n",
+       "      <td>Milky Way</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Suddenly, the driver can't have my location an...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:48:51</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>50a08f18-cece-4ddf-b617-028844c8aa28</td>\n",
+       "      <td>Bradlee Severa</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Very cordial.. And helped with a quick turnaro...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:38:35</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.485.10000</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7</td>\n",
+       "      <td>Amit Aggarwal</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Very good experience</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:38:17</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.486.10002</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>502702a9-25ed-4373-a96c-7fa1f06caacd</td>\n",
+       "      <td>Bryant Inman</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>All I use</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:37:45</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.467.10008</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>f47a3fb6-23db-49bd-9e63-f33c8d724d07</td>\n",
+       "      <td>Addie Whittaker</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>I have enjoyed traveling by Uber my drivers ha...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:36:56</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.486.10002</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        source                             review_id        user_name  \\\n",
+       "0  Google Play  18d6584c-d0e9-4833-a744-f607058aee97        Milky Way   \n",
+       "1  Google Play  50a08f18-cece-4ddf-b617-028844c8aa28   Bradlee Severa   \n",
+       "2  Google Play  b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7    Amit Aggarwal   \n",
+       "3  Google Play  502702a9-25ed-4373-a96c-7fa1f06caacd     Bryant Inman   \n",
+       "4  Google Play  f47a3fb6-23db-49bd-9e63-f33c8d724d07  Addie Whittaker   \n",
+       "\n",
+       "  review_title                                 review_description  rating  \\\n",
+       "0          NaN  Suddenly, the driver can't have my location an...       1   \n",
+       "1          NaN  Very cordial.. And helped with a quick turnaro...       5   \n",
+       "2          NaN                               Very good experience       5   \n",
+       "3          NaN                                          All I use       5   \n",
+       "4          NaN  I have enjoyed traveling by Uber my drivers ha...       5   \n",
+       "\n",
+       "   thumbs_up          review_date developer_response developer_response_date  \\\n",
+       "0        0.0  2023-08-10 17:48:51                NaN                     NaN   \n",
+       "1        0.0  2023-08-10 17:38:35                NaN                     NaN   \n",
+       "2        0.0  2023-08-10 17:38:17                NaN                     NaN   \n",
+       "3        0.0  2023-08-10 17:37:45                NaN                     NaN   \n",
+       "4        0.0  2023-08-10 17:36:56                NaN                     NaN   \n",
+       "\n",
+       "    appVersion laguage_code country_code  \n",
+       "0          NaN           en           in  \n",
+       "1  4.485.10000           en           in  \n",
+       "2  4.486.10002           en           in  \n",
+       "3  4.467.10008           en           in  \n",
+       "4  4.486.10002           en           in  "
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"../data/raw/uber_reviews.csv\", low_memory=False)\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "36683790",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.DataFrame'>\n",
+      "RangeIndex: 1069616 entries, 0 to 1069615\n",
+      "Data columns (total 13 columns):\n",
+      " #   Column                   Non-Null Count    Dtype  \n",
+      "---  ------                   --------------    -----  \n",
+      " 0   source                   1069616 non-null  str    \n",
+      " 1   review_id                1069616 non-null  str    \n",
+      " 2   user_name                1069615 non-null  str    \n",
+      " 3   review_title             2180 non-null     str    \n",
+      " 4   review_description       1069447 non-null  str    \n",
+      " 5   rating                   1069616 non-null  int64  \n",
+      " 6   thumbs_up                1067436 non-null  float64\n",
+      " 7   review_date              1069616 non-null  str    \n",
+      " 8   developer_response       198264 non-null   str    \n",
+      " 9   developer_response_date  197278 non-null   str    \n",
+      " 10  appVersion               828068 non-null   str    \n",
+      " 11  laguage_code             1069616 non-null  str    \n",
+      " 12  country_code             1069616 non-null  str    \n",
+      "dtypes: float64(1), int64(1), str(11)\n",
+      "memory usage: 106.1 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "0b1c7f73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df[['review_description']].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "90e6d653",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head(5)\n",
+    "df.to_csv(\"../data/raw/review_description.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "multitag",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/notebooks/preprocessing_tagged.ipynb
+++ b/notebooks/preprocessing_tagged.ipynb
@@ -5,19 +5,7 @@
   "execution_count": 1,
   "id": "2b7cfa1a",
   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'sklearn'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpd\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msklearn\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n",
-      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'sklearn'"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
@@ -1209,7 +1197,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.13.11"
+   "version": "3.11.15"
  }
 },
 "nbformat": 4,
--- a/src/infer.py
+++ b/src/infer.py
@@ -2,14 +2,21 @@ import pandas as pd
 import numpy as np
 import torch
 import argparse
-from transformers import AutoTokenizer
 from torch.utils.tensorboard import SummaryWriter
+from transformers import AutoTokenizer
+from transformers import AutoModelForSequenceClassification

 # mappings
 binary_map = {1:'Yes', 0:'No'}
 aspect_map = {0:'App', 1:'Driver', 2:'General', 3:'Payment', 4:'Pricing', 5:'Service'}
 sentiment_map = {0:'Positive', 1:'Neutral', 2:'Negative'}

+label_names = {
+    'bug_report': ['No', 'Yes'],
+    'feature_request': ['No', 'Yes'],
+    'aspect': ['App', 'Driver', 'General', 'Payment', 'Pricing', 'Service'],
+    'aspect_sentiment': ['Positive', 'Neutral', 'Negative']
+}

 SEED = 4321
 torch.manual_seed(SEED)
@@ -17,9 +24,31 @@ np.random.seed(SEED)

 def parse_args():
    parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
-    parser.add_argument("--model_path", type=str, help="Enter the models path / the desired .pt file")
-    parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"], help="Specific task to train for stl usage only" )
-    parser.add_argument("--interactive", help="Loops reading input until exit")
+    parser.add_argument("--model_path", type=str, required=True, help=".pt file path")
+    parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
+    parser.add_argument("--interactive", help="Loops reading input until exit()")
    parser.add_argument("--text", help="Use command line text for input")
+    parser.add_argument("--dataset", type=str, required=True, help="Enter a file for inference")
+
    return parser.parse_args()

+
+
+def main():
+    args = parse_args()
+    print(f'='*50)
+    print(f' '*15 + "Starting inference")
+    if torch.cuda.is_available():
+        print(f' '*15 + "GPU:", torch.cuda.get_device_name(0))
+        torch.cuda.manual_seed_all(SEED)
+        torch.cuda.manual_seed(SEED)
+    else:
+        print(f' '*15 + "No GPUs available")
+    print(f'='*50 + "\n")
+    print(f"Running inference on: {args.model_path.upper()} using {args.dataset}")
+
+    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
+    infer_data = f"data/processed/{args.dataset}_infer.csv"
+
+if __name__ == main():
+    main()
--- a/src/model.py
+++ b/src/model.py
@@ -52,7 +52,7 @@ class Model(nn.Module):

        # Applied across shared cls token, before all task heads 
        self.dropout = nn.Dropout(dropout_rate)
-
+        # get logits for each head
        self.bug_head = nn.Linear(hidden_size, 2)
        self.feature_head = nn.Linear(hidden_size, 2)
        self.aspect_head = nn.Linear(hidden_size, 6)
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -160,8 +160,8 @@ def preprocess_uber_reviews(input_path, output_path):
    return df_clean

 if __name__ == "__main__":
-    input_file = "multitag/data/uber_reviews.csv"
-    output_file = "multitag/data/uber_reviews_cleaned.csv"
+    input_file = "data/raw/uber_reviews.csv"
+    output_file = "data/raw/uber_reviews_cleaned.csv"
    
    df_clean = preprocess_uber_reviews(input_file, output_file)
    print("\nPreprocessing complete!")
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -190,7 +190,6 @@ class Sampler:
        mini_sample = self.data.sample(200)     #   reading some samples manually
        return mini_sample

-         
    
    def save_sample(self, sample_df,output_path):
        """Save sample and display statistics"""
--- a/src/train.py
+++ b/src/train.py
@@ -126,6 +126,7 @@ def main():
    print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
    
    # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
+    # CrossEntropyLoss = LogSoftmax + NLLLoss (negative log likelihood) 
    criterions = {
        'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
        'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
@@ -134,6 +135,7 @@ def main():
    }

    # -------------------- Optimizer and scheduler -------------------
+    # adaptive momentum and weight decay keeps track of previous weight adaptions and ensures they dont get too large (weight also shrinks towards 0 each pass) 
    optimizer = torch.optim.AdamW(
        model.parameters(), 
        lr=args.lr,