added further documentation across all files

2026-04-05 14:19:57 +01:00
parent 7fa67af6c0
commit 1cca27e0b8
9 changed files with 173 additions and 246 deletions
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@

 ---

-## Project Overview
+# README not finished

-RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads.
+## Overview
+
+RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison.

 | Task | Output | Classes |
 |------|--------|---------|
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
 ## Dataset

 - **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
- **Original size**: 1,069,616 reviews
- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews)
- **Annotation target**: 5,000 manually labelled reviews
+- **Original size**: ~1.07M Reviews
+- **After Preprocessing**: ~495K Reviews
+- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword-boosted sample
+
+## Preprocessing Steps
+
+- Removed URLs and emails
+- Normalised text and punctuation
+- Removed duplicate reviews
+- Filtered out reviews with fewer than 5 words
+
+- Output sets
+    -   Original: matches the original distribution of the raw dataset
+    -   Boosted: oversamples bug reports and feature requests using keyword heuristics
+
+## Model
+
+- Encoder: XLM-RoBERTa base (multilingual transformer model)
+- Architecture:
+    - Shared encoder
+    - Task-specific classification heads
+- Training setups:
+    - MTL (Multitask learning)
+    - STL (Single-task learning)
+
+Class weights are applied to reduce imbalance effects.

 ## Repository Structure

-```
-6013/
-    README.md
-    .gitignore
-    data/
-        uber_reviews.csv           # Raw dataset
-        uber_reviews_cleaned.csv   # Preprocessed reviews
-        uber_reviews_sampled.csv   # Stratified sample for annotation
-        uber_reviews_tagged.csv    # Annotated reviews (in progress)
-    notebooks/
-        preprocessing_uber.ipynb   # Preprocessing analysis
-        uber_cleaned.ipynb         # Cleaned data verification
-    src/
-        preprocess.py              # Text cleaning and filtering pipeline
-        sampler.py                 # Stratified sampling strategies
-        multitag.py                # GUI annotation tool
-        train.py                   # Model training (in progress)
-        infer.py                   # Inference pipeline (in progress)
-        outputs/
-            figures/
-```
+```
+.
+├── data
+│   └── processed
+│       ├── boosted_test.csv
+│       ├── boosted_train.csv
+│       ├── boosted_val.csv
+│       ├── original_test.csv
+│       ├── original_train.csv
+│       ├── original_val.csv
+│       └── review.csv
+├── notebooks/
+│   
+├── outputs
+│   └── figures/
+├── README.md
+├── architecture.png
+└── src
+    ├── dataset.py
+    ├── evaluate.py
+    ├── infer.py
+    ├── model.py
+    ├── multitag.py
+    ├── preprocess.py
+    ├── sampler.py
+    └── train.py
+```

-## Current Progress
+## Results

- Manual annotation of 5,000 reviews
- BERT baseline implementation
- Multi-task model architecture
- Training and evaluation
- Comparative analysis (MTL vs single-task)
- Final report and presentation
+Evaluation includes Precision, Recall, Macro F1, Confusion matrices and confidence analysis.
+
+Results and summaries are found in outputs/*.json and outputs/figures/

 ## Installation

 ```
-# Clone repository
-...
 # Create conda environment
-...
+conda create -n reclass python=3.11 
+conda activate reclass
+```
+
+```
 # Install dependencies
-...requirements.txt
+conda install --file requirements.txt
 ```

 ## Usage
-## References
-## Licenses
+
+#### Train Model
+
+```
+python src/train.py --mode mtl --dataset original
+```
+
+#### Evaluate Model
+
+```
+python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
+```
+
+#### Run Inference
+
+```
+python src/infer.py --mode mtl --model_path <model>.pt --dataset review
+```
+
+## Notes
+
+- The same tokenizer is used across training, evaluation and inference to ensure consistency
+- Sampling and preprocessing choices are documented further in the src files and the dissertation

 ---

--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,22 +1,17 @@
 # dataset.py
-# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer
-# Takes a row from the csv, tokenizes the review and returns a tensor
+# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
 import torch
 import pandas as pd
 from torch.utils.data import Dataset
 from transformers import AutoTokenizer

 class ReviewDataset(Dataset):
-    """Pytorch Dataset for loading tokenized reviews
+    """
+    Dataset for tokenized reviews with labels for all 4 tasks.

    Dataset is for map style datasets like here, instead of using IteratableDataset (better for data streams).
-    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary with of
+    Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
    input tensors and integer labels for all 4 tasks.
-     
-      Args:
-        path (str): Path to the csv file containing the reviews and labels.
-        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
-        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
    """

    def __init__(self, path, tokenizer, max_length=256):
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
    def __getitem__(self, idx):
        review = self.df.iloc[idx]['review']

-        # encoding['input_ids'] 1D tensor of token ids, shape [max_length]
-        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
-        # Both have shape [1, max_length] because of return_tensors='pt'
-        # Squeeze them to [max_length] with .squeeze(0)
+        # Tokenize with padding and truncation to max_length, returning PyTorch tensors
        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        
-        # Returns a dictionary with:
-        #   'input_ids': tensor of shape [max_length]
-        
-        #   'attention_mask': tensor of shape [max_length]
-
-        # MTL structure labels as tensor scalars:
-        #   'bug_report': tensor scalar (torch.tensor(label_value))
-        #   'feature_request': tensor scalar (torch.tensor(label_value))
-        #   'aspect': tensor scalar (torch.tensor(label_value))
-        #   'aspect_sentiment': tensor scalar (torch.tensor(label_value))
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
+
+            # Labels for all 4 tasks, converted to tensors
            'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
            'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
            'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
@@ -65,18 +49,23 @@ class InferenceDataset(Dataset):
                return len(self.df)
        
        def __getitem__(self, idx):
-                #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews 
                review = str(self.df.iloc[idx][self.text_column])
+
                if review == 'nan' or review.strip() == '':
                    review = ' '
+
+                # Same as training dataset but without labels, for inference on test sets
                encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
                return {
                        'input_ids': encoding['input_ids'].squeeze(0),
                        'attention_mask': encoding['attention_mask'].squeeze(0),
                }    
    
-        
+
+
+
 if __name__ == "__main__":
+    # Quick test
    dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
    print(dataset.__getitem__(1))
    
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,4 +1,6 @@
 # evaluate.py
+# Evaluate MTL or STL models on the test split
+
 import os
 import torch
 import time
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel

-# TODO: load checkpoint, produce tables of evaluation figures
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
@@ -31,6 +32,7 @@ label_names = {
 }

 def parse_args():
+    """Parse command line arguments for evaluation"""
    parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
    parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
    parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
@@ -47,11 +49,13 @@ def main():
    os.makedirs("outputs/figures", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-
+    # Load test dataset and model
    test = f"data/processed/{args.dataset}_test.csv"
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    test_dataset = ReviewDataset(test, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
+
+    # MTL evaluates all tasks, STL needs to know a single task to evaluate on
    if args.mode == "mtl":
        model = Model().to(device)
        active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -86,6 +90,7 @@ def main():
                logits = outputs[task]
                preds = torch.argmax(logits, dim=1)

+                # Keep max softmax as confidence estimate
                probs = F.softmax(logits, dim=1)
                confidence = probs.max(dim=1).values

@@ -93,6 +98,7 @@ def main():
                all_preds[task].extend(preds.cpu().numpy())
                all_confidences[task].extend(confidence.cpu().numpy())
    
+    # Detailed JSON summary along with printed results
    summary = {
        "mode": args.mode,
        "dataset": args.dataset,
@@ -137,7 +143,7 @@ def main():
        print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
        print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")

-        # save summary to JSON
+        # Store main metrics and full per class report to JSON
        summary["results"][task] = {
            "macro_f1": float(report_dict["macro avg"]["f1-score"]),
            "macro_precision": float(report_dict["macro avg"]["precision"]),
@@ -150,8 +156,7 @@ def main():
            "per_class": report_dict
        }

-        # Confusion matrix
-
+        # Confusion matrix for each evaluated task
        cm = confusion_matrix(labels_arr, preds_arr)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
@@ -172,7 +177,6 @@ def main():
        test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
        test_df[f'{task}_confidence'] = conf_arr

-    # to JSON
    run_name = args.task if args.mode == "stl" else "mtl"
    json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
    with open(json_path, "w") as f:
--- a/src/infer.py
+++ b/src/infer.py
@@ -1,4 +1,6 @@
 # infer.py
+# Run inference using MTL or STL on various inputs (CSV or User)
+
 from datetime import datetime
 import os
 import torch
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
 from dataset import InferenceDataset
 from model import Model, SingleTaskModel

-
-
 label_names = {
    'bug_report': ['No', 'Yes'],
    'feature_request': ['No', 'Yes'],
@@ -33,9 +33,6 @@ SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)

-
-
-
 def parse_args():
    parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
    parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
@@ -55,7 +52,7 @@ def main():
    os.makedirs("outputs/inference", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-    # this section is nearly identical to the first part of evaluate.py
+    # Mirrors the evaluation script with the addition of interactive modes
    args = parse_args()
    print(f'{"="*50}')
    print(f'{"Starting inference"}')
@@ -70,7 +67,7 @@ def main():
    print("Loading model, tokenizer and datasets ...")
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

-    # Let the user decide if they want to run inference on the whole dataset or via the shell input
+    # Support CSV and interactive input
    if not args.interactive and not args.text:
        infer = f"data/processed/{args.dataset}.csv"
        infer_df = pd.read_csv(infer)
--- a/src/model.py
+++ b/src/model.py
@@ -1,27 +1,13 @@
 # model.py
-# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment)
-# 12 transformer layers, 12 attention heads
+# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison

 from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
 import torch.nn as nn

-# Using dropout, This has proven to be an effective technique 
-# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
-
-# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
-# Each hidden representation is size 768
-
+# Using dropout before classification to reduce overfitting
 class SingleTaskModel(nn.Module):
-    """Single task model to compare MTL approach to review classification
-    
-    Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
-    just different args
-    
-        Args:
-            task_name: which of the 4 tasks are we training for
-            num_classes: number of output classes for the task
-            dropout_rate: probability applied to cls representation, randomly drops tokens for better results
-        """
+    """Single task model with one head to compare MTL approach to review classification"""
+
    def __init__(self, task_name, num_classes, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
        return {self.task_name: logits}

 class Model(nn.Module): 
-    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
-
-    Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through 
-    shared dropout then ito four linear classification heads. Shared training optimises all tasks simultaneously,
-    allowing the encoder to learn from the shared representations / generalisations
-
-        Args:
-            dropout_rate: probability applied to preven co-adaptation of neurons across heads 0.2 is standard default
-    """
+    """ Multitask model with shared encoder and 4 task specific heads."""
    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -58,8 +36,7 @@ class Model(nn.Module):
        self.aspect_head = nn.Linear(hidden_size, 6)
        self.aspect_sentiment_head = nn.Linear(hidden_size, 3)

-    # Pass through encoder then extract the token representation through [batch_size, 768]
-    # Apply droupout to it, take scores for each head, return them in a dictionary
+    # Pass through the encoder once, extract the token representation, then reuse the shared representation across all tasks
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # index 0 from [batch_size, 768]
@@ -67,7 +44,6 @@ class Model(nn.Module):

        output = self.dropout(output)

-        # Logits for each head:
        bug_logits = self.bug_head(output)
        feature_logits = self.feature_head(output)
        aspect_logits = self.aspect_head(output)
--- a/src/multitag.py
+++ b/src/multitag.py
@@ -1,13 +1,9 @@
 # multitag.py
-# This app enables manual annotation of reviews in the Uber dataset, for training with 
-# to achieve review classifications with multi task deep learning
+# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training

-# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling), 
-#at least that is my prediction of why this may not be as good as I wanted
 import tkinter as tk
 from tkinter import ttk
 import pandas as pd
-# import langdetect
 import os

 class MultiTag:
@@ -41,9 +37,6 @@ class MultiTag:
        self.number_of_aspects = 6  # number of aspect buttons
        self.root.title("MultiTag")

-        #self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
-        #self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
-
        # Colors for active label
        self.color_incomplete = "#003366"
        self.color_complete = "#00AA00"
@@ -51,8 +44,7 @@ class MultiTag:
        # Paths
        tagged_path = "data/uber_reviews_tagged.csv"
        sampled_path = "data/uber_reviews_sampled.csv"
-        # self.load_review_data("data/uber_reviews_sampled.csv")
-        # self.load_review_data("data/uber_reviews_tagged.csv")
+
        if not os.path.exists(tagged_path):
            print(f"Tagged file did not exist, making one at: {sampled_path}")
            sampled_df = pd.read_csv(sampled_path, low_memory=False)
@@ -89,13 +81,13 @@ class MultiTag:
        self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))


-        #   Labels ROW 3
+        # ROW 3: Field labels
        ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
        ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
        ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
        ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))

-        # ROW 4 |Buttons| 
+        # ROW 4: Input buttons
        # Feature Requests
        self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
        self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
@@ -132,20 +124,15 @@ class MultiTag:
        self.root.bind("f", self.handle_key)
        self.root.bind("g", self.handle_key)
        self.root.bind("h", self.handle_key)
-        # self.root.bind("j", self.handle_key)
-        # self.root.bind("k", self.handle_key)
-        # self.root.bind("l", self.handle_key)

-
-    
+   
        self.display_next_review()
-        #   self.save_tags("data/uber_reviews_tagged.csv")
        self.root.mainloop()

    def handle_key(self, event):
        key = event.char
    
-        # Column 0 or 1: feature/bug (1 and 0)
+        # Feature Request and Bug Report are binary input (1 and 0 keys)
        if key in ['1', '0']:
            if self.active_column == 0:
                self.feature_pressed(key)
@@ -159,7 +146,7 @@ class MultiTag:
            self.sentiment_pressed(key.upper())

    def update_status(self):
-        """Update status label and highlight color based on completion state"""
+        """Update status label and highlight"""
        if self.all_labels_complete():
            self.highlight.configure(bg=self.color_complete)
            self.status_label.configure(
@@ -212,22 +199,22 @@ class MultiTag:


    def load_review_data(self, data_path):
-        """Load review data from a CSV file."""
+        """Load review data from a CSV file. Adds annotation columns if they don't exist."""
        self.review_data = pd.read_csv(data_path, low_memory=False)
        if "tagged" not in self.review_data.columns:
-            self.review_data["tagged"] = 0              # Initialize tagged column if not present
+            self.review_data["tagged"] = 0             
        if "feature_request" not in self.review_data.columns:
-            self.review_data["feature_request"] = ""    # Initialize feature_request column if not present
+            self.review_data["feature_request"] = ""   
        if "bug_report" not in self.review_data.columns:
-            self.review_data["bug_report"] = ""         # Initialize bug_report column if not present
+            self.review_data["bug_report"] = ""         
        if "aspect" not in self.review_data.columns:
-            self.review_data["aspect"] = ""             # Initialize aspect column if not present
+            self.review_data["aspect"] = ""           
        if "aspect_sentiment" not in self.review_data.columns:
-            self.review_data["aspect_sentiment"] = ""   # Initialize aspect_sentiment column if not present
+            self.review_data["aspect_sentiment"] = "" 
        print(f"Loaded {len(self.review_data)} reviews from {data_path}")
    
    def display_next_review(self):
-        """Display the next review in the text box."""
+        """Display the next unlabelled review in the text box."""
        self.current_review_index = self.get_current_review_index()
        if self.current_review_index < len(self.review_data):
            review = self.review_data.iloc[self.current_review_index]
@@ -283,9 +270,8 @@ class MultiTag:
            row["aspect_sentiment"] != "")
    
    def save_tags(self, save_path):
-        """Save the tagged data to a CSV file."""
+        """Save the current tagged data to a CSV file."""
        self.review_data.to_csv(save_path, index=False)
-        # print(f"Tagged data saved to {save_path}")

    def quit_app(self, event):
        tagged_count = (self.review_data['tagged'] == 1).sum()
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,19 +1,13 @@
 # preprocess.py

+# Text cleaning and preprocessing for the Uber Reviews Dataset
 # langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually

 import pandas as pd
 import re

 def clean_text(text) -> str:
-    """Clean review text by removing URLS, emails, excessive whitespace
-
-    Input: 
-    text - the review text to clean
-
-    Outputs:
-    str: the cleaned review text
-    """
+    """Normalise review text by removing URLs, emails and excessive whitespace"""
    if pd.isna(text):
        return ""
    
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
    6. Removes less than 5 word reviews
    6. Saves the cleaned dataset to uber_reviews_cleaned.csv

-    Inputs:
-    input_path (str): Path to uber_reviews.csv
-    output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
-
-    Outputs:
-    pd.df_clean: the dataframe of cleaned processed reviews
    """
    print("="*50)
    print("PREPROCESSING UBER REVIEWS")
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
    print("="*50)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Quality filters: word_count >= 5, duplicates removed") 
-    # while this does remove a some legitimate reviews which would provide use in classification
-    # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5 
-    # it showed the most amount of formative reviews without seeming excessive in data removal
-    
    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
@@ -137,10 +121,7 @@ def preprocess_uber_reviews(input_path, output_path):
    print(f"  Short reviews: {df_clean[df_clean['word_count'] < 5]}")
    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
-    # lang detection takes 5+ mins so leaving it commented for now 
-    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
-    #print(f"  Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
-    
+
    # Sample reviews from each rating
    print("\n" + "="*50)
    print("SAMPLE CLEANED REVIEWS")
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
            for index, row in sample.iterrows():
                print(f"  • ({row['word_count']} words) {row['review'][:100]}")
    
-    # Note about language
-    print("Language detection not applied due to unreliability on short")
-    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
-    print(" ...Manual annotation phase will identify any non-English reviews")
-    
    return df_clean

 if __name__ == "__main__":
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -11,11 +11,12 @@ class Sampler:
    def __init__(self, data_path, target_samples):

        self.data_path = data_path
-        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
+        # Default stratification method is based on original rating distribution
+        self.stratify_column = "rating"  

        self.original_data = pd.read_csv(original_path, low_memory=False)
        self.data = pd.read_csv(self.data_path, low_memory=False)
-        self.total = len(self.data)  # total number of records in the dataset
+        self.total = len(self.data)  # total number of records in the working dataset

        print("="*50)
        print("SAMPLER INITIALIZED")
@@ -35,25 +36,10 @@ class Sampler:
        print((_origdist*100).round(1),"\n")

        self.data.info(verbose=True)
+    """
+    Kept for reference with later sampling methods

-    #   add sampling method here
-    #   random sample 5000 entries with stratifiying by rating
-    """
-    rating
-    5    57.1% (611133)
-    1    26.5% (283895)
-    4     7.8% (82953)
-    3     4.7% (49928)
-    2     3.9% (41707)
-    Name: proportion, dtype: object
-    """
-    
-    """
-    IGNORE --- Left in just in case
-
-    Sample randomly
-    Redundant calculation
-    Doesn't factor that the distribution changed greatly after preprocessing
+    Samples from current processed data rather than matching the original distribution
    """
    def get_stratified_sample(self) -> pd.DataFrame:
           stratified_sample = (
@@ -67,9 +53,8 @@ class Sampler:
    
    def sample_col(self, column) -> pd.DataFrame:    
        """
-        IGNORE --- Left in just in case
-
-        Randomly sample, including conflicting math, I guess I was going to stratify
+        Samples a proportional number of rows from one column
+        Deprecated: Not used in final pipeline, kept for reference
        """
        samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
        samples_per_column = max(samples_per_column,1) # also pointless
@@ -77,24 +62,9 @@ class Sampler:


    """
-    original_distribution_sample()
-    The main sampling method for our labelling as it 
-    keeps composition of the original uber dataset, verified in 
-    which is a fairer comparison, may also work better in general
-
-    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
-    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
-
-    
-    manually coded distributions taken from notebooks
-
-    for ratings and actual number of samples 
-    rating data is the whole data for a rating as we iterate
-    has error handling if totals doesn't match the required amount of samples per the orig distrib
-    randomise the indexes (samples) and appends to the new dataset
-
-
-
+    Main sampling method used to build the annotation set
+    Samples reviews matching the original raw dataset distribution, so the labelled set
+    better represents the original data and is more comparable to the unlabelled set.
    """
    def original_distribution_sample(self):
        original_dist = {
@@ -117,21 +87,14 @@ class Sampler:
        return original_sample
    
    """
-    sample_with_keywords()
-
-    In order to train on more bugs and features data in 
-    future this method was created
+    Build a sample enriched with likely bug reports and feature requests
    - 2000 balanced by rating (400 per)
    - 1500 likely bugs using bug_keywords list
    - 1500 likely features using feature_keywords list
-
-    inputs:
-    outputs:
-    
    """

    def sample_with_keywords(self):
-        #TODO add keywords for feature classification
+        # Keyword lists for oversampling likely bug reports and feature requests
        print(f"\n{'='*50}")
        print("Keyword influenced / rating stratified set")
        print(f"\n{'='*50}")
@@ -181,18 +144,16 @@ class Sampler:
        # Drop helper columns
        keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])

-        
-        
        print(f"\n Total samples: {len(keyword_sample):,}")
        return keyword_sample

    def sample_tiny_size(self):
-        mini_sample = self.data.sample(200)     #   reading some samples manually
+        mini_sample = self.data.sample(200)     # for reading some samples manually
        return mini_sample

    
    def save_sample(self, sample_df,output_path):
-        """Save sample and display statistics"""
+        """Save sample and display summary statistics"""
        sample_df.to_csv(output_path, index=False)
        
        print(f"\n{'='*50}")
--- a/src/train.py
+++ b/src/train.py
@@ -1,5 +1,6 @@
 # train.py
-# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+# Training script for both MTL and STL setups 
+# Structure adapted and adjusted from standard PyTorch training loops
 import argparse
 import os
 from datetime import datetime
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
 from sklearn.metrics import classification_report, f1_score
 from sklearn.utils.class_weight import compute_class_weight

-
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel



-
-# =======================================================================
-#         Training script for MTL and STL training configurations
-# =======================================================================
-
-# NFR5, reproducibility
+# Fixed seed for near-reproducible runs
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)


-# ------------------- Class weights -------------------
-# Using weights inversely proportional to class frequencies to avoid majority class bias, 
-# prioritize useful bug reports / feature requests
+
 def compute_weights(df, column, device):
-    """Copmutes inverse frequency class weights for a label column
-    
-    Uses sklearns balanced mode
-    Rare classes receive higher weights to penalise so it can learn more from less
-    """
+    """Computes inverse frequency class weights for a label column"""
    classes = np.unique(df[column])
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
    return torch.tensor(weights, dtype=torch.float).to(device)

-# parse_args() - NFR7 and NFR9
-#   Example Usages: python src/train.py --dataset boosted
-#   python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
+
 def parse_args():
    parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
    parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
@@ -67,23 +54,25 @@ def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Starting training...", flush=True)
    print("Using device:", device)
-    # Set cuda seeds for reproducibility
+
+    # Set cuda seeds for reproducibility on GPU
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(SEED)
        torch.cuda.manual_seed(SEED)
    print(f"Using dataset: {args.dataset.upper()}")
+
    # Force deterministic for reproducibility at a slight performance cost
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

-    # load data
+    # Paths to the preprocessed train/val splits
    train = f"data/processed/{args.dataset}_train.csv"
    val = f"data/processed/{args.dataset}_val.csv"
    os.makedirs("outputs", exist_ok=True)
    os.makedirs("runs", exist_ok=True)

-    # FR1, FR2, Multilingual tokenizer initilization
+    # Tokenizer initialization
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

    train_dataset = ReviewDataset(train, tokenizer)
@@ -92,7 +81,7 @@ def main():
    training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

-    # FR3, shared multilingual model with task-specific heads
+    # MTL model shares one encoder across all tasks; STL model trains one task at a time
    if args.mode == "mtl":
        model = Model().to(device)
        active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -113,7 +102,7 @@ def main():
    
    train_df = pd.read_csv(train)
    
-    # Class weights
+    # Compute per-task weights from the training split
    print("\n Computing class weights...")
    bug_weights = compute_weights(train_df, 'bug_report', device)
    feature_weights = compute_weights(train_df, 'feature_request', device)
@@ -151,7 +140,7 @@ def main():
        num_training_steps=total_steps
    )

-    # ------------------- Training loop -------------------
+    # Training loop, with TensorBoard logging and early stopping based on validation macro F1 score
    start_time = time.time()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -175,7 +164,7 @@ def main():
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

-            # FR8, Multitask forward pass
+            # Multitask forward pass
            outputs = model(input_ids, attention_mask)
            
            loss = 0
@@ -199,7 +188,7 @@ def main():
        writer.add_scalar("Loss/train", avg_train_loss, epoch) 
        print(f"Average training loss: {avg_train_loss:.4f}")

-        # -------------------- Validation loop -------------------
+        # Validation phase
        model.eval()
        total_val_loss = 0.0

@@ -226,7 +215,7 @@ def main():
        avg_vloss = total_val_loss / len(validation_loader)
        writer.add_scalar("Loss/val", avg_vloss, epoch)

-        # FR11, Performance evaluation
+        # Performance evaluation summary
        print("\nValidation Metrics (MACRO F1):")    
        epoch_f1 = []
        for task in active_tasks:
@@ -239,7 +228,7 @@ def main():
        writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
        print(f" Average Macro F1: {avg_macro_f1:.4f}")

-        # NFR4, Early stopping
+        # Early stopping
        if avg_macro_f1 > best_f1:
            best_f1 = avg_macro_f1
            patience_counter = 0