added further documentation across all files

2026-04-05 14:19:57 +01:00
parent 7fa67af6c0
commit 1cca27e0b8
9 changed files with 173 additions and 246 deletions
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@
 ---
-## Project Overview
+# README not finished
-RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads.
+## Overview
 RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison.
 | Task | Output | Classes |
 |------|--------|---------|
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
 ## Dataset
 - **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
- **Original size**: 1,069,616 reviews
+- **Original size**: ~1.07M Reviews
- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews)
+- **After Preprocessing**: ~495K Reviews
- **Annotation target**: 5,000 manually labelled reviews
+- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword boosted sample
 ## Preprocessing Steps
 - Removed URLS and emails
 - Normalised text and punctuation
 - Removed duplicate reviews
 - Filtered out reviews shorter than 5 words
 - Output sets
    -   Original: matches the original distribution of the raw dataset
    -   Boosted: oversamples bug reports and feature requests using keyword heuristics
 ## Model
 - Encoder: XLM-RoBERTa base (multilingual transformer model)
 - Architecture:
    - Shared encoder
    - Task-specific classification heads
 - Training setups:
    - MTL (Multitask learning)
    - STL (Single-task learning)
 Class weights are applied to reduce imbalance effects.
 ## Repository Structure
-```
+.
-6013/
+├── data
-    README.md
+│   └── processed
-    .gitignore
+│       ├── boosted_test.csv
-    data/
+│       ├── boosted_train.csv
-        uber_reviews.csv           # Raw dataset
+│       ├── boosted_val.csv
-        uber_reviews_cleaned.csv   # Preprocessed reviews
+│       ├── original_test.csv
-        uber_reviews_sampled.csv   # Stratified sample for annotation
+│       ├── original_train.csv
-        uber_reviews_tagged.csv    # Annotated reviews (in progress)
+│       ├── original_val.csv
-    notebooks/
+│       └── review.csv
-        preprocessing_uber.ipynb   # Preprocessing analysis
+├── notebooks/
-        uber_cleaned.ipynb         # Cleaned data verification
+│   
-    src/
+├── outputs
-        preprocess.py              # Text cleaning and filtering pipeline
+│   └── figures/
-        sampler.py                 # Stratified sampling strategies
+├── README.md
-        multitag.py                # GUI annotation tool
+├── architecture.png
-        train.py                   # Model training (in progress)
+└── src
-        infer.py                   # Inference pipeline (in progress)
+    ├── dataset.py
-        outputs/
+    ├── evaluate.py
-            figures/
+    ├── infer.py
-```
+    ├── model.py
    ├── multitag.py
    ├── preprocess.py
    ├── sampler.py
    └── train.py
-## Current Progress
+## Results
- Manual annotation of 5,000 reviews
+Evaluation includes Precision, Recall, Macro F1, Confusion matrices and confidence analysis.
- BERT baseline implementation
+
- Multi-task model architecture
+Results and summaries are found in outputs/*.json and outputs/figures/
 - Training and evaluation
 - Comparative analysis (MTL vs single-task)
 - Final report and presentation
 ## Installation
 ```
 # Clone repository
 ...
 # Create conda environment
-...
+conda create -n reclass python=3.11 
 conda activate reclass
 ```
 ```
 # Install dependencies
-...requirements.txt
+conda install --file requirements.txt
 ```
 ## Usage
-## References
+
-## Licenses
+#### Train Model
 ```
 python src/train.py --mode mtl --dataset original
 ```
 #### Evaluate Model
 ```
 python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
 ```
 #### Run Inference
 ```
 python src/infer.py --mode mtl --model_path <model>.pt --dataset review
 ```
 ## Notes
 - The same tokenizer is used across training, evaluation and inference to ensure consistency
 - Sampling and preprocessing choices are documented further in src files and dissertation
 ---
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,22 +1,17 @@
 # dataset.py
-# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer
+# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
 # Takes a row from the csv, tokenizes the review and returns a tensor
 import torch
 import pandas as pd
 from torch.utils.data import Dataset
 from transformers import AutoTokenizer
 class ReviewDataset(Dataset):
-    """Pytorch Dataset for loading tokenized reviews
+    """
    Dataset for tokenized reviews with labels for all 4 tasks.
    A map-style Dataset is used here rather than an IterableDataset (which is better suited to data streams).
-    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary with of
+    Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
    input tensors and integer labels for all 4 tasks.
      Args:
        path (str): Path to the csv file containing the reviews and labels.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
    """
    def __init__(self, path, tokenizer, max_length=256):
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
    def __getitem__(self, idx):
        review = self.df.iloc[idx]['review']
-        # encoding['input_ids'] 1D tensor of token ids, shape [max_length]
+        # Tokenize with padding and truncation to max_length, returning PyTorch tensors
        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
        # Both have shape [1, max_length] because of return_tensors='pt'
        # Squeeze them to [max_length] with .squeeze(0)
        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        # Returns a dictionary with:
        #   'input_ids': tensor of shape [max_length]
        #   'attention_mask': tensor of shape [max_length]
        # MTL structure labels as tensor scalars:
        #   'bug_report': tensor scalar (torch.tensor(label_value))
        #   'feature_request': tensor scalar (torch.tensor(label_value))
        #   'aspect': tensor scalar (torch.tensor(label_value))
        #   'aspect_sentiment': tensor scalar (torch.tensor(label_value))
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Labels for all 4 tasks, converted to tensors
            'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
            'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
            'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
@@ -65,10 +49,12 @@ class InferenceDataset(Dataset):
                return len(self.df)
        def __getitem__(self, idx):
                #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews 
                review = str(self.df.iloc[idx][self.text_column])
                if review == 'nan' or review.strip() == '':
                    review = ' '
                # Same as training dataset but without labels, for inference on test sets
                encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
                return {
                        'input_ids': encoding['input_ids'].squeeze(0),
@@ -76,7 +62,10 @@ class InferenceDataset(Dataset):
                }    
 if __name__ == "__main__":
    # Quick test
    dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
    print(dataset.__getitem__(1))
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,4 +1,6 @@
 # evaluate.py
 # Evaluate MTL or STL models on the test split
 import os
 import torch
 import time
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel
 # TODO: load checkpoint, produce tables of evaluation figures
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
@@ -31,6 +32,7 @@ label_names = {
 }
 def parse_args():
    """Parse command line arguments for evaluation"""
    parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
    parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
    parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
@@ -47,11 +49,13 @@ def main():
    os.makedirs("outputs/figures", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
+    # Load test dataset and model
    test = f"data/processed/{args.dataset}_test.csv"
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    test_dataset = ReviewDataset(test, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
    # MTL evaluates all tasks, STL needs to know a single task to evaluate on
    if args.mode == "mtl":
        model = Model().to(device)
        active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -86,6 +90,7 @@ def main():
                logits = outputs[task]
                preds = torch.argmax(logits, dim=1)
                # Keep max softmax as confidence estimate
                probs = F.softmax(logits, dim=1)
                confidence = probs.max(dim=1).values
@@ -93,6 +98,7 @@ def main():
                all_preds[task].extend(preds.cpu().numpy())
                all_confidences[task].extend(confidence.cpu().numpy())
    # Detailed JSON summary along with printed results
    summary = {
        "mode": args.mode,
        "dataset": args.dataset,
@@ -137,7 +143,7 @@ def main():
        print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
        print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
-        # save summary to JSON
+        # Store main metrics and full per class report to JSON
        summary["results"][task] = {
            "macro_f1": float(report_dict["macro avg"]["f1-score"]),
            "macro_precision": float(report_dict["macro avg"]["precision"]),
@@ -150,8 +156,7 @@ def main():
            "per_class": report_dict
        }
-        # Confusion matrix
+        # Confusion matrix for each evaluated task
        cm = confusion_matrix(labels_arr, preds_arr)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
@@ -172,7 +177,6 @@ def main():
        test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
        test_df[f'{task}_confidence'] = conf_arr
    # to JSON
    run_name = args.task if args.mode == "stl" else "mtl"
    json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
    with open(json_path, "w") as f:
--- a/src/infer.py
+++ b/src/infer.py
@@ -1,4 +1,6 @@
 # infer.py
 # Run inference using MTL or STL on various inputs (CSV or User)
 from datetime import datetime
 import os
 import torch
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
 from dataset import InferenceDataset
 from model import Model, SingleTaskModel
 label_names = {
    'bug_report': ['No', 'Yes'],
    'feature_request': ['No', 'Yes'],
@@ -33,9 +33,6 @@ SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
 def parse_args():
    parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
    parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
@@ -55,7 +52,7 @@ def main():
    os.makedirs("outputs/inference", exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    # this section is nearly identical to the first part of evaluate.py
+    # Mirrors the evaluation script with addition of interactive modes
    args = parse_args()
    print(f'{"="*50}')
    print(f'{"Starting inference"}')
@@ -70,7 +67,7 @@ def main():
    print("Loading model, tokenizer and datasets ...")
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
-    # Let the user decide if they want to run inference on the whole dataset or via the shell input
+    # Support CSV and interactive input
    if not args.interactive and not args.text:
        infer = f"data/processed/{args.dataset}.csv"
        infer_df = pd.read_csv(infer)
--- a/src/model.py
+++ b/src/model.py
@@ -1,27 +1,13 @@
 # model.py
-# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment)
+# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison
 # 12 transformer layers, 12 attention heads
 from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
 import torch.nn as nn
-# Using dropout, This has proven to be an effective technique 
+# Using dropout before classification to reduce overfitting
 # for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
 # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
 # Each hidden representation is size 768
 class SingleTaskModel(nn.Module):
-    """Single task model to compare MTL approach to review classification
+    """Single task model with one head to compare MTL approach to review classification"""
    Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
    just different args
        Args:
            task_name: which of the 4 tasks are we training for
            num_classes: number of output classes for the task
            dropout_rate: dropout probability applied to the cls representation (randomly zeroes features for regularization)
        """
    def __init__(self, task_name, num_classes, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
        return {self.task_name: logits}
 class Model(nn.Module): 
-    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
+    """ Multitask model with shared encoder and 4 task specific heads."""
    Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through 
    shared dropout then into four linear classification heads. Shared training optimises all tasks simultaneously,
    allowing the encoder to learn from the shared representations / generalisations
        Args:
            dropout_rate: probability applied to prevent co-adaptation of neurons across heads; 0.2 is a standard default
    """
    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -58,8 +36,7 @@ class Model(nn.Module):
        self.aspect_head = nn.Linear(hidden_size, 6)
        self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
-    # Pass through encoder then extract the token representation through [batch_size, 768]
+    # Pass through encoder once, extract the token representation, then reuse the shared representation across all tasks
    # Apply dropout to it, take scores for each head, return them in a dictionary
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # index 0 from [batch_size, 768]
@@ -67,7 +44,6 @@ class Model(nn.Module):
        output = self.dropout(output)
        # Logits for each head:
        bug_logits = self.bug_head(output)
        feature_logits = self.feature_head(output)
        aspect_logits = self.aspect_head(output)
--- a/src/multitag.py
+++ b/src/multitag.py
@@ -1,13 +1,9 @@
 # multitag.py
-# This app enables manual annotation of reviews in the Uber dataset, for training with 
+# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
 # to achieve review classifications with multi task deep learning
 # In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling), 
 #at least that is my prediction of why this may not be as good as I wanted
 import tkinter as tk
 from tkinter import ttk
 import pandas as pd
 # import langdetect
 import os
 class MultiTag:
@@ -41,9 +37,6 @@ class MultiTag:
        self.number_of_aspects = 6  # number of aspect buttons
        self.root.title("MultiTag")
        #self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
        #self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
        # Colors for active label
        self.color_incomplete = "#003366"
        self.color_complete = "#00AA00"
@@ -51,8 +44,7 @@ class MultiTag:
        # Paths
        tagged_path = "data/uber_reviews_tagged.csv"
        sampled_path = "data/uber_reviews_sampled.csv"
-        # self.load_review_data("data/uber_reviews_sampled.csv")
+
        # self.load_review_data("data/uber_reviews_tagged.csv")
        if not os.path.exists(tagged_path):
            print(f"Tagged file did not exist, making one at: {sampled_path}")
            sampled_df = pd.read_csv(sampled_path, low_memory=False)
@@ -89,13 +81,13 @@ class MultiTag:
        self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
-        #   Labels ROW 3
+        # ROW 3: Field labels
        ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
        ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
        ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
        ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
-        # ROW 4 |Buttons| 
+        # ROW 4: Input buttons
        # Feature Requests
        self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
        self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
@@ -132,20 +124,15 @@ class MultiTag:
        self.root.bind("f", self.handle_key)
        self.root.bind("g", self.handle_key)
        self.root.bind("h", self.handle_key)
        # self.root.bind("j", self.handle_key)
        # self.root.bind("k", self.handle_key)
        # self.root.bind("l", self.handle_key)
        self.display_next_review()
        #   self.save_tags("data/uber_reviews_tagged.csv")
        self.root.mainloop()
    def handle_key(self, event):
        key = event.char
-        # Column 0 or 1: feature/bug (1 and 0)
+        # Feature Request and Bug Report are binary input (1 and 0 keys)
        if key in ['1', '0']:
            if self.active_column == 0:
                self.feature_pressed(key)
@@ -159,7 +146,7 @@ class MultiTag:
            self.sentiment_pressed(key.upper())
    def update_status(self):
-        """Update status label and highlight color based on completion state"""
+        """Update status label and highlight"""
        if self.all_labels_complete():
            self.highlight.configure(bg=self.color_complete)
            self.status_label.configure(
@@ -212,22 +199,22 @@ class MultiTag:
    def load_review_data(self, data_path):
-        """Load review data from a CSV file."""
+        """Load review data from a CSV file. Adds annotation columns if they don't exist."""
        self.review_data = pd.read_csv(data_path, low_memory=False)
        if "tagged" not in self.review_data.columns:
-            self.review_data["tagged"] = 0              # Initialize tagged column if not present
+            self.review_data["tagged"] = 0             
        if "feature_request" not in self.review_data.columns:
-            self.review_data["feature_request"] = ""    # Initialize feature_request column if not present
+            self.review_data["feature_request"] = ""   
        if "bug_report" not in self.review_data.columns:
-            self.review_data["bug_report"] = ""         # Initialize bug_report column if not present
+            self.review_data["bug_report"] = ""         
        if "aspect" not in self.review_data.columns:
-            self.review_data["aspect"] = ""             # Initialize aspect column if not present
+            self.review_data["aspect"] = ""           
        if "aspect_sentiment" not in self.review_data.columns:
-            self.review_data["aspect_sentiment"] = ""   # Initialize aspect_sentiment column if not present
+            self.review_data["aspect_sentiment"] = "" 
        print(f"Loaded {len(self.review_data)} reviews from {data_path}")
    def display_next_review(self):
-        """Display the next review in the text box."""
+        """Display the next unlabelled review in the text box."""
        self.current_review_index = self.get_current_review_index()
        if self.current_review_index < len(self.review_data):
            review = self.review_data.iloc[self.current_review_index]
@@ -283,9 +270,8 @@ class MultiTag:
            row["aspect_sentiment"] != "")
    def save_tags(self, save_path):
-        """Save the tagged data to a CSV file."""
+        """Save the current tagged data to a CSV file."""
        self.review_data.to_csv(save_path, index=False)
        # print(f"Tagged data saved to {save_path}")
    def quit_app(self, event):
        tagged_count = (self.review_data['tagged'] == 1).sum()
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,19 +1,13 @@
 # preprocess.py
 # Text cleaning and preprocessing for the Uber Reviews Dataset
 # langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
 import pandas as pd
 import re
 def clean_text(text) -> str:
-    """Clean review text by removing URLS, emails, excessive whitespace
+    """Normalise review text by removing URLS, emails, excessive whitespace"""
    Input: 
    text - the review text to clean
    Outputs:
    str: the cleaned review text
    """
    if pd.isna(text):
        return ""
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
    6. Removes less than 5 word reviews
    7. Saves the cleaned dataset to uber_reviews_cleaned.csv
    Inputs:
    input_path (str): Path to uber_reviews.csv
    output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
    Outputs:
    pd.df_clean: the dataframe of cleaned processed reviews
    """
    print("="*50)
    print("PREPROCESSING UBER REVIEWS")
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
    print("="*50)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Quality filters: word_count >= 5, duplicates removed") 
    # while this does remove a some legitimate reviews which would provide use in classification
    # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5 
    # it showed the most amount of formative reviews without seeming excessive in data removal
    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
@@ -137,9 +121,6 @@ def preprocess_uber_reviews(input_path, output_path):
    print(f"  Short reviews: {df_clean[df_clean['word_count'] < 5]}")
    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
    # lang detection takes 5+ mins so leaving it commented for now 
    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
    #print(f"  Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
    # Sample reviews from each rating
    print("\n" + "="*50)
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
            for index, row in sample.iterrows():
                print(f"  • ({row['word_count']} words) {row['review'][:100]}")
    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
    print(" ...Manual annotation phase will identify any non-English reviews")
    return df_clean
 if __name__ == "__main__":
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -11,11 +11,12 @@ class Sampler:
    def __init__(self, data_path, target_samples):
        self.data_path = data_path
-        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
+        # Default stratification method is based on original rating distribution
        self.stratify_column = "rating"  
        self.original_data = pd.read_csv(original_path, low_memory=False)
        self.data = pd.read_csv(self.data_path, low_memory=False)
-        self.total = len(self.data)  # total number of records in the dataset
+        self.total = len(self.data)  # total number of records in the working dataset
        print("="*50)
        print("SAMPLER INITIALIZED")
@@ -35,25 +36,10 @@ class Sampler:
        print((_origdist*100).round(1),"\n")
        self.data.info(verbose=True)
    #   add sampling method here
    #   random sample 5000 entries with stratifiying by rating
    """
    rating
    5    57.1% (611133)
    1    26.5% (283895)
    4     7.8% (82953)
    3     4.7% (49928)
    2     3.9% (41707)
    Name: proportion, dtype: object
    """
    Kept for reference with later sampling methods
-    """
+    Samples from current processed data rather than matching the original distribution
    IGNORE --- Left in just in case
    Sample randomly
    Redundant calculation
    Doesn't factor that the distribution changed greatly after preprocessing
    """
    def get_stratified_sample(self) -> pd.DataFrame:
           stratified_sample = (
@@ -67,9 +53,8 @@ class Sampler:
    def sample_col(self, column) -> pd.DataFrame:    
        """
-        IGNORE --- Left in just in case
+        Samples a proportional number of rows from one column
-
+        Deprecated: Not used in final pipeline, kept for reference
        Randomly sample, including conflicting math, I guess I was going to stratify
        """
        samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
        samples_per_column = max(samples_per_column,1) # also pointless
@@ -77,24 +62,9 @@ class Sampler:
    """
-    original_distribution_sample()
+    Main sampling method to annotate
-    The main sampling method for our labelling as it 
+    Samples reviews matching the original raw dataset distribution, so the labelled set
-    keeps composition of the original uber dataset, verified in 
+    better represents the original data and is more comparable to the unlabelled set.
    which is a fairer comparison, may also work better in general
    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
    manually coded distributions taken from notebooks
    for ratings and actual number of samples 
    rating data is the whole data for a rating as we iterate
    has error handling if totals doesn't match the required amount of samples per the orig distrib
    randomise the indexes (samples) and appends to the new dataset
    """
    def original_distribution_sample(self):
        original_dist = {
@@ -117,21 +87,14 @@ class Sampler:
        return original_sample
    """
-    sample_with_keywords()
+    Build a sample with more likely bugs and feature reviews
    In order to train on more bugs and features data in 
    future this method was created
    - 2000 balanced by rating (400 per)
    - 1500 likely bugs using bug_keywords list
    - 1500 likely features using feature_keywords list
    inputs:
    outputs:
    """
    def sample_with_keywords(self):
-        #TODO add keywords for feature classification
+        # Keyword lists for oversampling likely bug reports and feature requests
        print(f"\n{'='*50}")
        print("Keyword influenced / rating stratified set")
        print(f"\n{'='*50}")
@@ -181,18 +144,16 @@ class Sampler:
        # Drop helper columns
        keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
        print(f"\n Total samples: {len(keyword_sample):,}")
        return keyword_sample
    def sample_tiny_size(self):
-        mini_sample = self.data.sample(200)     #   reading some samples manually
+        mini_sample = self.data.sample(200)     # for reading some samples manually
        return mini_sample
    def save_sample(self, sample_df,output_path):
-        """Save sample and display statistics"""
+        """Save sample and display summary statistics"""
        sample_df.to_csv(output_path, index=False)
        print(f"\n{'='*50}")
--- a/src/train.py
+++ b/src/train.py
@@ -1,5 +1,6 @@
 # train.py
-# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+# Training script for both MTL and STL setups 
 # Structure adapted and adjusted from standard PyTorch training loops
 import argparse
 import os
 from datetime import datetime
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
 from sklearn.metrics import classification_report, f1_score
 from sklearn.utils.class_weight import compute_class_weight
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel
-
+# Fixed seed for near-reproducible runs
 # =======================================================================
 #         Training script for MTL and STL training configurations
 # =======================================================================
 # NFR5, reproducibility
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)
 # ------------------- Class weights -------------------
 # Using weights inversely proportional to class frequencies to avoid majority class bias, 
 # prioritize useful bug reports / feature requests
 def compute_weights(df, column, device):
    """Copmutes inverse frequency class weights for a label column
-    Uses sklearns balanced mode
+def compute_weights(df, column, device):
-    Rare classes receive higher weights to penalise so it can learn more from less
+    """Computes inverse frequency class weights for a label column"""
    """
    classes = np.unique(df[column])
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
    return torch.tensor(weights, dtype=torch.float).to(device)
-# parse_args() - NFR7 and NFR9
+
 #   Example Usages: python src/train.py --dataset boosted
 #   python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
 def parse_args():
    parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
    parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
@@ -67,23 +54,25 @@ def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Starting training...", flush=True)
    print("Using device:", device)
-    # Set cuda seeds for reproducibility
+
    # Set cuda seeds for reproducibility on GPU
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(SEED)
        torch.cuda.manual_seed(SEED)
    print(f"Using dataset: {args.dataset.upper()}")
    # Force deterministic for reproducibility at a slight performance cost
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
-    # load data
+    # load data into train/val splits
    train = f"data/processed/{args.dataset}_train.csv"
    val = f"data/processed/{args.dataset}_val.csv"
    os.makedirs("outputs", exist_ok=True)
    os.makedirs("runs", exist_ok=True)
-    # FR1, FR2, Multilingual tokenizer initilization
+    # Tokenizer initialization
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    train_dataset = ReviewDataset(train, tokenizer)
@@ -92,7 +81,7 @@ def main():
    training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
-    # FR3, shared multilingual model with task-specific heads
+    # Shared model uses encoder across all tasks, STL model trains one task at a time
    if args.mode == "mtl":
        model = Model().to(device)
        active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -113,7 +102,7 @@ def main():
    train_df = pd.read_csv(train)
-    # Class weights
+    # Compute per-task weights from the training split
    print("\n Computing class weights...")
    bug_weights = compute_weights(train_df, 'bug_report', device)
    feature_weights = compute_weights(train_df, 'feature_request', device)
@@ -151,7 +140,7 @@ def main():
        num_training_steps=total_steps
    )
-    # ------------------- Training loop -------------------
+    # Entry point for training loop, with Tensorboard logging and early stopping based on validation macro F1 score
    start_time = time.time()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -175,7 +164,7 @@ def main():
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
-            # FR8, Multitask forward pass
+            # Multitask forward pass
            outputs = model(input_ids, attention_mask)
            loss = 0
@@ -199,7 +188,7 @@ def main():
        writer.add_scalar("Loss/train", avg_train_loss, epoch) 
        print(f"Average training loss: {avg_train_loss:.4f}")
-        # -------------------- Validation loop -------------------
+        # Validation phase
        model.eval()
        total_val_loss = 0.0
@@ -226,7 +215,7 @@ def main():
        avg_vloss = total_val_loss / len(validation_loader)
        writer.add_scalar("Loss/val", avg_vloss, epoch)
-        # FR11, Performance evaluation
+        # Performance evaluation summary
        print("\nValidation Metrics (MACRO F1):")    
        epoch_f1 = []
        for task in active_tasks:
@@ -239,7 +228,7 @@ def main():
        writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
        print(f" Average Macro F1: {avg_macro_f1:.4f}")
-        # NFR4, Early stopping
+        # Early stopping
        if avg_macro_f1 > best_f1:
            best_f1 = avg_macro_f1
            patience_counter = 0