diff --git a/README.md b/README.md
index 9349957..b787b26 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@
 
 ---
 
-## Project Overview
+# README not finished
 
-RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads.
+## Overview
+
+RECLASS is a multitask learning system that uses a shared multilingual transformer encoder with task-specific heads. Single-task implementations are also provided for optional comparison.
 
 | Task | Output | Classes |
 |------|--------|---------|
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
 ## Dataset
 
 - **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
-- **Original size**: 1,069,616 reviews
-- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews)
-- **Annotation target**: 5,000 manually labelled reviews
+- **Original size**: ~1.07M Reviews
+- **After Preprocessing**: ~495K Reviews
+- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword-boosted sample
+
+## Preprocessing Steps
+
+- Removed URLs and emails
+- Normalised text and punctuation
+- Removed duplicate reviews
+- Filtered out reviews with fewer than 5 words
+
+- Output sets
+    -   Original: matches the original distribution of the raw dataset
+    -   Boosted: oversamples bug reports and feature requests using keyword heuristics
+
+## Model
+
+- Encoder: XLM-RoBERTa base (multilingual transformer model)
+- Architecture:
+    - Shared encoder
+    - Task-specific classification heads
+- Training setups:
+    - MTL (Multitask learning)
+    - STL (Single-task learning)
+
+Class weights are applied to reduce imbalance effects.
 
 ## Repository Structure
 
-```
-6013/
-    README.md
-    .gitignore
-    data/
-        uber_reviews.csv           # Raw dataset
-        uber_reviews_cleaned.csv   # Preprocessed reviews
-        uber_reviews_sampled.csv   # Stratified sample for annotation
-        uber_reviews_tagged.csv    # Annotated reviews (in progress)
-    notebooks/
-        preprocessing_uber.ipynb   # Preprocessing analysis
-        uber_cleaned.ipynb         # Cleaned data verification
-    src/
-        preprocess.py              # Text cleaning and filtering pipeline
-        sampler.py                 # Stratified sampling strategies
-        multitag.py                # GUI annotation tool
-        train.py                   # Model training (in progress)
-        infer.py                   # Inference pipeline (in progress)
-        outputs/
-            figures/
-```
+    .
+    ├── data
+    │   └── processed
+    │       ├── boosted_test.csv
+    │       ├── boosted_train.csv
+    │       ├── boosted_val.csv
+    │       ├── original_test.csv
+    │       ├── original_train.csv
+    │       ├── original_val.csv
+    │       └── review.csv
+    ├── notebooks/
+    │
+    ├── outputs
+    │   └── figures/
+    ├── README.md
+    ├── architecture.png
+    └── src
+        ├── dataset.py
+        ├── evaluate.py
+        ├── infer.py
+        ├── model.py
+        ├── multitag.py
+        ├── preprocess.py
+        ├── sampler.py
+        └── train.py
 
-## Current Progress
+## Results
 
-- Manual annotation of 5,000 reviews
-- BERT baseline implementation
-- Multi-task model architecture
-- Training and evaluation
-- Comparative analysis (MTL vs single-task)
-- Final report and presentation
+Evaluation includes Precision, Recall, Macro F1, Confusion matrices and confidence analysis.
+
+Results and summaries are found in `outputs/*.json` and `outputs/figures/`.
 
 ## Installation
 
 ```
-# Clone repository
-...
 # Create conda environment
-...
+conda create -n reclass python=3.11 
+conda activate reclass
+```
+
+```
 # Install dependencies
-...requirements.txt
+conda install --file requirements.txt
 ```
 
 ## Usage
-## References
-## Licenses
+
+#### Train Model
+
+```
+python src/train.py --mode mtl --dataset original
+```
+
+#### Evaluate Model
+
+```
+python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
+```
+
+#### Run Inference
+
+```
+python src/infer.py --mode mtl --model_path <model>.pt --dataset review
+```
+
+## Notes
+
+- The same tokenizer is used across training, evaluation and inference to ensure consistency
+- Sampling and preprocessing choices are documented further in the src files and the dissertation
 
 ---
 
diff --git a/src/dataset.py b/src/dataset.py
index 3530a1a..580726c 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,22 +1,17 @@
 # dataset.py
-# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer
-# Takes a row from the csv, tokenizes the review and returns a tensor
+# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
 import torch
 import pandas as pd
 from torch.utils.data import Dataset
 from transformers import AutoTokenizer
 
 class ReviewDataset(Dataset):
-    """Pytorch Dataset for loading tokenized reviews
+    """
+    Dataset for tokenized reviews with labels for all 4 tasks.
 
     Dataset is for map style datasets like here, instead of using IteratableDataset (better for data streams).
-    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary with of
+    Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
     input tensors and integer labels for all 4 tasks.
-     
-      Args:
-        path (str): Path to the csv file containing the reviews and labels.
-        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
-        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
     """
 
     def __init__(self, path, tokenizer, max_length=256):
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
     def __getitem__(self, idx):
         review = self.df.iloc[idx]['review']
 
-        # encoding['input_ids'] 1D tensor of token ids, shape [max_length]
-        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
-        # Both have shape [1, max_length] because of return_tensors='pt'
-        # Squeeze them to [max_length] with .squeeze(0)
+        # Tokenize with padding and truncation to max_length, returning PyTorch tensors
         encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
         
-        # Returns a dictionary with:
-        #   'input_ids': tensor of shape [max_length]
-        
-        #   'attention_mask': tensor of shape [max_length]
-
-        # MTL structure labels as tensor scalars:
-        #   'bug_report': tensor scalar (torch.tensor(label_value))
-        #   'feature_request': tensor scalar (torch.tensor(label_value))
-        #   'aspect': tensor scalar (torch.tensor(label_value))
-        #   'aspect_sentiment': tensor scalar (torch.tensor(label_value))
         return {
             'input_ids': encoding['input_ids'].squeeze(0),
             'attention_mask': encoding['attention_mask'].squeeze(0),
+
+            # Labels for all 4 tasks, converted to tensors
             'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
             'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
             'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
@@ -65,18 +49,23 @@ class InferenceDataset(Dataset):
                 return len(self.df)
         
         def __getitem__(self, idx):
-                #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews 
                 review = str(self.df.iloc[idx][self.text_column])
+
                 if review == 'nan' or review.strip() == '':
                     review = ' '
+
+                # Same as training dataset but without labels, for inference on test sets
                 encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
                 return {
                         'input_ids': encoding['input_ids'].squeeze(0),
                         'attention_mask': encoding['attention_mask'].squeeze(0),
                 }    
     
-        
+
+
+
 if __name__ == "__main__":
+    # Quick test
     dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
     print(dataset.__getitem__(1))
     
diff --git a/src/evaluate.py b/src/evaluate.py
index 6eb4d77..c34644b 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,4 +1,6 @@
 # evauluate.py
+# Evaluate MTL or STL models on the test split
+
 import os
 import torch
 import time
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel
 
-# TODO: load checkpoint, produce tables of evaluation figures
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
@@ -31,6 +32,7 @@ label_names = {
 }
 
 def parse_args():
+    """Parse command line arguments for evaluation"""
     parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
     parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
     parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
@@ -47,11 +49,13 @@ def main():
     os.makedirs("outputs/figures", exist_ok=True)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
+    # Load test dataset and model
     test = f"data/processed/{args.dataset}_test.csv"
     tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
     test_dataset = ReviewDataset(test, tokenizer)
     test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
+
+    # MTL evaluates all tasks, STL needs to know a single task to evaluate on
     if args.mode == "mtl":
         model = Model().to(device)
         active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -86,6 +90,7 @@ def main():
                 logits = outputs[task]
                 preds = torch.argmax(logits, dim=1)
 
+                # Keep the max softmax probability as a confidence estimate
                 probs = F.softmax(logits, dim=1)
                 confidence = probs.max(dim=1).values
 
@@ -93,6 +98,7 @@ def main():
                 all_preds[task].extend(preds.cpu().numpy())
                 all_confidences[task].extend(confidence.cpu().numpy())
     
+    # Detailed JSON summary along with printed results
     summary = {
         "mode": args.mode,
         "dataset": args.dataset,
@@ -137,7 +143,7 @@ def main():
         print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
         print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
 
-        # save summary to JSON
+        # Store main metrics and full per class report to JSON
         summary["results"][task] = {
             "macro_f1": float(report_dict["macro avg"]["f1-score"]),
             "macro_precision": float(report_dict["macro avg"]["precision"]),
@@ -150,8 +156,7 @@ def main():
             "per_class": report_dict
         }
 
-        # Confusion matrix
-
+        # Confusion matrix for each evaluated task
         cm = confusion_matrix(labels_arr, preds_arr)
         fig, ax = plt.subplots(figsize=(8, 6))
         sns.heatmap(
@@ -172,7 +177,6 @@ def main():
         test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
         test_df[f'{task}_confidence'] = conf_arr
 
-    # to JSON
     run_name = args.task if args.mode == "stl" else "mtl"
     json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
     with open(json_path, "w") as f:
diff --git a/src/infer.py b/src/infer.py
index 3955b8b..776bb33 100644
--- a/src/infer.py
+++ b/src/infer.py
@@ -1,4 +1,6 @@
 # infer.py
+# Run inference using MTL or STL on various inputs (CSV file or user input)
+
 from datetime import datetime
 import os
 import torch
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
 from dataset import InferenceDataset
 from model import Model, SingleTaskModel
 
-
-
 label_names = {
     'bug_report': ['No', 'Yes'],
     'feature_request': ['No', 'Yes'],
@@ -33,9 +33,6 @@ SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
 
-
-
-
 def parse_args():
     parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
     parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
@@ -55,7 +52,7 @@ def main():
     os.makedirs("outputs/inference", exist_ok=True)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    # this section is nearly identical to the first part of evaluate.py
+    # Mirrors the evaluation script, with the addition of interactive modes
     args = parse_args()
     print(f'{"="*50}')
     print(f'{"Starting inference"}')
@@ -70,7 +67,7 @@ def main():
     print("Loading model, tokenizer and datasets ...")
     tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
 
-    # Let the user decide if they want to run inference on the whole dataset or via the shell input
+    # Support CSV and interactive input
     if not args.interactive and not args.text:
         infer = f"data/processed/{args.dataset}.csv"
         infer_df = pd.read_csv(infer)
diff --git a/src/model.py b/src/model.py
index 23b5e2f..706780b 100644
--- a/src/model.py
+++ b/src/model.py
@@ -1,27 +1,13 @@
 # model.py
-# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment)
-# 12 transformer layers, 12 attention heads
+# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison
 
 from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
 import torch.nn as nn
 
-# Using dropout, This has proven to be an effective technique 
-# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
-
-# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
-# Each hidden representation is size 768
-
+# Using dropout before classification to reduce overfitting
 class SingleTaskModel(nn.Module):
-    """Single task model to compare MTL approach to review classification
-    
-    Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
-    just different args
-    
-        Args:
-            task_name: which of the 4 tasks are we training for
-            num_classes: number of output classes for the task
-            dropout_rate: probability applied to cls representation, randomly drops tokens for better results
-        """
+    """Single task model with one head to compare MTL approach to review classification"""
+
     def __init__(self, task_name, num_classes, dropout_rate=0.2):
         super().__init__()
         self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
         return {self.task_name: logits}
 
 class Model(nn.Module): 
-    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
-
-    Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through 
-    shared dropout then ito four linear classification heads. Shared training optimises all tasks simultaneously,
-    allowing the encoder to learn from the shared representations / generalisations
-
-        Args:
-            dropout_rate: probability applied to preven co-adaptation of neurons across heads 0.2 is standard default
-    """
+    """ Multitask model with shared encoder and 4 task specific heads."""
     def __init__(self, dropout_rate=0.2):
         super().__init__()
         self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -58,8 +36,7 @@ class Model(nn.Module):
         self.aspect_head = nn.Linear(hidden_size, 6)
         self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
 
-    # Pass through encoder then extract the token representation through [batch_size, 768]
-    # Apply droupout to it, take scores for each head, return them in a dictionary
+    # Pass through the encoder once, extract the token representation, then reuse the shared representation across all tasks
     def forward(self, input_ids, attention_mask):
         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
         # index 0 from [batch_size, 768]
@@ -67,7 +44,6 @@ class Model(nn.Module):
 
         output = self.dropout(output)
 
-        # Logits for each head:
         bug_logits = self.bug_head(output)
         feature_logits = self.feature_head(output)
         aspect_logits = self.aspect_head(output)
diff --git a/src/multitag.py b/src/multitag.py
index f862941..f9d91ce 100644
--- a/src/multitag.py
+++ b/src/multitag.py
@@ -1,13 +1,9 @@
 # multitag.py
-# This app enables manual annotation of reviews in the Uber dataset, for training with 
-# to achieve review classifications with multi task deep learning
+# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
 
-# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling), 
-#at least that is my prediction of why this may not be as good as I wanted
 import tkinter as tk
 from tkinter import ttk
 import pandas as pd
-# import langdetect
 import os
 
 class MultiTag:
@@ -41,9 +37,6 @@ class MultiTag:
         self.number_of_aspects = 6  # number of aspect buttons
         self.root.title("MultiTag")
 
-        #self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
-        #self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
-
         # Colors for active label
         self.color_incomplete = "#003366"
         self.color_complete = "#00AA00"
@@ -51,8 +44,7 @@ class MultiTag:
         # Paths
         tagged_path = "data/uber_reviews_tagged.csv"
         sampled_path = "data/uber_reviews_sampled.csv"
-        # self.load_review_data("data/uber_reviews_sampled.csv")
-        # self.load_review_data("data/uber_reviews_tagged.csv")
+
         if not os.path.exists(tagged_path):
             print(f"Tagged file did not exist, making one at: {sampled_path}")
             sampled_df = pd.read_csv(sampled_path, low_memory=False)
@@ -89,13 +81,13 @@ class MultiTag:
         self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
 
 
-        #   Labels ROW 3
+        # ROW 3: Field labels
         ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
         ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
         ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
         ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
 
-        # ROW 4 |Buttons| 
+        # ROW 4: Input buttons
         # Feature Requests
         self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
         self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
@@ -132,20 +124,15 @@ class MultiTag:
         self.root.bind("f", self.handle_key)
         self.root.bind("g", self.handle_key)
         self.root.bind("h", self.handle_key)
-        # self.root.bind("j", self.handle_key)
-        # self.root.bind("k", self.handle_key)
-        # self.root.bind("l", self.handle_key)
 
-
-    
+   
         self.display_next_review()
-        #   self.save_tags("data/uber_reviews_tagged.csv")
         self.root.mainloop()
 
     def handle_key(self, event):
         key = event.char
     
-        # Column 0 or 1: feature/bug (1 and 0)
+        # Feature Request and Bug Report are binary input (1 and 0 keys)
         if key in ['1', '0']:
             if self.active_column == 0:
                 self.feature_pressed(key)
@@ -159,7 +146,7 @@ class MultiTag:
             self.sentiment_pressed(key.upper())
 
     def update_status(self):
-        """Update status label and highlight color based on completion state"""
+        """Update status label and highlight"""
         if self.all_labels_complete():
             self.highlight.configure(bg=self.color_complete)
             self.status_label.configure(
@@ -212,22 +199,22 @@ class MultiTag:
 
 
     def load_review_data(self, data_path):
-        """Load review data from a CSV file."""
+        """Load review data from a CSV file. Adds annotation columns if they don't exist."""
         self.review_data = pd.read_csv(data_path, low_memory=False)
         if "tagged" not in self.review_data.columns:
-            self.review_data["tagged"] = 0              # Initialize tagged column if not present
+            self.review_data["tagged"] = 0             
         if "feature_request" not in self.review_data.columns:
-            self.review_data["feature_request"] = ""    # Initialize feature_request column if not present
+            self.review_data["feature_request"] = ""   
         if "bug_report" not in self.review_data.columns:
-            self.review_data["bug_report"] = ""         # Initialize bug_report column if not present
+            self.review_data["bug_report"] = ""         
         if "aspect" not in self.review_data.columns:
-            self.review_data["aspect"] = ""             # Initialize aspect column if not present
+            self.review_data["aspect"] = ""           
         if "aspect_sentiment" not in self.review_data.columns:
-            self.review_data["aspect_sentiment"] = ""   # Initialize aspect_sentiment column if not present
+            self.review_data["aspect_sentiment"] = "" 
         print(f"Loaded {len(self.review_data)} reviews from {data_path}")
     
     def display_next_review(self):
-        """Display the next review in the text box."""
+        """Display the next unlabelled review in the text box."""
         self.current_review_index = self.get_current_review_index()
         if self.current_review_index < len(self.review_data):
             review = self.review_data.iloc[self.current_review_index]
@@ -283,9 +270,8 @@ class MultiTag:
             row["aspect_sentiment"] != "")
     
     def save_tags(self, save_path):
-        """Save the tagged data to a CSV file."""
+        """Save the current tagged data to a CSV file."""
         self.review_data.to_csv(save_path, index=False)
-        # print(f"Tagged data saved to {save_path}")
 
     def quit_app(self, event):
         tagged_count = (self.review_data['tagged'] == 1).sum()
diff --git a/src/preprocess.py b/src/preprocess.py
index 88452e7..af95cab 100644
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,19 +1,13 @@
 # preprocess.py
 
+# Text cleaning and preprocessing for the Uber Reviews Dataset
 # langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
 
 import pandas as pd
 import re
 
 def clean_text(text) -> str:
-    """Clean review text by removing URLS, emails, excessive whitespace
-
-    Input: 
-    text - the review text to clean
-
-    Outputs:
-    str: the cleaned review text
-    """
+    """Normalise review text by removing URLs, emails and excessive whitespace"""
     if pd.isna(text):
         return ""
     
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
     6. Removes less than 5 word reviews
     6. Saves the cleaned dataset to uber_reviews_cleaned.csv
 
-    Inputs:
-    input_path (str): Path to uber_reviews.csv
-    output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
-
-    Outputs:
-    pd.df_clean: the dataframe of cleaned processed reviews
     """
     print("="*50)
     print("PREPROCESSING UBER REVIEWS")
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
     print("="*50)
     print(f"\nFinal dataset: {len(df_clean):,} reviews")
     print(f"Quality filters: word_count >= 5, duplicates removed") 
-    # while this does remove a some legitimate reviews which would provide use in classification
-    # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5 
-    # it showed the most amount of formative reviews without seeming excessive in data removal
-    
     print("\nRating distribution:")
     rating_dist = df_clean['rating'].value_counts().sort_index()
     for rating, count in rating_dist.items():
@@ -137,10 +121,7 @@ def preprocess_uber_reviews(input_path, output_path):
     print(f"  Short reviews: {df_clean[df_clean['word_count'] < 5]}")
     print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
     print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
-    # lang detection takes 5+ mins so leaving it commented for now 
-    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
-    #print(f"  Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
-    
+
     # Sample reviews from each rating
     print("\n" + "="*50)
     print("SAMPLE CLEANED REVIEWS")
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
             for index, row in sample.iterrows():
                 print(f"  • ({row['word_count']} words) {row['review'][:100]}")
     
-    # Note about language
-    print("Language detection not applied due to unreliability on short")
-    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
-    print(" ...Manual annotation phase will identify any non-English reviews")
-    
     return df_clean
 
 if __name__ == "__main__":
diff --git a/src/sampler.py b/src/sampler.py
index bf03d73..152489d 100644
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -11,11 +11,12 @@ class Sampler:
     def __init__(self, data_path, target_samples):
 
         self.data_path = data_path
-        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
+        # Default stratification method is based on original rating distribution
+        self.stratify_column = "rating"  
 
         self.original_data = pd.read_csv(original_path, low_memory=False)
         self.data = pd.read_csv(self.data_path, low_memory=False)
-        self.total = len(self.data)  # total number of records in the dataset
+        self.total = len(self.data)  # total number of records in the working dataset
 
         print("="*50)
         print("SAMPLER INITIALIZED")
@@ -35,25 +36,10 @@ class Sampler:
         print((_origdist*100).round(1),"\n")
 
         self.data.info(verbose=True)
+    """
+    Kept for reference with later sampling methods
 
-    #   add sampling method here
-    #   random sample 5000 entries with stratifiying by rating
-    """
-    rating
-    5    57.1% (611133)
-    1    26.5% (283895)
-    4     7.8% (82953)
-    3     4.7% (49928)
-    2     3.9% (41707)
-    Name: proportion, dtype: object
-    """
-    
-    """
-    IGNORE --- Left in just in case
-
-    Sample randomly
-    Redundant calculation
-    Doesn't factor that the distribution changed greatly after preprocessing
+    Samples from current processed data rather than matching the original distribution
     """
     def get_stratified_sample(self) -> pd.DataFrame:
            stratified_sample = (
@@ -67,9 +53,8 @@ class Sampler:
     
     def sample_col(self, column) -> pd.DataFrame:    
         """
-        IGNORE --- Left in just in case
-
-        Randomly sample, including conflicting math, I guess I was going to stratify
+        Samples a proportional number of rows from one column
+        Deprecated: Not used in final pipeline, kept for reference
         """
         samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
         samples_per_column = max(samples_per_column,1) # also pointless
@@ -77,24 +62,9 @@ class Sampler:
 
 
     """
-    original_distribution_sample()
-    The main sampling method for our labelling as it 
-    keeps composition of the original uber dataset, verified in 
-    which is a fairer comparison, may also work better in general
-
-    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
-    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
-
-    
-    manually coded distributions taken from notebooks
-
-    for ratings and actual number of samples 
-    rating data is the whole data for a rating as we iterate
-    has error handling if totals doesn't match the required amount of samples per the orig distrib
-    randomise the indexes (samples) and appends to the new dataset
-
-
-
+    Main sampling method used to build the annotation set
+    Samples reviews matching the original raw dataset distribution, so the labelled set
+    better represents the original data and is more comparable to the unlabelled set.
     """
     def original_distribution_sample(self):
         original_dist = {
@@ -117,21 +87,14 @@ class Sampler:
         return original_sample
     
     """
-    sample_with_keywords()
-
-    In order to train on more bugs and features data in 
-    future this method was created
+    Build a sample enriched with likely bug reports and feature requests
     - 2000 balanced by rating (400 per)
     - 1500 likely bugs using bug_keywords list
     - 1500 likely features using feature_keywords list
-
-    inputs:
-    outputs:
-    
     """
 
     def sample_with_keywords(self):
-        #TODO add keywords for feature classification
+        # Keyword lists for oversampling likely bug reports and feature requests
         print(f"\n{'='*50}")
         print("Keyword influenced / rating stratified set")
         print(f"\n{'='*50}")
@@ -181,18 +144,16 @@ class Sampler:
         # Drop helper columns
         keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
 
-        
-        
         print(f"\n Total samples: {len(keyword_sample):,}")
         return keyword_sample
 
     def sample_tiny_size(self):
-        mini_sample = self.data.sample(200)     #   reading some samples manually
+        mini_sample = self.data.sample(200)     # for reading some samples manually
         return mini_sample
 
     
     def save_sample(self, sample_df,output_path):
-        """Save sample and display statistics"""
+        """Save sample and display summary statistics"""
         sample_df.to_csv(output_path, index=False)
         
         print(f"\n{'='*50}")
diff --git a/src/train.py b/src/train.py
index 7e00aea..da5a0dd 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,5 +1,6 @@
 # train.py
-# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+# Training script for both MTL and STL setups 
+# Structure adapted and adjusted from standard PyTorch training loops
 import argparse
 import os
 from datetime import datetime
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
 from sklearn.metrics import classification_report, f1_score
 from sklearn.utils.class_weight import compute_class_weight
 
-
 from dataset import ReviewDataset
 from model import Model, SingleTaskModel
 
 
 
-
-# =======================================================================
-#         Training script for MTL and STL training configurations
-# =======================================================================
-
-# NFR5, reproducibility
+# Fixed seed for near-reproducible runs
 SEED = 4321
 torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)
 
 
-# ------------------- Class weights -------------------
-# Using weights inversely proportional to class frequencies to avoid majority class bias, 
-# prioritize useful bug reports / feature requests
+
 def compute_weights(df, column, device):
-    """Copmutes inverse frequency class weights for a label column
-    
-    Uses sklearns balanced mode
-    Rare classes receive higher weights to penalise so it can learn more from less
-    """
+    """Computes inverse frequency class weights for a label column"""
     classes = np.unique(df[column])
     weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
     return torch.tensor(weights, dtype=torch.float).to(device)
 
-# parse_args() - NFR7 and NFR9
-#   Example Usages: python src/train.py --dataset boosted
-#   python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
+
 def parse_args():
     parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
     parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
@@ -67,23 +54,25 @@ def main():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("Starting training...", flush=True)
     print("Using device:", device)
-    # Set cuda seeds for reproducibility
+
+    # Set cuda seeds for reproducibility on GPU
     if torch.cuda.is_available():
         print("GPU:", torch.cuda.get_device_name(0))
         torch.cuda.manual_seed_all(SEED)
         torch.cuda.manual_seed(SEED)
     print(f"Using dataset: {args.dataset.upper()}")
+
     # Force deterministic for reproducibility at a slight performance cost
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
-    # load data
+    # Paths to the pre-split train/val CSVs for the selected dataset
     train = f"data/processed/{args.dataset}_train.csv"
     val = f"data/processed/{args.dataset}_val.csv"
     os.makedirs("outputs", exist_ok=True)
     os.makedirs("runs", exist_ok=True)
 
-    # FR1, FR2, Multilingual tokenizer initilization
+    # Tokenizer initialization
     tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
 
     train_dataset = ReviewDataset(train, tokenizer)
@@ -92,7 +81,7 @@ def main():
     training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
     validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 
-    # FR3, shared multilingual model with task-specific heads
+    # MTL model shares one encoder across all tasks; STL model trains one task at a time
     if args.mode == "mtl":
         model = Model().to(device)
         active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -113,7 +102,7 @@ def main():
     
     train_df = pd.read_csv(train)
     
-    # Class weights
+    # Compute per-task weights from the training split
     print("\n Computing class weights...")
     bug_weights = compute_weights(train_df, 'bug_report', device)
     feature_weights = compute_weights(train_df, 'feature_request', device)
@@ -151,7 +140,7 @@ def main():
         num_training_steps=total_steps
     )
 
-    # ------------------- Training loop -------------------
+    # Set up the training loop, with TensorBoard logging and early stopping based on validation macro F1 score
     start_time = time.time()
     timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
     writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -175,7 +164,7 @@ def main():
             input_ids = batch["input_ids"].to(device)
             attention_mask = batch["attention_mask"].to(device)
 
-            # FR8, Multitask forward pass
+            # Multitask forward pass
             outputs = model(input_ids, attention_mask)
             
             loss = 0
@@ -199,7 +188,7 @@ def main():
         writer.add_scalar("Loss/train", avg_train_loss, epoch) 
         print(f"Average training loss: {avg_train_loss:.4f}")
 
-        # -------------------- Validation loop -------------------
+        # Validation phase
         model.eval()
         total_val_loss = 0.0
 
@@ -226,7 +215,7 @@ def main():
         avg_vloss = total_val_loss / len(validation_loader)
         writer.add_scalar("Loss/val", avg_vloss, epoch)
 
-        # FR11, Performance evaluation
+        # Performance evaluation summary
         print("\nValidation Metrics (MACRO F1):")    
         epoch_f1 = []
         for task in active_tasks:
@@ -239,7 +228,7 @@ def main():
         writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
         print(f" Average Macro F1: {avg_macro_f1:.4f}")
 
-        # NFR4, Early stopping
+        # Early stopping
         if avg_macro_f1 > best_f1:
             best_f1 = avg_macro_f1
             patience_counter = 0