diff --git a/src/dataset.py b/src/dataset.py
index e5b8ccc..e3963f3 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,5 +1,5 @@
 # dataset.py
-
+# tokenize data using (sentencepiece) XLM-RoBERTa
 # Takes a row from the csv, tokenizes the review and returns a tensor
 import torch
 import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
 from transformers import AutoTokenizer
 
 class ReviewDataset(Dataset):
+    """Pytorch Dataset for loading tokenized reviews
+
+    Subclasses the map-style Dataset rather than IterableDataset (which is better suited to data streams).
+    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
+    input tensors and integer labels for all 4 tasks.
+
+    Args:
+        path (str): Path to the csv file containing the reviews and labels.
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
+        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256 (128 would have dropped about half of the minority classes).
+    """
+
     def __init__(self, path, tokenizer, max_length=256):
         self.df = pd.read_csv(path)
         self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
         # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
         # Both have shape [1, max_length] because of return_tensors='pt'
         # Squeeze them to [max_length] with .squeeze(0)
-        encoding = self.tokenizer(
-                review,
-                max_length=self.max_length,
-                padding='max_length',
-                truncation=True,
-                return_tensors='pt'
-            )
+        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
         
         # Returns a dictionary with:
         #   'input_ids': tensor of shape [max_length]
diff --git a/src/model.py b/src/model.py
index f74b849..e6cbe70 100644
--- a/src/model.py
+++ b/src/model.py
@@ -11,7 +11,17 @@ import torch.nn as nn
 # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
 # Each hidden representation is size 768
 
-class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
+class SingleTaskModel(nn.Module):
+    """Single task model to compare MTL approach to review classification
+    
+    Same XLM-RoBERTa encoder but with only one head; returns the same dictionary format so the
+    training loop is the same, just with different args.
+
+    Args:
+        task_name: which of the 4 tasks we are training for
+        num_classes: number of output classes for the task
+        dropout_rate: dropout probability applied to the cls representation; randomly zeroes elements of it (not whole tokens) for regularisation
+    """
     def __init__(self, task_name, num_classes, dropout_rate=0.2):
         super().__init__()
         self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
         logits = self.head(output)
         return {self.task_name: logits}
 
-class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
-    def __init__(self, dropout_rate=0.2): # Try other p values
+class Model(nn.Module): 
+    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
+
+    Architecture: XLM-RoBERTa base (12 layers, 768 hidden size); the cls token representation is passed through
+    a shared dropout and then into four linear classification heads. Joint training optimises all tasks simultaneously,
+    allowing the encoder to learn shared representations that generalise across tasks.
+
+    Args:
+        dropout_rate: dropout probability, applied to prevent co-adaptation of neurons across heads; 0.2 is a standard default
+    """
+    def __init__(self, dropout_rate=0.2):
         super().__init__()
         self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
 
         hidden_size = self.encoder.config.hidden_size
 
-        # Applied across whole output, shared
+        # Applied across shared cls token, before all task heads 
         self.dropout = nn.Dropout(dropout_rate)
 
         self.bug_head = nn.Linear(hidden_size, 2)
@@ -39,10 +58,11 @@ class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
         self.aspect_head = nn.Linear(hidden_size, 6)
         self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
 
-    # Pass through encoder then extract the token representation
+    # Pass through encoder then extract the first-token (cls) representation, shape [batch_size, 768]
     # Apply droupout to it, take scores for each head, return them in a dictionary
     def forward(self, input_ids, attention_mask):
         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        # take sequence position 0 (cls token): [batch_size, seq_len, 768] -> [batch_size, 768]
         output = outputs.last_hidden_state[:, 0, :]
 
         output = self.dropout(output)
diff --git a/src/preprocess.py b/src/preprocess.py
index d80b034..dd8465a 100644
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,10 +1,11 @@
 # preprocess.py
 
+# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
+
 import pandas as pd
 import re
-from langdetect import detect, LangDetectException
 
-def clean_text(text):
+def clean_text(text) -> str:
     """Clean review text by removing URLS, emails, excessive whitespace
 
     Input: 
@@ -19,31 +20,22 @@ def clean_text(text):
     # Convert to lower for uniformity
     text = str(text).lower()
     
-    # Remove URLs using regex
+    # Remove URLs: match "http" or ( | ) "www" followed by one or more (+) non-whitespace chars (\S)
     text = re.sub(r'http\S+|www\S+', '', text)
     
-    # Remove emails
+    # Remove emails: one or more (+) non-whitespace chars (\S) on each side of "@", replaced with ''
     text = re.sub(r'\S+@\S+', '', text)
 
-    # Normalize punctuation
+    # Normalize punctuation: a literal dot (\.), "!" or "?" repeated at least 2 times ({2,}) is replaced with a single one
     text = re.sub(r'\.{2,}', '.', text)
     text = re.sub(r'!{2,}', '!', text)
     text = re.sub(r'\?{2,}', '?', text)
     
-    # Remove excessive whitespace by replacing with single whitespace where there is trailing spaces
+    # Collapse each run of whitespace (\s+) into a single space, then strip leading/trailing whitespace
     text = re.sub(r'\s+', ' ', text).strip()
     
     return text
 
-def detect_language(text):
-    """Detect language of text"""
-    try:
-        if pd.isna(text) or len(str(text).strip()) < 10:
-            return 'unknown'
-        return detect(str(text))
-    except LangDetectException:
-        return 'unknown'
-
 def preprocess_uber_reviews(input_path, output_path):
     """
     preprocess_uber_reviews by loading, cleaning, and filtering the data.
diff --git a/src/sampler.py b/src/sampler.py
index 35eb5f6..7a9b444 100644
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -1,21 +1,16 @@
-#   TODO:   Add verification comparison between ratings
-#   TODO:   Clean up the logging print statements
-
-
 import pandas as pd
 import numpy as np
 
 print(pd.__version__)
 print(np.__version__)
 
-path = "multitag/data/uber_reviews_cleaned.csv"
-sampled_path = "multitag/data/uber_reviews_sampled.csv"
-original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
+path = "data/raw/uber_reviews_cleaned.csv"
+sampled_path = "data/raw/uber_reviews_sampled.csv"
+original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
 class Sampler:
     def __init__(self, data_path, target_samples):
 
         self.data_path = data_path
-        self.target_samples = 5000  # target number of samples
         self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
 
         self.original_data = pd.read_csv(original_path, low_memory=False)
@@ -39,7 +34,7 @@ class Sampler:
         print(f"Original Distribution from {original_path}:")
         print((_origdist*100).round(1),"\n")
 
-        self.data.info()
+        self.data.info(verbose=True)
 
     #   add sampling method here
     #   random sample 5000 entries with stratifiying by rating
@@ -52,43 +47,53 @@ class Sampler:
     2     3.9% (41707)
     Name: proportion, dtype: object
     """
-    """
     
-    Sample size by rating
-    Redundant calculation, kept for clarity
-    Doesn't factor that the distribution changed greatly after preprocessing
+    """
+    IGNORE --- Left in just in case
 
+    Sample randomly
+    Redundant calculation
+    Doesn't factor that the distribution changed greatly after preprocessing
     """
     def get_stratified_sample(self) -> pd.DataFrame:
            stratified_sample = (
             self.data
-            .reset_index(drop=True)
-            .apply(self.x)
-            .sample(n=self.target_samples, random_state=42)
+            .reset_index(drop=True) # remove messy indexes
+            .apply(self.sample_col) # applies to each column
+            .sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
             )
            return stratified_sample
         
     
+    def sample_col(self, column) -> pd.DataFrame:    
+        """
+        IGNORE --- Left in just in case
+
+        Randomly sample, including conflicting math, I guess I was going to stratify
+        """
+        samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
+        samples_per_column = max(samples_per_column,1) # also pointless
+        return column.sample(n=samples_per_column, random_state=42)
 
-    # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
-    def x(self, x):    
-        n = int(len(x) / self.total * self.target_samples)
-        n = max(n,1)
-        return x.sample(n=n, random_state=42)
-    """
-    get_proportional_sample()
 
-    """
-    
     """
     original_distribution_sample()
     The main sampling method for our labelling as it 
-    keeps composition of the original uber dataset
+    keeps composition of the original uber dataset (verified in the notebooks named below),
     which is a fairer comparison, may also work better in general
 
-    inputs:
+    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
+    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
+
+    
+    Target counts per rating are hard-coded from the distributions found in the notebooks.
+
+    For each rating and its target number of samples:
+    rating_data is all rows for that rating as we iterate;
+    if fewer rows exist than the target, a message is printed and all available rows are used;
+    the rows are randomly sampled (random_state=42) and appended to the new dataset.
+
 
-    outputs:
 
     """
     def original_distribution_sample(self):
@@ -102,8 +107,8 @@ class Sampler:
         print("Target Distribution =", original_dist)
         samples = []
         for rating, num_samples in original_dist.items():
-            rating_data = self.data[self.data[self.stratify_column] == rating]
-            if len(rating_data) < num_samples:
+            rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
+            if len(rating_data) < num_samples:                                 # data is a pd.dataframe of the set
                 print("Missing samples available for rating")
                 num_samples = len(rating_data)
             sample = rating_data.sample(n = num_samples,random_state=42)
@@ -127,9 +132,9 @@ class Sampler:
 
     def sample_with_keywords(self):
         #TODO add keywords for feature classification
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")
         print("Keyword influenced / rating stratified set")
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")
 
         bug_keywords = ["crash","freeze", "error",
                         "stop", "doesnt work", "doesn't work","loading",
@@ -204,7 +209,7 @@ class Sampler:
 
 def main():
     
-    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
+    sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)
 
     # Choose sampling strategy
     print(f"\n{'='*50}")
@@ -218,19 +223,19 @@ def main():
     
     if choice == '1':
         sample = sampler.get_stratified_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
         
     elif choice == '2':
         sample = sampler.original_distribution_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
         
     elif choice == '3':
         sample = sampler.sample_with_keywords()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
 
     elif choice == '4':
         sample = sampler.sample_tiny_size()
-        sampler.save_sample(sample,"multitag/data/uber_review_temp.csv")
+        sampler.save_sample(sample,"data/raw/uber_review_temp.csv")
         
 
 
diff --git a/src/train.py b/src/train.py
index e9d6f45..f14b98d 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,6 +1,6 @@
 # train.py
-# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
-import argparse # argparse for later switching to boosted data
+# structure adapted from PyTorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+import argparse
 import os
 from datetime import datetime
 import time
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel
 
 
 # =======================================================================
-#                       Multitask implementation
+#         Training script for MTL and STL training configurations
 # =======================================================================
 
 # NFR5, reproducibility
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)
 
-# class weights, training loop and early stopping
 
 # ------------------- Class weights -------------------
 # Using weights inversely proportional to class frequencies to avoid majority class bias, 
 # prioritize useful bug reports / feature requests
 def compute_weights(df, column, device):
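+    """Computes inverse-frequency class weights for a label column.
+
+    Uses sklearn's 'balanced' mode and returns the weights as a float tensor on `device`.
+    Rare classes receive higher weights, so mistakes on them are penalised more and the model can learn more from fewer examples.
+    """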
     classes = np.unique(df[column])
     weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
     return torch.tensor(weights, dtype=torch.float).to(device)
@@ -63,19 +67,17 @@ def main():
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("Starting training...", flush=True)
     print("Using device:", device)
-    # Remove randomness
+    # Set cuda seeds for reproducibility
     if torch.cuda.is_available():
         print("GPU:", torch.cuda.get_device_name(0))
         torch.cuda.manual_seed_all(SEED)
         torch.cuda.manual_seed(SEED)
     print(f"Using dataset: {args.dataset.upper()}")
+    # Force deterministic for reproducibility at a slight performance cost
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
-    """
-    Data loading:
-
-    """
+    # load data
     train = f"data/processed/{args.dataset}_train.csv"
     val = f"data/processed/{args.dataset}_val.csv"
     os.makedirs("outputs", exist_ok=True)
@@ -117,19 +119,13 @@ def main():
     feature_weights = compute_weights(train_df, 'feature_request', device)
     aspect_weights = compute_weights(train_df, 'aspect', device)    
     aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
-    # Move tensors to cpu and conver to numpy for usage with sklearn classification report
-    # Use detatch() later for predictions
+    
     print("Bug report class weights:", bug_weights.cpu().numpy())
     print("Feature request class weights:", feature_weights.cpu().numpy())
     print("Aspect class weights:", aspect_weights.cpu().numpy())
     print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
     
-    #   for later
-    #   1.0 * bug_loss +
-    #   1.0 * feature_loss +
-    #   0.5 * aspect_loss +
-    #   0.5 * sentiment_loss
-    # FR4-FR7: Task specific loss functions
+    # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
     criterions = {
         'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
         'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
@@ -140,7 +136,7 @@ def main():
     # -------------------- Optimizer and scheduler -------------------
     optimizer = torch.optim.AdamW(
         model.parameters(), 
-        lr=args.lr,        # change
+        lr=args.lr,        
         weight_decay=0.01 
         )
     
@@ -163,7 +159,7 @@ def main():
 
     # Initialize with inf to capture best validation loss easily
     best_vloss = float('inf')
-
+    
     for epoch in range(args.epochs):
         print(f"EPOCH {epoch + 1}/{args.epochs}")
         model.train(True)