Added some comments and readability

2026-03-24 18:11:31 +00:00
parent afe61eaaa2
commit 753723694b
5 changed files with 103 additions and 84 deletions
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,5 +1,5 @@
 # dataset.py
-
+# tokenize data using (sentencepiece) XLM-RoBERTa
 # Takes a row from the csv, tokenizes the review and returns a tensor
 import torch
 import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
 from transformers import AutoTokenizer

 class ReviewDataset(Dataset):
+    """Pytorch Dataset for loading tokenized reviews
+
+    Uses the map-style Dataset base class rather than IterableDataset (which is better suited to data streams).
+    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
+    input tensors and integer labels for all 4 tasks.
+     
+      Args:
+        path (str): Path to the csv file containing the reviews and labels.
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
+        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256 (128 would have dropped about half of the minority classes).
+    """
+
    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
        # Both have shape [1, max_length] because of return_tensors='pt'
        # Squeeze them to [max_length] with .squeeze(0)
-        encoding = self.tokenizer(
-                review,
-                max_length=self.max_length,
-                padding='max_length',
-                truncation=True,
-                return_tensors='pt'
-            )
+        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        
        # Returns a dictionary with:
        #   'input_ids': tensor of shape [max_length]
--- a/src/model.py
+++ b/src/model.py
@@ -11,7 +11,17 @@ import torch.nn as nn
 # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
 # Each hidden representation is size 768

-class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
+class SingleTaskModel(nn.Module):
+    """Single task model to compare MTL approach to review classification
+    
+    Same XLM-RoBERTa encoder but with only one head; returns the same dictionary format so the training loop is the same,
+    just with different args
+    
+        Args:
+            task_name: which of the 4 tasks this model is trained for
+            num_classes: number of output classes for the task
+            dropout_rate: dropout probability applied to the cls representation; randomly zeroes elements of it (not whole tokens) to regularise training
+        """
    def __init__(self, task_name, num_classes, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
        logits = self.head(output)
        return {self.task_name: logits}

-class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
-    def __init__(self, dropout_rate=0.2): # Try other p values
+class Model(nn.Module): 
+    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
+
+    Architecture: XLM-RoBERTa base (12 layers, 768 hidden size); the cls token representation is passed through
+    shared dropout and then into four linear classification heads. Joint training optimises all tasks simultaneously,
+    allowing the encoder to learn from the shared representations / generalisations
+
+        Args:
+            dropout_rate: dropout probability applied to prevent co-adaptation of neurons across heads; 0.2 is a standard default
+    """
+    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")

        hidden_size = self.encoder.config.hidden_size

-        # Applied across whole output, shared
+        # Applied to the shared cls token representation, before all task heads
        self.dropout = nn.Dropout(dropout_rate)

        self.bug_head = nn.Linear(hidden_size, 2)
@@ -39,10 +58,11 @@ class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
        self.aspect_head = nn.Linear(hidden_size, 6)
        self.aspect_sentiment_head = nn.Linear(hidden_size, 3)

-    # Pass through encoder then extract the token representation
+    # Pass through encoder then extract the cls token representation, shape [batch_size, 768]
    # Apply droupout to it, take scores for each head, return them in a dictionary
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        # take sequence position 0 (the cls token) of last_hidden_state -> [batch_size, 768]
        output = outputs.last_hidden_state[:, 0, :]

        output = self.dropout(output)
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,10 +1,11 @@
 # preprocess.py

+# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
+
 import pandas as pd
 import re
-from langdetect import detect, LangDetectException

-def clean_text(text):
+def clean_text(text) -> str:
    """Clean review text by removing URLS, emails, excessive whitespace

    Input: 
@@ -19,31 +20,22 @@ def clean_text(text):
    # Convert to lower for uniformity
    text = str(text).lower()
    
-    # Remove URLs using regex
+    # Remove URLs using regex: match "http" followed by one or more (+) non-whitespace chars (\S), or (|) "www" followed by the same
    text = re.sub(r'http\S+|www\S+', '', text)
    
-    # Remove emails
+    # Remove emails: one or more (+) non-whitespace chars (\S) on each side of "@" are replaced with ''
    text = re.sub(r'\S+@\S+', '', text)

-    # Normalize punctuation
+    # Normalize punctuation: collapse runs of 2 or more ({2,}) literal '.', '!' or '?' into a single one (\. and \? are escaped so they match literally)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'!{2,}', '!', text)
    text = re.sub(r'\?{2,}', '?', text)
    
-    # Remove excessive whitespace by replacing with single whitespace where there is trailing spaces
+    # Collapse runs of whitespace (\s+) into a single space, then strip leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

-def detect_language(text):
-    """Detect language of text"""
-    try:
-        if pd.isna(text) or len(str(text).strip()) < 10:
-            return 'unknown'
-        return detect(str(text))
-    except LangDetectException:
-        return 'unknown'
-
 def preprocess_uber_reviews(input_path, output_path):
    """
    preprocess_uber_reviews by loading, cleaning, and filtering the data.
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -1,21 +1,16 @@
-#   TODO:   Add verification comparison between ratings
-#   TODO:   Clean up the logging print statements
-
-
 import pandas as pd
 import numpy as np

 print(pd.__version__)
 print(np.__version__)

-path = "multitag/data/uber_reviews_cleaned.csv"
-sampled_path = "multitag/data/uber_reviews_sampled.csv"
-original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
+path = "data/raw/uber_reviews_cleaned.csv"
+sampled_path = "data/raw/uber_reviews_sampled.csv"
+original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
 class Sampler:
    def __init__(self, data_path, target_samples):

        self.data_path = data_path
-        self.target_samples = 5000  # target number of samples
        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)

        self.original_data = pd.read_csv(original_path, low_memory=False)
@@ -39,7 +34,7 @@ class Sampler:
        print(f"Original Distribution from {original_path}:")
        print((_origdist*100).round(1),"\n")

-        self.data.info()
+        self.data.info(verbose=True)

    #   add sampling method here
    #   random sample 5000 entries with stratifiying by rating
@@ -52,43 +47,53 @@ class Sampler:
    2     3.9% (41707)
    Name: proportion, dtype: object
    """
+    
    """
+    IGNORE --- Left in just in case

-    Sample size by rating
-    Redundant calculation, kept for clarity
+    Sample randomly
+    Redundant calculation
    Doesn't factor that the distribution changed greatly after preprocessing
-
    """
    def get_stratified_sample(self) -> pd.DataFrame:
           stratified_sample = (
            self.data
-            .reset_index(drop=True)
-            .apply(self.x)
-            .sample(n=self.target_samples, random_state=42)
+            .reset_index(drop=True) # remove messy indexes
+            .apply(self.sample_col) # applies to each column
+            .sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
            )
           return stratified_sample
        
    
-
-    # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
-    def x(self, x):    
-        n = int(len(x) / self.total * self.target_samples)
-        n = max(n,1)
-        return x.sample(n=n, random_state=42)
+    def sample_col(self, column) -> pd.DataFrame:    
        """
-    get_proportional_sample()
+        IGNORE --- Left in just in case

+        Randomly sample, including conflicting math, I guess I was going to stratify
        """
+        samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
+        samples_per_column = max(samples_per_column,1) # also pointless
+        return column.sample(n=samples_per_column, random_state=42)
+

    """
    original_distribution_sample()
    The main sampling method for our labelling as it 
-    keeps composition of the original uber dataset
+    keeps composition of the original uber dataset (verified in the notebooks listed below),
    which is a fairer comparison, may also work better in general

-    inputs:
+    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
+    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
+
+    
+    Distributions are hard-coded, taken from the notebooks.
+
+    For each rating and its target number of samples:
+    rating_data is all of the rows for that rating as we iterate;
+    if fewer rows are available than the original distribution requires, all available rows are used instead;
+    the rows are randomly sampled and appended to the new dataset
+

-    outputs:

    """
    def original_distribution_sample(self):
@@ -102,8 +107,8 @@ class Sampler:
        print("Target Distribution =", original_dist)
        samples = []
        for rating, num_samples in original_dist.items():
-            rating_data = self.data[self.data[self.stratify_column] == rating]
-            if len(rating_data) < num_samples:
+            rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
+            if len(rating_data) < num_samples:                                 # data is a pd.dataframe of the set
                print("Missing samples available for rating")
                num_samples = len(rating_data)
            sample = rating_data.sample(n = num_samples,random_state=42)
@@ -127,9 +132,9 @@ class Sampler:

    def sample_with_keywords(self):
        #TODO add keywords for feature classification
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")
        print("Keyword influenced / rating stratified set")
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")

        bug_keywords = ["crash","freeze", "error",
                        "stop", "doesnt work", "doesn't work","loading",
@@ -204,7 +209,7 @@ class Sampler:

 def main():
    
-    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
+    sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)

    # Choose sampling strategy
    print(f"\n{'='*50}")
@@ -218,19 +223,19 @@ def main():
    
    if choice == '1':
        sample = sampler.get_stratified_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
        
    elif choice == '2':
        sample = sampler.original_distribution_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
        
    elif choice == '3':
        sample = sampler.sample_with_keywords()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")

    elif choice == '4':
        sample = sampler.sample_tiny_size()
-        sampler.save_sample(sample,"multitag/data/uber_review_temp.csv")
+        sampler.save_sample(sample,"data/raw/uber_review_temp.csv")
        


--- a/src/train.py
+++ b/src/train.py
@@ -1,6 +1,6 @@
 # train.py
-# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
-import argparse # argparse for later switching to boosted data
+# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+import argparse
 import os
 from datetime import datetime
 import time
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel


 # =======================================================================
-#                       Multitask implementation
+#         Training script for MTL and STL training configurations
 # =======================================================================

 # NFR5, reproducibility
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)

-# class weights, training loop and early stopping

 # ------------------- Class weights -------------------
 # Using weights inversely proportional to class frequencies to avoid majority class bias, 
 # prioritize useful bug reports / feature requests
 def compute_weights(df, column, device):
+    """Computes inverse frequency class weights for a label column
+    
+    Uses sklearn's 'balanced' mode
+    Rare classes receive higher weights, so errors on them are penalised more and the model can learn more from fewer examples
+    """
    classes = np.unique(df[column])
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
    return torch.tensor(weights, dtype=torch.float).to(device)
@@ -63,19 +67,17 @@ def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Starting training...", flush=True)
    print("Using device:", device)
-    # Remove randomness
+    # Set cuda seeds for reproducibility
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(SEED)
        torch.cuda.manual_seed(SEED)
    print(f"Using dataset: {args.dataset.upper()}")
+    # Force deterministic cuDNN behaviour (and disable benchmark autotuning) for reproducibility at a slight performance cost
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

-    """
-    Data loading:
-
-    """
+    # load data
    train = f"data/processed/{args.dataset}_train.csv"
    val = f"data/processed/{args.dataset}_val.csv"
    os.makedirs("outputs", exist_ok=True)
@@ -117,19 +119,13 @@ def main():
    feature_weights = compute_weights(train_df, 'feature_request', device)
    aspect_weights = compute_weights(train_df, 'aspect', device)    
    aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
-    # Move tensors to cpu and conver to numpy for usage with sklearn classification report
-    # Use detatch() later for predictions
+    
    print("Bug report class weights:", bug_weights.cpu().numpy())
    print("Feature request class weights:", feature_weights.cpu().numpy())
    print("Aspect class weights:", aspect_weights.cpu().numpy())
    print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
    
-    #   for later
-    #   1.0 * bug_loss +
-    #   1.0 * feature_loss +
-    #   0.5 * aspect_loss +
-    #   0.5 * sentiment_loss
-    # FR4-FR7: Task specific loss functions
+    # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
    criterions = {
        'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
        'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
@@ -140,7 +136,7 @@ def main():
    # -------------------- Optimizer and scheduler -------------------
    optimizer = torch.optim.AdamW(
        model.parameters(), 
-        lr=args.lr,        # change
+        lr=args.lr,        
        weight_decay=0.01 
        )