Added some comments and readability

2026-03-24 18:11:31 +00:00
parent afe61eaaa2
commit 753723694b
5 changed files with 103 additions and 84 deletions
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,5 +1,5 @@
 # dataset.py
-
+# tokenize data using (sentencepiece) XLM-RoBERTa
 # Takes a row from the csv, tokenizes the review and returns a tensor
 import torch
 import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
 from transformers import AutoTokenizer
 class ReviewDataset(Dataset):
    """Pytorch Dataset for loading tokenized reviews
    This is a map-style Dataset, rather than an IterableDataset (which is better suited to data streams).
    Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
    input tensors and integer labels for all 4 tasks.
      Args:
        path (str): Path to the csv file containing the reviews and labels.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
        max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
    """
    def __init__(self, path, tokenizer, max_length=256):
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
        # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
        # Both have shape [1, max_length] because of return_tensors='pt'
        # Squeeze them to [max_length] with .squeeze(0)
-        encoding = self.tokenizer(
+        encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
                review,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
        # Returns a dictionary with:
        #   'input_ids': tensor of shape [max_length]
--- a/src/model.py
+++ b/src/model.py
@@ -11,7 +11,17 @@ import torch.nn as nn
 # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
 # Each hidden representation is size 768
-class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
+class SingleTaskModel(nn.Module):
    """Single task model to compare MTL approach to review classification
    Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
    just different args
        Args:
            task_name: which of the 4 tasks are we training for
            num_classes: number of output classes for the task
            dropout_rate: dropout probability applied to the cls representation; randomly zeroes elements of it during training to reduce overfitting
        """
    def __init__(self, task_name, num_classes, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): #   TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
        logits = self.head(output)
        return {self.task_name: logits}
-class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
+class Model(nn.Module): 
-    def __init__(self, dropout_rate=0.2): # Try other p values
+    """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
    Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through 
    shared dropout then into four linear classification heads. Shared training optimises all tasks simultaneously,
    allowing the encoder to learn from the shared representations / generalisations
        Args:
            dropout_rate: dropout probability, applied to prevent co-adaptation of neurons across heads; 0.2 is the default here
    """
    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
        hidden_size = self.encoder.config.hidden_size
-        # Applied across whole output, shared
+        # Applied across shared cls token, before all task heads 
        self.dropout = nn.Dropout(dropout_rate)
        self.bug_head = nn.Linear(hidden_size, 2)
@@ -39,10 +58,11 @@ class Model(nn.Module): #   MULTITASK MODEL ARCHITECTURE
        self.aspect_head = nn.Linear(hidden_size, 6)
        self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
-    # Pass through encoder then extract the token representation
+    # Pass through encoder, then extract the cls token representation, shape [batch_size, 768]
    # Apply dropout to it, take scores for each head, return them in a dictionary
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # take sequence index 0 (the cls token), giving a tensor of shape [batch_size, 768]
        output = outputs.last_hidden_state[:, 0, :]
        output = self.dropout(output)
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -1,10 +1,11 @@
 # preprocess.py
 # langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
 import pandas as pd
 import re
 from langdetect import detect, LangDetectException
-def clean_text(text):
+def clean_text(text) -> str:
    """Clean review text by removing URLS, emails, excessive whitespace
    Input: 
@@ -19,31 +20,22 @@ def clean_text(text):
    # Convert to lower for uniformity
    text = str(text).lower()
-    # Remove URLs using regex
+    # Remove URLs using regex: match "http" followed by one or more (+) non-whitespace chars (\S), or ( | ) the same for "www"
    text = re.sub(r'http\S+|www\S+', '', text)
-    # Remove emails
+    # Remove emails: one or more (+) non-whitespace chars (\S) on each side of "@", replaced with '' in each text (review)
    text = re.sub(r'\S+@\S+', '', text)
-    # Normalize punctuation
+    # Normalize punctuation: a literal dot (\.), "!" or "?" repeated at least 2 times {2,} is replaced with a single one
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'!{2,}', '!', text)
    text = re.sub(r'\?{2,}', '?', text)
-    # Remove excessive whitespace by replacing with single whitespace where there is trailing spaces
+    # Collapse each run of whitespace (\s+) into a single space, then strip leading and trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
 def detect_language(text):
    """Detect language of text"""
    try:
        if pd.isna(text) or len(str(text).strip()) < 10:
            return 'unknown'
        return detect(str(text))
    except LangDetectException:
        return 'unknown'
 def preprocess_uber_reviews(input_path, output_path):
    """
    preprocess_uber_reviews by loading, cleaning, and filtering the data.
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -1,21 +1,16 @@
 #   TODO:   Add verification comparison between ratings
 #   TODO:   Clean up the logging print statements
 import pandas as pd
 import numpy as np
 print(pd.__version__)
 print(np.__version__)
-path = "multitag/data/uber_reviews_cleaned.csv"
+path = "data/raw/uber_reviews_cleaned.csv"
-sampled_path = "multitag/data/uber_reviews_sampled.csv"
+sampled_path = "data/raw/uber_reviews_sampled.csv"
-original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
+original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
 class Sampler:
    def __init__(self, data_path, target_samples):
        self.data_path = data_path
        self.target_samples = 5000  # target number of samples
        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
        self.original_data = pd.read_csv(original_path, low_memory=False)
@@ -39,7 +34,7 @@ class Sampler:
        print(f"Original Distribution from {original_path}:")
        print((_origdist*100).round(1),"\n")
-        self.data.info()
+        self.data.info(verbose=True)
    #   add sampling method here
    #   random sample 5000 entries with stratifying by rating
@@ -52,43 +47,53 @@ class Sampler:
    2     3.9% (41707)
    Name: proportion, dtype: object
    """
    """
    IGNORE --- Left in just in case
-    Sample size by rating
+    Sample randomly
-    Redundant calculation, kept for clarity
+    Redundant calculation
    Doesn't factor that the distribution changed greatly after preprocessing
    """
    def get_stratified_sample(self) -> pd.DataFrame:
           stratified_sample = (
            self.data
-            .reset_index(drop=True)
+            .reset_index(drop=True) # remove messy indexes
-            .apply(self.x)
+            .apply(self.sample_col) # applies to each column
-            .sample(n=self.target_samples, random_state=42)
+            .sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
            )
           return stratified_sample
-
+    def sample_col(self, column) -> pd.DataFrame:    
    # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
    def x(self, x):    
        n = int(len(x) / self.total * self.target_samples)
        n = max(n,1)
        return x.sample(n=n, random_state=42)
        """
-    get_proportional_sample()
+        IGNORE --- Left in just in case
        Randomly sample, including conflicting math, I guess I was going to stratify
        """
        samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
        samples_per_column = max(samples_per_column,1) # also pointless
        return column.sample(n=samples_per_column, random_state=42)
    """
    original_distribution_sample()
    The main sampling method for our labelling as it 
-    keeps composition of the original uber dataset
+    keeps composition of the original uber dataset, verified in 
    which is a fairer comparison, may also work better in general
-    inputs:
+    verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
    and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
    manually coded distributions taken from notebooks
    for ratings and actual number of samples 
    rating data is the whole data for a rating as we iterate
    has error handling if totals doesn't match the required amount of samples per the orig distrib
    randomise the indexes (samples) and appends to the new dataset
    outputs:
    """
    def original_distribution_sample(self):
@@ -102,8 +107,8 @@ class Sampler:
        print("Target Distribution =", original_dist)
        samples = []
        for rating, num_samples in original_dist.items():
-            rating_data = self.data[self.data[self.stratify_column] == rating]
+            rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
-            if len(rating_data) < num_samples:
+            if len(rating_data) < num_samples:                                 # data is a pd.dataframe of the set
                print("Missing samples available for rating")
                num_samples = len(rating_data)
            sample = rating_data.sample(n = num_samples,random_state=42)
@@ -127,9 +132,9 @@ class Sampler:
    def sample_with_keywords(self):
        #TODO add keywords for feature classification
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")
        print("Keyword influenced / rating stratified set")
-        print(f"\n{"="*50}")
+        print(f"\n{'='*50}")
        bug_keywords = ["crash","freeze", "error",
                        "stop", "doesnt work", "doesn't work","loading",
@@ -204,7 +209,7 @@ class Sampler:
 def main():
-    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
+    sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)
    # Choose sampling strategy
    print(f"\n{'='*50}")
@@ -218,19 +223,19 @@ def main():
    if choice == '1':
        sample = sampler.get_stratified_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
    elif choice == '2':
        sample = sampler.original_distribution_sample()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
    elif choice == '3':
        sample = sampler.sample_with_keywords()
-        sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
+        sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
    elif choice == '4':
        sample = sampler.sample_tiny_size()
-        sampler.save_sample(sample,"multitag/data/uber_review_temp.csv")
+        sampler.save_sample(sample,"data/raw/uber_review_temp.csv")
--- a/src/train.py
+++ b/src/train.py
@@ -1,6 +1,6 @@
 # train.py
-# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
+# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
-import argparse # argparse for later switching to boosted data
+import argparse
 import os
 from datetime import datetime
 import time
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel
 # =======================================================================
-#                       Multitask implementation
+#         Training script for MTL and STL training configurations
 # =======================================================================
 # NFR5, reproducibility
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
 np.random.seed(SEED)
 random.seed(SEED)
 # class weights, training loop and early stopping
 # ------------------- Class weights -------------------
 # Using weights inversely proportional to class frequencies to avoid majority class bias, 
 # prioritize useful bug reports / feature requests
 def compute_weights(df, column, device):
    """Computes inverse frequency class weights for a label column
    Uses sklearn's 'balanced' mode
    Rare classes receive higher weights, so errors on them are penalised more and the model learns more from fewer examples
    """
    classes = np.unique(df[column])
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
    return torch.tensor(weights, dtype=torch.float).to(device)
@@ -63,19 +67,17 @@ def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Starting training...", flush=True)
    print("Using device:", device)
-    # Remove randomness
+    # Set cuda seeds for reproducibility
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(SEED)
        torch.cuda.manual_seed(SEED)
    print(f"Using dataset: {args.dataset.upper()}")
    # Force deterministic for reproducibility at a slight performance cost
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
-    """
+    # load data
    Data loading:
    """
    train = f"data/processed/{args.dataset}_train.csv"
    val = f"data/processed/{args.dataset}_val.csv"
    os.makedirs("outputs", exist_ok=True)
@@ -117,19 +119,13 @@ def main():
    feature_weights = compute_weights(train_df, 'feature_request', device)
    aspect_weights = compute_weights(train_df, 'aspect', device)    
    aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
-    # Move tensors to cpu and conver to numpy for usage with sklearn classification report
+    
    # Use detatch() later for predictions
    print("Bug report class weights:", bug_weights.cpu().numpy())
    print("Feature request class weights:", feature_weights.cpu().numpy())
    print("Aspect class weights:", aspect_weights.cpu().numpy())
    print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
-    #   for later
+    # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
    #   1.0 * bug_loss +
    #   1.0 * feature_loss +
    #   0.5 * aspect_loss +
    #   0.5 * sentiment_loss
    # FR4-FR7: Task specific loss functions
    criterions = {
        'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
        'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
@@ -140,7 +136,7 @@ def main():
    # -------------------- Optimizer and scheduler -------------------
    optimizer = torch.optim.AdamW(
        model.parameters(), 
-        lr=args.lr,        # change
+        lr=args.lr,        
        weight_decay=0.01 
        )