diff --git a/src/dataset.py b/src/dataset.py index e5b8ccc..e3963f3 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,5 +1,5 @@ # dataset.py - +# tokenize data using (sentencepiece) XLM-RoBERTa # Takes a row from the csv, tokenizes the review and returns a tensor import torch import pandas as pd @@ -7,6 +7,18 @@ from torch.utils.data import Dataset from transformers import AutoTokenizer class ReviewDataset(Dataset): + """Pytorch Dataset for loading tokenized reviews + + A map-style Dataset is used here instead of an IterableDataset (which is better suited to data streams). + Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of + input tensors and integer labels for all 4 tasks. + + Args: + path (str): Path to the csv file containing the reviews and labels. + tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews. + max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes + """ + def __init__(self, path, tokenizer, max_length=256): self.df = pd.read_csv(path) self.tokenizer = tokenizer @@ -22,13 +34,7 @@ class ReviewDataset(Dataset): # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length] # Both have shape [1, max_length] because of return_tensors='pt' # Squeeze them to [max_length] with .squeeze(0) - encoding = self.tokenizer( - review, - max_length=self.max_length, - padding='max_length', - truncation=True, - return_tensors='pt' - ) + encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') # Returns a dictionary with: # 'input_ids': tensor of shape [max_length] diff --git a/src/model.py b/src/model.py index f74b849..e6cbe70 100644 --- a/src/model.py +++ b/src/model.py @@ -11,7 +11,17 @@ import torch.nn as nn # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task 
head # Each hidden representation is size 768 -class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE +class SingleTaskModel(nn.Module): + """Single-task baseline model for comparison against the MTL approach to review classification + + Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same + just different args + + Args: + task_name: which of the 4 tasks we are training for + num_classes: number of output classes for the task + dropout_rate: dropout probability applied to the cls representation; randomly zeroes elements of it during training for regularisation + """ def __init__(self, task_name, num_classes, dropout_rate=0.2): super().__init__() self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") @@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU logits = self.head(output) return {self.task_name: logits} -class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE - def __init__(self, dropout_rate=0.2): # Try other p values +class Model(nn.Module): + """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads + + Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through + shared dropout then into four linear classification heads. 
Shared training optimises all tasks simultaneously, + allowing the encoder to learn from the shared representations / generalisations + + Args: + dropout_rate: dropout probability, applied to prevent co-adaptation of neurons across heads; defaults to 0.2 + """ + def __init__(self, dropout_rate=0.2): super().__init__() self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") hidden_size = self.encoder.config.hidden_size - # Applied across whole output, shared + # Applied across shared cls token, before all task heads self.dropout = nn.Dropout(dropout_rate) self.bug_head = nn.Linear(hidden_size, 2) @@ -39,10 +58,11 @@ class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE self.aspect_head = nn.Linear(hidden_size, 6) self.aspect_sentiment_head = nn.Linear(hidden_size, 3) - # Pass through encoder then extract the token representation + # Pass through encoder then extract the cls token representation, shape [batch_size, 768] # Apply droupout to it, take scores for each head, return them in a dictionary def forward(self, input_ids, attention_mask): outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) + # sequence position 0 (cls token): [batch_size, seq_len, 768] -> [batch_size, 768] output = outputs.last_hidden_state[:, 0, :] output = self.dropout(output) diff --git a/src/preprocess.py b/src/preprocess.py index d80b034..dd8465a 100644 --- a/src/preprocess.py +++ b/src/preprocess.py @@ -1,10 +1,11 @@ # preprocess.py +# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually + import pandas as pd import re -from langdetect import detect, LangDetectException -def clean_text(text): +def clean_text(text) -> str: """Clean review text by removing URLS, emails, excessive whitespace Input: @@ -19,31 +20,22 @@ def clean_text(text): # Convert to lower for uniformity text = str(text).lower() - # Remove URLs using regex + # Remove URLs using regex, match http followed by non whitespace chars (\S) one or more (+) times. 
same with either ( | ) www text = re.sub(r'http\S+|www\S+', '', text) - # Remove emails + # Remove emails: one or more (+) non whitespace chars (\S) before "@" and one or more after it, replaced with '' in each text (review) text = re.sub(r'\S+@\S+', '', text) - # Normalize punctuation + # Normalize punctuation: a literal . (\.), ! or ? (\?) repeated at least 2 times {2,} is replaced with a single one text = re.sub(r'\.{2,}', '.', text) text = re.sub(r'!{2,}', '!', text) text = re.sub(r'\?{2,}', '?', text) - # Remove excessive whitespace by replacing with single whitespace where there is trailing spaces + # Collapse runs of whitespace (\s+) into a single space, then strip leading/trailing whitespace text = re.sub(r'\s+', ' ', text).strip() return text -def detect_language(text): - """Detect language of text""" - try: - if pd.isna(text) or len(str(text).strip()) < 10: - return 'unknown' - return detect(str(text)) - except LangDetectException: - return 'unknown' - def preprocess_uber_reviews(input_path, output_path): """ preprocess_uber_reviews by loading, cleaning, and filtering the data. 
diff --git a/src/sampler.py b/src/sampler.py index 35eb5f6..7a9b444 100644 --- a/src/sampler.py +++ b/src/sampler.py @@ -1,21 +1,16 @@ -# TODO: Add verification comparison between ratings -# TODO: Clean up the logging print statements - - import pandas as pd import numpy as np print(pd.__version__) print(np.__version__) -path = "multitag/data/uber_reviews_cleaned.csv" -sampled_path = "multitag/data/uber_reviews_sampled.csv" -original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison +path = "data/raw/uber_reviews_cleaned.csv" +sampled_path = "data/raw/uber_reviews_sampled.csv" +original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison class Sampler: def __init__(self, data_path, target_samples): self.data_path = data_path - self.target_samples = 5000 # target number of samples self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers) self.original_data = pd.read_csv(original_path, low_memory=False) @@ -39,7 +34,7 @@ class Sampler: print(f"Original Distribution from {original_path}:") print((_origdist*100).round(1),"\n") - self.data.info() + self.data.info(verbose=True) # add sampling method here # random sample 5000 entries with stratifiying by rating @@ -52,43 +47,53 @@ class Sampler: 2 3.9% (41707) Name: proportion, dtype: object """ - """ - Sample size by rating - Redundant calculation, kept for clarity - Doesn't factor that the distribution changed greatly after preprocessing + """ + IGNORE --- Left in just in case + Sample randomly + Redundant calculation + Doesn't factor that the distribution changed greatly after preprocessing """ def get_stratified_sample(self) -> pd.DataFrame: stratified_sample = ( self.data - .reset_index(drop=True) - .apply(self.x) - .sample(n=self.target_samples, random_state=42) + .reset_index(drop=True) # remove messy indexes + .apply(self.sample_col) # applies to each column + 
.sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file ) return stratified_sample + def sample_col(self, column) -> pd.DataFrame: + """ + IGNORE --- Left in just in case + + Randomly sample, including conflicting math, I guess I was going to stratify + """ + samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000 + samples_per_column = max(samples_per_column,1) # also pointless + return column.sample(n=samples_per_column, random_state=42) - # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX= - def x(self, x): - n = int(len(x) / self.total * self.target_samples) - n = max(n,1) - return x.sample(n=n, random_state=42) - """ - get_proportional_sample() - """ - """ original_distribution_sample() The main sampling method for our labelling as it - keeps composition of the original uber dataset + keeps composition of the original uber dataset, verified in which is a fairer comparison, may also work better in general - inputs: + verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb + and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb + + + manually coded distributions taken from notebooks + + for ratings and actual number of samples + rating data is the whole data for a rating as we iterate + has error handling if totals doesn't match the required amount of samples per the orig distrib + randomise the indexes (samples) and appends to the new dataset + - outputs: """ def original_distribution_sample(self): @@ -102,8 +107,8 @@ class Sampler: print("Target Distribution =", original_dist) samples = [] for rating, num_samples in original_dist.items(): - rating_data = self.data[self.data[self.stratify_column] == rating] - if len(rating_data) < num_samples: + rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating" + if len(rating_data) < num_samples: # data is a 
pd.dataframe of the set print("Missing samples available for rating") num_samples = len(rating_data) sample = rating_data.sample(n = num_samples,random_state=42) @@ -127,9 +132,9 @@ class Sampler: def sample_with_keywords(self): #TODO add keywords for feature classification - print(f"\n{"="*50}") + print(f"\n{'='*50}") print("Keyword influenced / rating stratified set") - print(f"\n{"="*50}") + print(f"\n{'='*50}") bug_keywords = ["crash","freeze", "error", "stop", "doesnt work", "doesn't work","loading", @@ -204,7 +209,7 @@ class Sampler: def main(): - sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000) + sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000) # Choose sampling strategy print(f"\n{'='*50}") @@ -218,19 +223,19 @@ def main(): if choice == '1': sample = sampler.get_stratified_sample() - sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv") elif choice == '2': sample = sampler.original_distribution_sample() - sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv") elif choice == '3': sample = sampler.sample_with_keywords() - sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") + sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv") elif choice == '4': sample = sampler.sample_tiny_size() - sampler.save_sample(sample,"multitag/data/uber_review_temp.csv") + sampler.save_sample(sample,"data/raw/uber_review_temp.csv") diff --git a/src/train.py b/src/train.py index e9d6f45..f14b98d 100644 --- a/src/train.py +++ b/src/train.py @@ -1,6 +1,6 @@ # train.py -# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html -import argparse # argparse for later switching to boosted data +# structure adapted from Pytorch introductory tutorials 
https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html +import argparse import os from datetime import datetime import time @@ -25,7 +25,7 @@ from model import Model, SingleTaskModel # ======================================================================= -# Multitask implementation +# Training script for MTL and STL training configurations # ======================================================================= # NFR5, reproducibility @@ -34,12 +34,16 @@ torch.manual_seed(SEED) np.random.seed(SEED) random.seed(SEED) -# class weights, training loop and early stopping # ------------------- Class weights ------------------- # Using weights inversely proportional to class frequencies to avoid majority class bias, # prioritize useful bug reports / feature requests def compute_weights(df, column, device): + """Computes inverse frequency class weights for a label column + + Uses sklearn's 'balanced' mode + Rare classes receive higher weights so their errors are penalised more and the model learns more from fewer examples + """ classes = np.unique(df[column]) weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column]) return torch.tensor(weights, dtype=torch.float).to(device) @@ -63,19 +67,17 @@ def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Starting training...", flush=True) print("Using device:", device) - # Remove randomness + # Set cuda seeds for reproducibility if torch.cuda.is_available(): print("GPU:", torch.cuda.get_device_name(0)) torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed(SEED) print(f"Using dataset: {args.dataset.upper()}") + # Force deterministic cuDNN behaviour for reproducibility at a slight performance cost torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - """ - Data loading: - - """ + # load data train = f"data/processed/{args.dataset}_train.csv" val = f"data/processed/{args.dataset}_val.csv" os.makedirs("outputs", exist_ok=True) @@ -117,19 +119,13 @@ def main(): feature_weights = 
compute_weights(train_df, 'feature_request', device) aspect_weights = compute_weights(train_df, 'aspect', device) aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device) - # Move tensors to cpu and conver to numpy for usage with sklearn classification report - # Use detatch() later for predictions + print("Bug report class weights:", bug_weights.cpu().numpy()) print("Feature request class weights:", feature_weights.cpu().numpy()) print("Aspect class weights:", aspect_weights.cpu().numpy()) print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy()) - # for later - # 1.0 * bug_loss + - # 1.0 * feature_loss + - # 0.5 * aspect_loss + - # 0.5 * sentiment_loss - # FR4-FR7: Task specific loss functions + # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity criterions = { 'bug_report': nn.CrossEntropyLoss(weight=bug_weights), 'feature_request': nn.CrossEntropyLoss(weight=feature_weights), @@ -140,7 +136,7 @@ def main(): # -------------------- Optimizer and scheduler ------------------- optimizer = torch.optim.AdamW( model.parameters(), - lr=args.lr, # change + lr=args.lr, weight_decay=0.01 ) @@ -163,7 +159,7 @@ def main(): # Initialize with inf to capture best validation loss easily best_vloss = float('inf') - + for epoch in range(args.epochs): print(f"EPOCH {epoch + 1}/{args.epochs}") model.train(True)