diff --git a/README.md b/README.md index 9349957..b787b26 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,11 @@ --- -## Project Overview +# README not finished -RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads. +## Overview + +RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison. | Task | Output | Classes | |------|--------|---------| @@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta ## Dataset - **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews) -- **Original size**: 1,069,616 reviews -- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews) -- **Annotation target**: 5,000 manually labelled reviews +- **Original size**: ~1.07M Reviews +- **After Preprocessing**: ~495K Reviews +- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword boosted sample + +## Preprocessing Steps + +- Removed URLS and emails +- Normalised text and punctuation +- Removed duplicate reviews +- Filtered reviews less than 5 words + +- Output sets + - Original: matches the original distribution of the raw dataset + - Boosted: oversamples bug reports and feature requests using keyword heuristics + +## Model + +- Encoder: XLM-RoBERTa (large multilingual transformer model) +- Architecture: + - Shared encoder + - Task-specific classification heads +- Training setups: + - MTL (Multitask learning) + - STL (Single-task learning) + +Class weights are applied to reduce imbalance effects. 
## Repository Structure -``` -6013/ - README.md - .gitignore - data/ - uber_reviews.csv # Raw dataset - uber_reviews_cleaned.csv # Preprocessed reviews - uber_reviews_sampled.csv # Stratified sample for annotation - uber_reviews_tagged.csv # Annotated reviews (in progress) - notebooks/ - preprocessing_uber.ipynb # Preprocessing analysis - uber_cleaned.ipynb # Cleaned data verification - src/ - preprocess.py # Text cleaning and filtering pipeline - sampler.py # Stratified sampling strategies - multitag.py # GUI annotation tool - train.py # Model training (in progress) - infer.py # Inference pipeline (in progress) - outputs/ - figures/ -``` +. +├── data +│ └── processed +│ ├── boosted_test.csv +│ ├── boosted_train.csv +│ ├── boosted_val.csv +│ ├── original_test.csv +│ ├── original_train.csv +│ ├── original_val.csv +│ └── review.csv +├── notebooks/ +│ +├── outputs +│ └── figures/ +├── README.md +├── architecture.png +└── src + ├── dataset.py + ├── evaluate.py + ├── infer.py + ├── model.py + ├── multitag.py + ├── preprocess.py + ├── sampler.py + └── train.py -## Current Progress +## Results -- Manual annotation of 5,000 reviews -- BERT baseline implementation -- Multi-task model architecture -- Training and evaluation -- Comparative analysis (MTL vs single-task) -- Final report and presentation +Evaluation includes Precision, Recall, Macro F1, Confusion matrices and confidence analysis. + +Results and summaries are found in outputs/*.json and outputs/figures/ ## Installation ``` -# Clone repository -... # Create conda environment -... 
+conda create -n reclass python=3.11 +conda activate reclass +``` + +``` # Install dependencies -...requirements.txt +conda install --file requirements.txt ``` ## Usage -## References -## Licenses + +#### Train Model + +``` +python src/train.py --mode mtl --dataset original +``` + +#### Evaluate Model + +``` +python src/evaluate.py --mode mtl --dataset original --model_path .pt +``` + +#### Run Inference + +``` +python src/infer.py --mode mtl --model_path .pt --dataset review +``` + +## Notes + +- The same tokenizer is used across training, evaluation and inference to ensure consistency +- Sampling and preprocessing choices are documented further in src files and dissertation --- diff --git a/src/dataset.py b/src/dataset.py index 3530a1a..580726c 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,22 +1,17 @@ # dataset.py -# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer -# Takes a row from the csv, tokenizes the review and returns a tensor +# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model import torch import pandas as pd from torch.utils.data import Dataset from transformers import AutoTokenizer class ReviewDataset(Dataset): - """Pytorch Dataset for loading tokenized reviews + """ + Dataset for tokenized reviews with labels for all 4 tasks. Dataset is for map style datasets like here, instead of using IteratableDataset (better for data streams). - Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary with of + Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of input tensors and integer labels for all 4 tasks. - - Args: - path (str): Path to the csv file containing the reviews and labels. - tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews. - max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 
128 would have dropped about half of minority classes """ def __init__(self, path, tokenizer, max_length=256): @@ -30,25 +25,14 @@ class ReviewDataset(Dataset): def __getitem__(self, idx): review = self.df.iloc[idx]['review'] - # encoding['input_ids'] 1D tensor of token ids, shape [max_length] - # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length] - # Both have shape [1, max_length] because of return_tensors='pt' - # Squeeze them to [max_length] with .squeeze(0) + # Tokenize with padding and truncation to max_length, returning PyTorch tensors encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') - # Returns a dictionary with: - # 'input_ids': tensor of shape [max_length] - - # 'attention_mask': tensor of shape [max_length] - - # MTL structure labels as tensor scalars: - # 'bug_report': tensor scalar (torch.tensor(label_value)) - # 'feature_request': tensor scalar (torch.tensor(label_value)) - # 'aspect': tensor scalar (torch.tensor(label_value)) - # 'aspect_sentiment': tensor scalar (torch.tensor(label_value)) return { 'input_ids': encoding['input_ids'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0), + + # Labels for all 4 tasks, converted to tensors 'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long), 'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long), 'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long), @@ -65,18 +49,23 @@ class InferenceDataset(Dataset): return len(self.df) def __getitem__(self, idx): - #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews review = str(self.df.iloc[idx][self.text_column]) + if review == 'nan' or review.strip() == '': review = ' ' + + # Same as training dataset but without labels, for inference on test sets encoding = self.tokenizer(review, 
max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') return { 'input_ids': encoding['input_ids'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0), } - + + + if __name__ == "__main__": + # Quick test dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")) print(dataset.__getitem__(1)) diff --git a/src/evaluate.py b/src/evaluate.py index 6eb4d77..c34644b 100644 --- a/src/evaluate.py +++ b/src/evaluate.py @@ -1,4 +1,6 @@ # evauluate.py +# Evaluate MTL or STL models on the test split + import os import torch import time @@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score from dataset import ReviewDataset from model import Model, SingleTaskModel -# TODO: load checkpoint, produce tables of evaluation figures SEED = 4321 torch.manual_seed(SEED) np.random.seed(SEED) @@ -31,6 +32,7 @@ label_names = { } def parse_args(): + """Parse command line arguments for evaluation""" parser = argparse.ArgumentParser(description="RECLASS Evaluation Script") parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl") parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"]) @@ -47,11 +49,13 @@ def main(): os.makedirs("outputs/figures", exist_ok=True) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - + # Load test dataset and model test = f"data/processed/{args.dataset}_test.csv" tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") test_dataset = ReviewDataset(test, tokenizer) test_loader = DataLoader(test_dataset, batch_size=args.batch_size) + + # MTL evaluates all tasks, STL needs to know a single task to evaluate on if args.mode == "mtl": model = Model().to(device) active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] @@ -86,6 +90,7 @@ def 
main(): logits = outputs[task] preds = torch.argmax(logits, dim=1) + # Keep max softmax as confidence estimate probs = F.softmax(logits, dim=1) confidence = probs.max(dim=1).values @@ -93,6 +98,7 @@ def main(): all_preds[task].extend(preds.cpu().numpy()) all_confidences[task].extend(confidence.cpu().numpy()) + # Detailed JSON summary along with printed results summary = { "mode": args.mode, "dataset": args.dataset, @@ -137,7 +143,7 @@ def main(): print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}") print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}") - # save summary to JSON + # Store main metrics and full per class report to JSON summary["results"][task] = { "macro_f1": float(report_dict["macro avg"]["f1-score"]), "macro_precision": float(report_dict["macro avg"]["precision"]), @@ -150,8 +156,7 @@ def main(): "per_class": report_dict } - # Confusion matrix - + # Confusion matrix for each evaluated task cm = confusion_matrix(labels_arr, preds_arr) fig, ax = plt.subplots(figsize=(8, 6)) sns.heatmap( @@ -172,7 +177,6 @@ def main(): test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable test_df[f'{task}_confidence'] = conf_arr - # to JSON run_name = args.task if args.mode == "stl" else "mtl" json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json" with open(json_path, "w") as f: diff --git a/src/infer.py b/src/infer.py index 3955b8b..776bb33 100644 --- a/src/infer.py +++ b/src/infer.py @@ -1,4 +1,6 @@ # infer.py +# Run inference using MTL or STL on various inputs (CSV or User) + from datetime import datetime import os import torch @@ -20,8 +22,6 @@ from torch.utils.data import Dataset from dataset import InferenceDataset from model import Model, SingleTaskModel - - label_names = { 'bug_report': ['No', 'Yes'], 'feature_request': ['No', 'Yes'], @@ -33,9 +33,6 @@ SEED = 4321 torch.manual_seed(SEED) np.random.seed(SEED) - - - def parse_args(): parser = 
argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/") @@ -55,7 +52,7 @@ def main(): os.makedirs("outputs/inference", exist_ok=True) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # this section is nearly identical to the first part of evaluate.py + # Mirrors the evaluation script with addition of interactive modes args = parse_args() print(f'{"="*50}') print(f'{"Starting inference"}') @@ -70,7 +67,7 @@ def main(): print("Loading model, tokenizer and datasets ...") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") - # Let the user decide if they want to run inference on the whole dataset or via the shell input + # Support CSV and interactive input if not args.interactive and not args.text: infer = f"data/processed/{args.dataset}.csv" infer_df = pd.read_csv(infer) diff --git a/src/model.py b/src/model.py index 23b5e2f..706780b 100644 --- a/src/model.py +++ b/src/model.py @@ -1,27 +1,13 @@ # model.py -# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment) -# 12 transformer layers, 12 attention heads +# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel import torch.nn as nn -# Using dropout, This has proven to be an effective technique -# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580 - -# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head -# Each hidden representation is size 768 - +# Using dropout before classification to reduce overfitting class SingleTaskModel(nn.Module): - """Single task model to compare MTL approach to review classification - - Same XLM-RoBERTa only with one head, returns same dictionary 
format so training loop is the same - just different args - - Args: - task_name: which of the 4 tasks are we training for - num_classes: number of output classes for the task - dropout_rate: probability applied to cls representation, randomly drops tokens for better results - """ + """Single task model with one head to compare MTL approach to review classification""" + def __init__(self, task_name, num_classes, dropout_rate=0.2): super().__init__() self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") @@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module): return {self.task_name: logits} class Model(nn.Module): - """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads - - Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through - shared dropout then ito four linear classification heads. Shared training optimises all tasks simultaneously, - allowing the encoder to learn from the shared representations / generalisations - - Args: - dropout_rate: probability applied to preven co-adaptation of neurons across heads 0.2 is standard default - """ + """ Multitask model with shared encoder and 4 task specific heads.""" def __init__(self, dropout_rate=0.2): super().__init__() self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") @@ -58,8 +36,7 @@ class Model(nn.Module): self.aspect_head = nn.Linear(hidden_size, 6) self.aspect_sentiment_head = nn.Linear(hidden_size, 3) - # Pass through encoder then extract the token representation through [batch_size, 768] - # Apply droupout to it, take scores for each head, return them in a dictionary + # Pass through encoder once then extract the token representation, then reuse the shared representation across all tasks def forward(self, input_ids, attention_mask): outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) # index 0 from [batch_size, 768] @@ -67,7 +44,6 @@ class Model(nn.Module): output 
= self.dropout(output) - # Logits for each head: bug_logits = self.bug_head(output) feature_logits = self.feature_head(output) aspect_logits = self.aspect_head(output) diff --git a/src/multitag.py b/src/multitag.py index f862941..f9d91ce 100644 --- a/src/multitag.py +++ b/src/multitag.py @@ -1,13 +1,9 @@ # multitag.py -# This app enables manual annotation of reviews in the Uber dataset, for training with -# to achieve review classifications with multi task deep learning +# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training -# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling), -#at least that is my prediction of why this may not be as good as I wanted import tkinter as tk from tkinter import ttk import pandas as pd -# import langdetect import os class MultiTag: @@ -41,9 +37,6 @@ class MultiTag: self.number_of_aspects = 6 # number of aspect buttons self.root.title("MultiTag") - #self.display_review = tk.Text(self.root, height=20, width=100, wrap='word') - #self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10) - # Colors for active label self.color_incomplete = "#003366" self.color_complete = "#00AA00" @@ -51,8 +44,7 @@ class MultiTag: # Paths tagged_path = "data/uber_reviews_tagged.csv" sampled_path = "data/uber_reviews_sampled.csv" - # self.load_review_data("data/uber_reviews_sampled.csv") - # self.load_review_data("data/uber_reviews_tagged.csv") + if not os.path.exists(tagged_path): print(f"Tagged file did not exist, making one at: {sampled_path}") sampled_df = pd.read_csv(sampled_path, low_memory=False) @@ -89,13 +81,13 @@ class MultiTag: self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5)) - # Labels ROW 3 + # ROW 3: Field labels ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2)) ttk.Label(self.root, text="Bug Report ? 
1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2)) ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2)) ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2)) - # ROW 4 |Buttons| + # ROW 4: Input buttons # Feature Requests self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2) self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2) @@ -132,20 +124,15 @@ class MultiTag: self.root.bind("f", self.handle_key) self.root.bind("g", self.handle_key) self.root.bind("h", self.handle_key) - # self.root.bind("j", self.handle_key) - # self.root.bind("k", self.handle_key) - # self.root.bind("l", self.handle_key) - - + self.display_next_review() - # self.save_tags("data/uber_reviews_tagged.csv") self.root.mainloop() def handle_key(self, event): key = event.char - # Column 0 or 1: feature/bug (1 and 0) + # Feature Request and Bug Report are binary input (1 and 0 keys) if key in ['1', '0']: if self.active_column == 0: self.feature_pressed(key) @@ -159,7 +146,7 @@ class MultiTag: self.sentiment_pressed(key.upper()) def update_status(self): - """Update status label and highlight color based on completion state""" + """Update status label and highlight""" if self.all_labels_complete(): self.highlight.configure(bg=self.color_complete) self.status_label.configure( @@ -212,22 +199,22 @@ class MultiTag: def load_review_data(self, data_path): - """Load review data from a CSV file.""" + """Load review data from a CSV file. 
Adds annotation columns if they don't exist.""" self.review_data = pd.read_csv(data_path, low_memory=False) if "tagged" not in self.review_data.columns: - self.review_data["tagged"] = 0 # Initialize tagged column if not present + self.review_data["tagged"] = 0 if "feature_request" not in self.review_data.columns: - self.review_data["feature_request"] = "" # Initialize feature_request column if not present + self.review_data["feature_request"] = "" if "bug_report" not in self.review_data.columns: - self.review_data["bug_report"] = "" # Initialize bug_report column if not present + self.review_data["bug_report"] = "" if "aspect" not in self.review_data.columns: - self.review_data["aspect"] = "" # Initialize aspect column if not present + self.review_data["aspect"] = "" if "aspect_sentiment" not in self.review_data.columns: - self.review_data["aspect_sentiment"] = "" # Initialize aspect_sentiment column if not present + self.review_data["aspect_sentiment"] = "" print(f"Loaded {len(self.review_data)} reviews from {data_path}") def display_next_review(self): - """Display the next review in the text box.""" + """Display the next unlabelled review in the text box.""" self.current_review_index = self.get_current_review_index() if self.current_review_index < len(self.review_data): review = self.review_data.iloc[self.current_review_index] @@ -283,9 +270,8 @@ class MultiTag: row["aspect_sentiment"] != "") def save_tags(self, save_path): - """Save the tagged data to a CSV file.""" + """Save the current tagged data to a CSV file.""" self.review_data.to_csv(save_path, index=False) - # print(f"Tagged data saved to {save_path}") def quit_app(self, event): tagged_count = (self.review_data['tagged'] == 1).sum() diff --git a/src/preprocess.py b/src/preprocess.py index 88452e7..af95cab 100644 --- a/src/preprocess.py +++ b/src/preprocess.py @@ -1,19 +1,13 @@ # preprocess.py +# Text cleaning and preprocessing for the Uber Reviews Dataset # langdetect was experimented with but wasn't 
consistent enough to be a better choice than translating manually import pandas as pd import re def clean_text(text) -> str: - """Clean review text by removing URLS, emails, excessive whitespace - - Input: - text - the review text to clean - - Outputs: - str: the cleaned review text - """ + """Normalise review text by removing URLS, emails, excessive whitespace""" if pd.isna(text): return "" @@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path): 6. Removes less than 5 word reviews 6. Saves the cleaned dataset to uber_reviews_cleaned.csv - Inputs: - input_path (str): Path to uber_reviews.csv - output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv - - Outputs: - pd.df_clean: the dataframe of cleaned processed reviews """ print("="*50) print("PREPROCESSING UBER REVIEWS") @@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path): print("="*50) print(f"\nFinal dataset: {len(df_clean):,} reviews") print(f"Quality filters: word_count >= 5, duplicates removed") - # while this does remove a some legitimate reviews which would provide use in classification - # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5 - # it showed the most amount of formative reviews without seeming excessive in data removal - print("\nRating distribution:") rating_dist = df_clean['rating'].value_counts().sort_index() for rating, count in rating_dist.items(): @@ -137,10 +121,7 @@ def preprocess_uber_reviews(input_path, output_path): print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}") print(f" Null values: {df_clean.isnull().sum().to_dict()}") print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}") - # lang detection takes 5+ mins so leaving it commented for now - #df_clean['detected_lang'] = df_clean['review'].apply(detect_language) - #print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}") - + # Sample reviews from each rating print("\n" + 
"="*50) print("SAMPLE CLEANED REVIEWS") @@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path): for index, row in sample.iterrows(): print(f" • ({row['word_count']} words) {row['review'][:100]}") - # Note about language - print("Language detection not applied due to unreliability on short") - print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.") - print(" ...Manual annotation phase will identify any non-English reviews") - return df_clean if __name__ == "__main__": diff --git a/src/sampler.py b/src/sampler.py index bf03d73..152489d 100644 --- a/src/sampler.py +++ b/src/sampler.py @@ -11,11 +11,12 @@ class Sampler: def __init__(self, data_path, target_samples): self.data_path = data_path - self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers) + # Default stratification method is based on original rating distribution + self.stratify_column = "rating" self.original_data = pd.read_csv(original_path, low_memory=False) self.data = pd.read_csv(self.data_path, low_memory=False) - self.total = len(self.data) # total number of records in the dataset + self.total = len(self.data) # total number of records in the working dataset print("="*50) print("SAMPLER INITIALIZED") @@ -35,25 +36,10 @@ class Sampler: print((_origdist*100).round(1),"\n") self.data.info(verbose=True) + """ + Kept for reference with later sampling methods - # add sampling method here - # random sample 5000 entries with stratifiying by rating - """ - rating - 5 57.1% (611133) - 1 26.5% (283895) - 4 7.8% (82953) - 3 4.7% (49928) - 2 3.9% (41707) - Name: proportion, dtype: object - """ - - """ - IGNORE --- Left in just in case - - Sample randomly - Redundant calculation - Doesn't factor that the distribution changed greatly after preprocessing + Samples from current processed data rather than matching the original distribution """ def 
get_stratified_sample(self) -> pd.DataFrame: stratified_sample = ( @@ -67,9 +53,8 @@ class Sampler: def sample_col(self, column) -> pd.DataFrame: """ - IGNORE --- Left in just in case - - Randomly sample, including conflicting math, I guess I was going to stratify + Samples a proportional number of rows from one column + Deprecated: Not used in final pipeline, kept for reference """ samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000 samples_per_column = max(samples_per_column,1) # also pointless @@ -77,24 +62,9 @@ class Sampler: """ - original_distribution_sample() - The main sampling method for our labelling as it - keeps composition of the original uber dataset, verified in - which is a fairer comparison, may also work better in general - - verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb - and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb - - - manually coded distributions taken from notebooks - - for ratings and actual number of samples - rating data is the whole data for a rating as we iterate - has error handling if totals doesn't match the required amount of samples per the orig distrib - randomise the indexes (samples) and appends to the new dataset - - - + Main sampling method to annotate + Samples reviews matching the original raw dataset distribution, so the labelled set + better represents the original data and is more comparable to the unlabelled set. 
""" def original_distribution_sample(self): original_dist = { @@ -117,21 +87,14 @@ class Sampler: return original_sample """ - sample_with_keywords() - - In order to train on more bugs and features data in - future this method was created + Build a sample with more likely bugs and feature reviews - 2000 balanced by rating (400 per) - 1500 likely bugs using bug_keywords list - 1500 likely features using feature_keywords list - - inputs: - outputs: - """ def sample_with_keywords(self): - #TODO add keywords for feature classification + # Keyword lists for oversampling likely bug reports and feature requests print(f"\n{'='*50}") print("Keyword influenced / rating stratified set") print(f"\n{'='*50}") @@ -181,18 +144,16 @@ class Sampler: # Drop helper columns keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature']) - - print(f"\n Total samples: {len(keyword_sample):,}") return keyword_sample def sample_tiny_size(self): - mini_sample = self.data.sample(200) # reading some samples manually + mini_sample = self.data.sample(200) # for reading some samples manually return mini_sample def save_sample(self, sample_df,output_path): - """Save sample and display statistics""" + """Save sample and display summary statistics""" sample_df.to_csv(output_path, index=False) print(f"\n{'='*50}") diff --git a/src/train.py b/src/train.py index 7e00aea..da5a0dd 100644 --- a/src/train.py +++ b/src/train.py @@ -1,5 +1,6 @@ # train.py -# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html +# Training script for both MTL and STL setups +# Structure adapted and adjusted from standard PyTorch training loops import argparse import os from datetime import datetime @@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup from sklearn.metrics import classification_report, f1_score from sklearn.utils.class_weight import compute_class_weight - from dataset import ReviewDataset from model 
import Model, SingleTaskModel - -# ======================================================================= -# Training script for MTL and STL training configurations -# ======================================================================= - -# NFR5, reproducibility +# Fixed seed for near reproducible runs SEED = 4321 torch.manual_seed(SEED) np.random.seed(SEED) random.seed(SEED) -# ------------------- Class weights ------------------- -# Using weights inversely proportional to class frequencies to avoid majority class bias, -# prioritize useful bug reports / feature requests + def compute_weights(df, column, device): - """Copmutes inverse frequency class weights for a label column - - Uses sklearns balanced mode - Rare classes receive higher weights to penalise so it can learn more from less - """ + """Computes inverse frequency class weights for a label column""" classes = np.unique(df[column]) weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column]) return torch.tensor(weights, dtype=torch.float).to(device) -# parse_args() - NFR7 and NFR9 -# Example Usages: python src/train.py --dataset boosted -# python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far + def parse_args(): parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).") @@ -67,23 +54,25 @@ def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Starting training...", flush=True) print("Using device:", device) - # Set cuda seeds for reproducibility + + # Set cuda seeds for reproducibility on GPU if torch.cuda.is_available(): print("GPU:", torch.cuda.get_device_name(0)) torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed(SEED) print(f"Using dataset: {args.dataset.upper()}") + # Force deterministic for 
reproducibility at a slight performance cost torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - # load data + # load data into train/val splits train = f"data/processed/{args.dataset}_train.csv" val = f"data/processed/{args.dataset}_val.csv" os.makedirs("outputs", exist_ok=True) os.makedirs("runs", exist_ok=True) - # FR1, FR2, Multilingual tokenizer initilization + # Tokenizer initialization tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") train_dataset = ReviewDataset(train, tokenizer) @@ -92,7 +81,7 @@ def main(): training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) - # FR3, shared multilingual model with task-specific heads + # Shared model uses encoder across all tasks, STL model trains one task at a time if args.mode == "mtl": model = Model().to(device) active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] @@ -113,7 +102,7 @@ def main(): train_df = pd.read_csv(train) - # Class weights + # Compute per-task weights from the training split print("\n Computing class weights...") bug_weights = compute_weights(train_df, 'bug_report', device) feature_weights = compute_weights(train_df, 'feature_request', device) @@ -151,7 +140,7 @@ def main(): num_training_steps=total_steps ) - # ------------------- Training loop ------------------- + # Entry point for training loop, with TensorBoard logging and early stopping based on validation macro F1 score start_time = time.time() timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}') @@ -175,7 +164,7 @@ def main(): input_ids = batch["input_ids"].to(device) attention_mask = batch["attention_mask"].to(device) - # FR8, Multitask forward pass + # Multitask forward pass outputs = model(input_ids, attention_mask) loss = 0 @@ -199,7 +188,7 @@ def main(): 
writer.add_scalar("Loss/train", avg_train_loss, epoch) print(f"Average training loss: {avg_train_loss:.4f}") - # -------------------- Validation loop ------------------- + # Validation phase model.eval() total_val_loss = 0.0 @@ -226,7 +215,7 @@ def main(): avg_vloss = total_val_loss / len(validation_loader) writer.add_scalar("Loss/val", avg_vloss, epoch) - # FR11, Performance evaluation + # Performance evaluation summary print("\nValidation Metrics (MACRO F1):") epoch_f1 = [] for task in active_tasks: @@ -239,7 +228,7 @@ def main(): writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch) print(f" Average Macro F1: {avg_macro_f1:.4f}") - # NFR4, Early stopping + # Early stopping if avg_macro_f1 > best_f1: best_f1 = avg_macro_f1 patience_counter = 0