added further documentation across all files

This commit is contained in:
2026-04-05 14:19:57 +01:00
parent 7fa67af6c0
commit 1cca27e0b8
9 changed files with 173 additions and 246 deletions

127
README.md
View File

@@ -4,9 +4,11 @@
---
## Project Overview
# README not finished
RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads.
## Overview
RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison.
| Task | Output | Classes |
|------|--------|---------|
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
## Dataset
- **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
- **Original size**: 1,069,616 reviews
- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews)
- **Annotation target**: 5,000 manually labelled reviews
- **Original size**: ~1.07M Reviews
- **After Preprocessing**: ~495K Reviews
- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword-boosted sample
## Preprocessing Steps
- Removed URLs and emails
- Normalised text and punctuation
- Removed duplicate reviews
- Filtered out reviews with fewer than 5 words
- Output sets
- Original: matches the original distribution of the raw dataset
- Boosted: oversamples bug reports and feature requests using keyword heuristics
## Model
- Encoder: XLM-RoBERTa base (`FacebookAI/xlm-roberta-base`), a multilingual transformer model
- Architecture:
- Shared encoder
- Task-specific classification heads
- Training setups:
- MTL (Multitask learning)
- STL (Single-task learning)
Class weights are applied to reduce imbalance effects.
## Repository Structure
```
6013/
README.md
.gitignore
data/
uber_reviews.csv # Raw dataset
uber_reviews_cleaned.csv # Preprocessed reviews
uber_reviews_sampled.csv # Stratified sample for annotation
uber_reviews_tagged.csv # Annotated reviews (in progress)
notebooks/
preprocessing_uber.ipynb # Preprocessing analysis
uber_cleaned.ipynb # Cleaned data verification
src/
preprocess.py # Text cleaning and filtering pipeline
sampler.py # Stratified sampling strategies
multitag.py # GUI annotation tool
train.py # Model training (in progress)
infer.py # Inference pipeline (in progress)
outputs/
figures/
```
.
├── data
└── processed
├── boosted_test.csv
├── boosted_train.csv
├── boosted_val.csv
├── original_test.csv
├── original_train.csv
├── original_val.csv
└── review.csv
├── notebooks/
├── outputs
│ └── figures/
├── README.md
├── architecture.png
└── src
├── dataset.py
├── evaluate.py
├── infer.py
├── model.py
├── multitag.py
├── preprocess.py
├── sampler.py
└── train.py
## Current Progress
## Results
- Manual annotation of 5,000 reviews
- BERT baseline implementation
- Multi-task model architecture
- Training and evaluation
- Comparative analysis (MTL vs single-task)
- Final report and presentation
Evaluation includes precision, recall, macro F1, confusion matrices and confidence analysis.
Results and summaries are found in `outputs/*.json` and `outputs/figures/`.
## Installation
```
# Clone repository
...
# Create conda environment
...
conda create -n reclass python=3.11
conda activate reclass
```
```
# Install dependencies
...requirements.txt
conda install --file requirements.txt
```
## Usage
## References
## Licenses
#### Train Model
```
python src/train.py --mode mtl --dataset original
```
#### Evaluate Model
```
python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
```
#### Run Inference
```
python src/infer.py --mode mtl --model_path <model>.pt --dataset review
```
## Notes
- The same tokenizer is used across training, evaluation and inference to ensure consistency
- Sampling and preprocessing choices are documented further in the src files and the dissertation
---

View File

@@ -1,22 +1,17 @@
# dataset.py
# Tokenize data using XLM-RoBERTa's (SentencePiece) tokenizer
# Takes a row from the csv, tokenizes the review and returns a tensor
# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer
class ReviewDataset(Dataset):
"""Pytorch Dataset for loading tokenized reviews
"""
Dataset for tokenized reviews with labels for all 4 tasks.
Uses the map-style Dataset base class rather than IterableDataset, which is better suited to data streams.
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
input tensors and integer labels for all 4 tasks.
Args:
path (str): Path to the csv file containing the reviews and labels.
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
"""
def __init__(self, path, tokenizer, max_length=256):
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
def __getitem__(self, idx):
review = self.df.iloc[idx]['review']
# encoding['input_ids'] 1D tensor of token ids, shape [max_length]
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
# Both have shape [1, max_length] because of return_tensors='pt'
# Squeeze them to [max_length] with .squeeze(0)
# Tokenize with padding and truncation to max_length, returning PyTorch tensors
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
# Returns a dictionary with:
# 'input_ids': tensor of shape [max_length]
# 'attention_mask': tensor of shape [max_length]
# MTL structure labels as tensor scalars:
# 'bug_report': tensor scalar (torch.tensor(label_value))
# 'feature_request': tensor scalar (torch.tensor(label_value))
# 'aspect': tensor scalar (torch.tensor(label_value))
# 'aspect_sentiment': tensor scalar (torch.tensor(label_value))
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
# Labels for all 4 tasks, converted to tensors
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
@@ -65,18 +49,23 @@ class InferenceDataset(Dataset):
return len(self.df)
def __getitem__(self, idx):
    """Tokenize the review at row ``idx`` and return the model inputs (no labels).

    Args:
        idx: Positional row index into ``self.df``.

    Returns:
        dict with 'input_ids' and 'attention_mask', each taken from the
        tokenizer output with its leading dimension removed via ``squeeze(0)``.
    """
    # A direct self.df.iloc[idx][self.text_column] lookup is no longer enough:
    # all reviews are kept, so the cell can be missing. str() turns a missing
    # value into 'nan', which is replaced (like blank text) by a single space.
    review = str(self.df.iloc[idx][self.text_column])
    if review == 'nan' or review.strip() == '':
        review = ' '
    # Same tokenization as the training dataset but without labels, for inference on test sets
    encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
    return {
        'input_ids': encoding['input_ids'].squeeze(0),
        'attention_mask': encoding['attention_mask'].squeeze(0),
    }
if __name__ == "__main__":
    # Quick manual smoke test: build the training dataset and print one encoded row.
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    dataset = ReviewDataset("data/processed/original_train.csv", tokenizer)
    print(dataset[1])

View File

@@ -1,4 +1,6 @@
# evaluate.py
# Evaluate MTL or STL models on the test split
import os
import torch
import time
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
from dataset import ReviewDataset
from model import Model, SingleTaskModel
# TODO: load checkpoint, produce tables of evaluation figures
SEED = 4321
torch.manual_seed(SEED)
np.random.seed(SEED)
@@ -31,6 +32,7 @@ label_names = {
}
def parse_args():
"""Parse command line arguments for evaluation"""
parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
@@ -47,11 +49,13 @@ def main():
os.makedirs("outputs/figures", exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load test dataset and model
test = f"data/processed/{args.dataset}_test.csv"
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
test_dataset = ReviewDataset(test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
# MTL evaluates all tasks, STL needs to know a single task to evaluate on
if args.mode == "mtl":
model = Model().to(device)
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -86,6 +90,7 @@ def main():
logits = outputs[task]
preds = torch.argmax(logits, dim=1)
# Keep the max softmax probability as a confidence estimate
probs = F.softmax(logits, dim=1)
confidence = probs.max(dim=1).values
@@ -93,6 +98,7 @@ def main():
all_preds[task].extend(preds.cpu().numpy())
all_confidences[task].extend(confidence.cpu().numpy())
# Detailed JSON summary along with printed results
summary = {
"mode": args.mode,
"dataset": args.dataset,
@@ -137,7 +143,7 @@ def main():
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
# save summary to JSON
# Store main metrics and full per class report to JSON
summary["results"][task] = {
"macro_f1": float(report_dict["macro avg"]["f1-score"]),
"macro_precision": float(report_dict["macro avg"]["precision"]),
@@ -150,8 +156,7 @@ def main():
"per_class": report_dict
}
# Confusion matrix
# Confusion matrix for each evaluated task
cm = confusion_matrix(labels_arr, preds_arr)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
@@ -172,7 +177,6 @@ def main():
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
test_df[f'{task}_confidence'] = conf_arr
# to JSON
run_name = args.task if args.mode == "stl" else "mtl"
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
with open(json_path, "w") as f:

View File

@@ -1,4 +1,6 @@
# infer.py
# Run inference using MTL or STL on various inputs (CSV file or user-supplied text)
from datetime import datetime
import os
import torch
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
from dataset import InferenceDataset
from model import Model, SingleTaskModel
label_names = {
'bug_report': ['No', 'Yes'],
'feature_request': ['No', 'Yes'],
@@ -33,9 +33,6 @@ SEED = 4321
torch.manual_seed(SEED)
np.random.seed(SEED)
def parse_args():
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
@@ -55,7 +52,7 @@ def main():
os.makedirs("outputs/inference", exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# this section is nearly identical to the first part of evaluate.py
# Mirrors the evaluation script with addition of interactive modes
args = parse_args()
print(f'{"="*50}')
print(f'{"Starting inference"}')
@@ -70,7 +67,7 @@ def main():
print("Loading model, tokenizer and datasets ...")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# Let the user decide if they want to run inference on the whole dataset or via the shell input
# Support CSV and interactive input
if not args.interactive and not args.text:
infer = f"data/processed/{args.dataset}.csv"
infer_df = pd.read_csv(infer)

View File

@@ -1,27 +1,13 @@
# model.py
# One shared encoder, four task-specific heads (bug report, feature request, aspect, aspect sentiment)
# 12 transformer layers, 12 attention heads
# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison
from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
import torch.nn as nn
# Using dropout, This has proven to be an effective technique
# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
# Each hidden representation is size 768
# Using dropout before classification to reduce overfitting
class SingleTaskModel(nn.Module):
"""Single task model to compare MTL approach to review classification
Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
just different args
Args:
task_name: which of the 4 tasks are we training for
num_classes: number of output classes for the task
dropout_rate: dropout probability applied to the cls representation (randomly zeroes activations, not tokens) for better generalisation
"""
"""Single task model with one head to compare MTL approach to review classification"""
def __init__(self, task_name, num_classes, dropout_rate=0.2):
super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
return {self.task_name: logits}
class Model(nn.Module):
""" Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through
shared dropout then into four linear classification heads. Shared training optimises all tasks simultaneously,
allowing the encoder to learn from the shared representations / generalisations
Args:
dropout_rate: probability applied to prevent co-adaptation of neurons across heads; 0.2 is the standard default
"""
""" Multitask model with shared encoder and 4 task specific heads."""
def __init__(self, dropout_rate=0.2):
super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -58,8 +36,7 @@ class Model(nn.Module):
self.aspect_head = nn.Linear(hidden_size, 6)
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
# Pass through encoder then extract the token representation through [batch_size, 768]
# Apply dropout to it, take scores for each head, return them in a dictionary
# Pass through the encoder once, extract the token representation, then reuse the shared representation across all tasks
def forward(self, input_ids, attention_mask):
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
# index 0 from [batch_size, 768]
@@ -67,7 +44,6 @@ class Model(nn.Module):
output = self.dropout(output)
# Logits for each head:
bug_logits = self.bug_head(output)
feature_logits = self.feature_head(output)
aspect_logits = self.aspect_head(output)

View File

@@ -1,13 +1,9 @@
# multitag.py
# This app enables manual annotation of reviews in the Uber dataset, for training with
# to achieve review classifications with multi task deep learning
# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling),
#at least that is my prediction of why this may not be as good as I wanted
import tkinter as tk
from tkinter import ttk
import pandas as pd
# import langdetect
import os
class MultiTag:
@@ -41,9 +37,6 @@ class MultiTag:
self.number_of_aspects = 6 # number of aspect buttons
self.root.title("MultiTag")
#self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
#self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
# Colors for active label
self.color_incomplete = "#003366"
self.color_complete = "#00AA00"
@@ -51,8 +44,7 @@ class MultiTag:
# Paths
tagged_path = "data/uber_reviews_tagged.csv"
sampled_path = "data/uber_reviews_sampled.csv"
# self.load_review_data("data/uber_reviews_sampled.csv")
# self.load_review_data("data/uber_reviews_tagged.csv")
if not os.path.exists(tagged_path):
print(f"Tagged file did not exist, making one at: {sampled_path}")
sampled_df = pd.read_csv(sampled_path, low_memory=False)
@@ -89,13 +81,13 @@ class MultiTag:
self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
# Labels ROW 3
# ROW 3: Field labels
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
# ROW 4 |Buttons|
# ROW 4: Input buttons
# Feature Requests
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
@@ -132,20 +124,15 @@ class MultiTag:
self.root.bind("f", self.handle_key)
self.root.bind("g", self.handle_key)
self.root.bind("h", self.handle_key)
# self.root.bind("j", self.handle_key)
# self.root.bind("k", self.handle_key)
# self.root.bind("l", self.handle_key)
self.display_next_review()
# self.save_tags("data/uber_reviews_tagged.csv")
self.root.mainloop()
def handle_key(self, event):
key = event.char
# Column 0 or 1: feature/bug (1 and 0)
# Feature Request and Bug Report are binary input (1 and 0 keys)
if key in ['1', '0']:
if self.active_column == 0:
self.feature_pressed(key)
@@ -159,7 +146,7 @@ class MultiTag:
self.sentiment_pressed(key.upper())
def update_status(self):
"""Update status label and highlight color based on completion state"""
"""Update status label and highlight"""
if self.all_labels_complete():
self.highlight.configure(bg=self.color_complete)
self.status_label.configure(
@@ -212,22 +199,22 @@ class MultiTag:
def load_review_data(self, data_path):
    """Read the review CSV into ``self.review_data`` and ensure annotation columns exist.

    Any annotation column missing from the file is created with its initial
    value: ``tagged`` starts at 0 and the four label columns start as empty
    strings. Columns already present in the file are left untouched.

    Args:
        data_path: Path of the CSV file to load.
    """
    self.review_data = pd.read_csv(data_path, low_memory=False)

    # Annotation columns and their initial values, in the order they are appended.
    annotation_defaults = (
        ("tagged", 0),
        ("feature_request", ""),
        ("bug_report", ""),
        ("aspect", ""),
        ("aspect_sentiment", ""),
    )
    for column_name, initial_value in annotation_defaults:
        if column_name not in self.review_data.columns:
            self.review_data[column_name] = initial_value

    print(f"Loaded {len(self.review_data)} reviews from {data_path}")
def display_next_review(self):
"""Display the next review in the text box."""
"""Display the next unlabelled review in the text box."""
self.current_review_index = self.get_current_review_index()
if self.current_review_index < len(self.review_data):
review = self.review_data.iloc[self.current_review_index]
@@ -283,9 +270,8 @@ class MultiTag:
row["aspect_sentiment"] != "")
def save_tags(self, save_path):
    """Save the current tagged data to a CSV file.

    Writes ``self.review_data`` to ``save_path`` with ``index=False``, so the
    DataFrame index is not written out as an extra column.

    Args:
        save_path: Destination path of the CSV file.
    """
    self.review_data.to_csv(save_path, index=False)
def quit_app(self, event):
tagged_count = (self.review_data['tagged'] == 1).sum()

View File

@@ -1,19 +1,13 @@
# preprocess.py
# Text cleaning and preprocessing for the Uber Reviews Dataset
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
import pandas as pd
import re
def clean_text(text) -> str:
"""Clean review text by removing URLs, emails and excessive whitespace
Input:
text - the review text to clean
Outputs:
str: the cleaned review text
"""
"""Normalise review text by removing URLs, emails and excessive whitespace"""
if pd.isna(text):
return ""
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
6. Removes reviews with fewer than 5 words
7. Saves the cleaned dataset to uber_reviews_cleaned.csv
Inputs:
input_path (str): Path to uber_reviews.csv
output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
Outputs:
pd.df_clean: the dataframe of cleaned processed reviews
"""
print("="*50)
print("PREPROCESSING UBER REVIEWS")
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
print("="*50)
print(f"\nFinal dataset: {len(df_clean):,} reviews")
print(f"Quality filters: word_count >= 5, duplicates removed")
# while this does remove some legitimate reviews which would be useful for classification
# it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5
# it showed the most amount of formative reviews without seeming excessive in data removal
print("\nRating distribution:")
rating_dist = df_clean['rating'].value_counts().sort_index()
for rating, count in rating_dist.items():
@@ -137,10 +121,7 @@ def preprocess_uber_reviews(input_path, output_path):
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
# lang detection takes 5+ mins so leaving it commented for now
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
# Sample reviews from each rating
print("\n" + "="*50)
print("SAMPLE CLEANED REVIEWS")
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
for index, row in sample.iterrows():
print(f" • ({row['word_count']} words) {row['review'][:100]}")
# Note about language
print("Language detection not applied due to unreliability on short")
print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
print(" ...Manual annotation phase will identify any non-English reviews")
return df_clean
if __name__ == "__main__":

View File

@@ -11,11 +11,12 @@ class Sampler:
def __init__(self, data_path, target_samples):
self.data_path = data_path
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
# Default stratification method is based on original rating distribution
self.stratify_column = "rating"
self.original_data = pd.read_csv(original_path, low_memory=False)
self.data = pd.read_csv(self.data_path, low_memory=False)
self.total = len(self.data) # total number of records in the dataset
self.total = len(self.data) # total number of records in the working dataset
print("="*50)
print("SAMPLER INITIALIZED")
@@ -35,25 +36,10 @@ class Sampler:
print((_origdist*100).round(1),"\n")
self.data.info(verbose=True)
"""
Kept for reference with later sampling methods
# add sampling method here
# random sample 5000 entries with stratifiying by rating
"""
rating
5 57.1% (611133)
1 26.5% (283895)
4 7.8% (82953)
3 4.7% (49928)
2 3.9% (41707)
Name: proportion, dtype: object
"""
"""
IGNORE --- Left in just in case
Sample randomly
Redundant calculation
Doesn't factor that the distribution changed greatly after preprocessing
Samples from current processed data rather than matching the original distribution
"""
def get_stratified_sample(self) -> pd.DataFrame:
stratified_sample = (
@@ -67,9 +53,8 @@ class Sampler:
def sample_col(self, column) -> pd.DataFrame:
"""
IGNORE --- Left in just in case
Randomly sample, including conflicting math, I guess I was going to stratify
Samples a proportional number of rows from one column
Deprecated: Not used in final pipeline, kept for reference
"""
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
samples_per_column = max(samples_per_column,1) # also pointless
@@ -77,24 +62,9 @@ class Sampler:
"""
original_distribution_sample()
The main sampling method for our labelling as it
keeps composition of the original uber dataset, verified in
which is a fairer comparison, may also work better in general
verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
manually coded distributions taken from notebooks
for ratings and actual number of samples
rating data is the whole data for a rating as we iterate
has error handling if totals doesn't match the required amount of samples per the orig distrib
randomise the indexes (samples) and appends to the new dataset
Main sampling method used to build the annotation set
Samples reviews matching the original raw dataset distribution, so the labelled set
better represents the original data and is more comparable to the unlabelled set.
"""
def original_distribution_sample(self):
original_dist = {
@@ -117,21 +87,14 @@ class Sampler:
return original_sample
"""
sample_with_keywords()
In order to train on more bugs and features data in
future this method was created
Build a sample enriched with likely bug reports and feature requests
- 2000 balanced by rating (400 per)
- 1500 likely bugs using bug_keywords list
- 1500 likely features using feature_keywords list
inputs:
outputs:
"""
def sample_with_keywords(self):
#TODO add keywords for feature classification
# Keyword lists for oversampling likely bug reports and feature requests
print(f"\n{'='*50}")
print("Keyword influenced / rating stratified set")
print(f"\n{'='*50}")
@@ -181,18 +144,16 @@ class Sampler:
# Drop helper columns
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
print(f"\n Total samples: {len(keyword_sample):,}")
return keyword_sample
def sample_tiny_size(self):
    """Return 200 randomly drawn rows of the working dataset, for reading samples manually.

    Note: ``DataFrame.sample`` draws without replacement by default, so the
    working dataset presumably needs at least 200 rows for this to succeed.
    """
    return self.data.sample(200)
def save_sample(self, sample_df,output_path):
"""Save sample and display statistics"""
"""Save sample and display summary statistics"""
sample_df.to_csv(output_path, index=False)
print(f"\n{'='*50}")

View File

@@ -1,5 +1,6 @@
# train.py
# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
# Training script for both MTL and STL setups
# Structure adapted and adjusted from standard PyTorch training loops
import argparse
import os
from datetime import datetime
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from dataset import ReviewDataset
from model import Model, SingleTaskModel
# =======================================================================
# Training script for MTL and STL training configurations
# =======================================================================
# NFR5, reproducibility
# Fixed seed for near-reproducible runs
SEED = 4321
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# ------------------- Class weights -------------------
# Using weights inversely proportional to class frequencies to avoid majority class bias,
# prioritize useful bug reports / feature requests
def compute_weights(df, column, device):
    """Compute inverse-frequency ("balanced") class weights for one label column.

    Uses sklearn's ``compute_class_weight`` in balanced mode, so rarer classes
    receive larger weights.

    Args:
        df: DataFrame holding the label column.
        column: Name of the label column to weight.
        device: Torch device the resulting weight tensor is moved to.

    Returns:
        A float tensor with one weight per distinct label value found in
        ``df[column]``, ordered as returned by ``np.unique``.
    """
    labels = df[column]
    # Only label values actually present in the data get a weight.
    present_classes = np.unique(labels)
    balanced = compute_class_weight(class_weight='balanced', classes=present_classes, y=labels)
    weight_tensor = torch.tensor(balanced, dtype=torch.float)
    return weight_tensor.to(device)
# parse_args() - NFR7 and NFR9
# Example Usages: python src/train.py --dataset boosted
# python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
def parse_args():
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
@@ -67,23 +54,25 @@ def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Starting training...", flush=True)
print("Using device:", device)
# Set cuda seeds for reproducibility
# Set cuda seeds for reproducibility on GPU
if torch.cuda.is_available():
print("GPU:", torch.cuda.get_device_name(0))
torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED)
print(f"Using dataset: {args.dataset.upper()}")
# Force deterministic cuDNN behaviour for reproducibility, at a slight performance cost
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# load data
# load data into train/val splits
train = f"data/processed/{args.dataset}_train.csv"
val = f"data/processed/{args.dataset}_val.csv"
os.makedirs("outputs", exist_ok=True)
os.makedirs("runs", exist_ok=True)
# FR1, FR2, Multilingual tokenizer initialization
# Tokenizer initialization
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
train_dataset = ReviewDataset(train, tokenizer)
@@ -92,7 +81,7 @@ def main():
training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
# FR3, shared multilingual model with task-specific heads
# Shared model uses encoder across all tasks, STL model trains one task at a time
if args.mode == "mtl":
model = Model().to(device)
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -113,7 +102,7 @@ def main():
train_df = pd.read_csv(train)
# Class weights
# Compute per-task weights from the training split
print("\n Computing class weights...")
bug_weights = compute_weights(train_df, 'bug_report', device)
feature_weights = compute_weights(train_df, 'feature_request', device)
@@ -151,7 +140,7 @@ def main():
num_training_steps=total_steps
)
# ------------------- Training loop -------------------
# Entry point for the training loop, with TensorBoard logging and early stopping based on the validation macro F1 score
start_time = time.time()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -175,7 +164,7 @@ def main():
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
# FR8, Multitask forward pass
# Multitask forward pass
outputs = model(input_ids, attention_mask)
loss = 0
@@ -199,7 +188,7 @@ def main():
writer.add_scalar("Loss/train", avg_train_loss, epoch)
print(f"Average training loss: {avg_train_loss:.4f}")
# -------------------- Validation loop -------------------
# Validation phase
model.eval()
total_val_loss = 0.0
@@ -226,7 +215,7 @@ def main():
avg_vloss = total_val_loss / len(validation_loader)
writer.add_scalar("Loss/val", avg_vloss, epoch)
# FR11, Performance evaluation
# Performance evaluation summary
print("\nValidation Metrics (MACRO F1):")
epoch_f1 = []
for task in active_tasks:
@@ -239,7 +228,7 @@ def main():
writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
print(f" Average Macro F1: {avg_macro_f1:.4f}")
# NFR4, Early stopping
# Early stopping
if avg_macro_f1 > best_f1:
best_f1 = avg_macro_f1
patience_counter = 0