added further documentation across all files

This commit is contained in:
2026-04-05 14:19:57 +01:00
parent 7fa67af6c0
commit 1cca27e0b8
9 changed files with 173 additions and 246 deletions

127
README.md
View File

@@ -4,9 +4,11 @@
--- ---
## Project Overview # README not finished
RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads. ## Overview
RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison.
| Task | Output | Classes | | Task | Output | Classes |
|------|--------|---------| |------|--------|---------|
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
## Dataset ## Dataset
- **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews) - **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
- **Original size**: 1,069,616 reviews - **Original size**: ~1.07M Reviews
- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews) - **After Preprocessing**: ~495K Reviews
- **Annotation target**: 5,000 manually labelled reviews - **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword boosted sample
## Preprocessing Steps
- Removed URLs and emails
- Normalised text and punctuation
- Removed duplicate reviews
- Filtered out reviews with fewer than 5 words
- Output sets
- Original: matches the original distribution of the raw dataset
- Boosted: oversamples bug reports and feature requests using keyword heuristics
## Model
- Encoder: XLM-RoBERTa (large multilingual transformer model)
- Architecture:
- Shared encoder
- Task-specific classification heads
- Training setups:
- MTL (Multitask learning)
- STL (Single-task learning)
Class weights are applied to reduce imbalance effects.
## Repository Structure ## Repository Structure
``` .
6013/ ├── data
README.md └── processed
.gitignore ├── boosted_test.csv
data/ ├── boosted_train.csv
uber_reviews.csv # Raw dataset ├── boosted_val.csv
uber_reviews_cleaned.csv # Preprocessed reviews ├── original_test.csv
uber_reviews_sampled.csv # Stratified sample for annotation ├── original_train.csv
uber_reviews_tagged.csv # Annotated reviews (in progress) ├── original_val.csv
notebooks/ └── review.csv
preprocessing_uber.ipynb # Preprocessing analysis ├── notebooks/
uber_cleaned.ipynb # Cleaned data verification
src/ ├── outputs
preprocess.py # Text cleaning and filtering pipeline │ └── figures/
sampler.py # Stratified sampling strategies ├── README.md
multitag.py # GUI annotation tool ├── architecture.png
train.py # Model training (in progress) └── src
infer.py # Inference pipeline (in progress) ├── dataset.py
outputs/ ├── evaluate.py
figures/ ├── infer.py
``` ├── model.py
├── multitag.py
├── preprocess.py
├── sampler.py
└── train.py
## Current Progress ## Results
- Manual annotation of 5,000 reviews Evaluation includes Precision, Recall, Macro F1, Confusion matrices and confidence analysis.
- BERT baseline implementation
- Multi-task model architecture Results and summaries are found in outputs/*.json and outputs/figures/
- Training and evaluation
- Comparative analysis (MTL vs single-task)
- Final report and presentation
## Installation ## Installation
``` ```
# Clone repository
...
# Create conda environment # Create conda environment
... conda create -n reclass python=3.11
conda activate reclass
```
```
# Install dependencies # Install dependencies
...requirements.txt conda install --file requirements.txt
``` ```
## Usage ## Usage
## References
## Licenses #### Train Model
```
python src/train.py --mode mtl --dataset original
```
#### Evaluate Model
```
python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
```
#### Run Inference
```
python src/infer.py --mode mtl --model_path <model>.pt --dataset review
```
## Notes
- The same tokenizer is used across training, evaluation and inference to ensure consistency
- Sampling and preprocessing choices are documented further in the src files and the dissertation
--- ---

View File

@@ -1,22 +1,17 @@
# dataset.py # dataset.py
# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer # Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
# Takes a row from the csv, tokenizes the review and returns a tensor
import torch import torch
import pandas as pd import pandas as pd
from torch.utils.data import Dataset from torch.utils.data import Dataset
from transformers import AutoTokenizer from transformers import AutoTokenizer
class ReviewDataset(Dataset): class ReviewDataset(Dataset):
"""Pytorch Dataset for loading tokenized reviews """
Dataset for tokenized reviews with labels for all 4 tasks.
Dataset is for map style datasets like here, instead of using IterableDataset (better for data streams). Dataset is for map style datasets like here, instead of using IterableDataset (better for data streams).
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
input tensors and integer labels for all 4 tasks. input tensors and integer labels for all 4 tasks.
Args:
path (str): Path to the csv file containing the reviews and labels.
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
""" """
def __init__(self, path, tokenizer, max_length=256): def __init__(self, path, tokenizer, max_length=256):
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
review = self.df.iloc[idx]['review'] review = self.df.iloc[idx]['review']
# encoding['input_ids'] 1D tensor of token ids, shape [max_length] # Tokenize with padding and truncation to max_length, returning PyTorch tensors
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
# Both have shape [1, max_length] because of return_tensors='pt'
# Squeeze them to [max_length] with .squeeze(0)
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
# Returns a dictionary with:
# 'input_ids': tensor of shape [max_length]
# 'attention_mask': tensor of shape [max_length]
# MTL structure labels as tensor scalars:
# 'bug_report': tensor scalar (torch.tensor(label_value))
# 'feature_request': tensor scalar (torch.tensor(label_value))
# 'aspect': tensor scalar (torch.tensor(label_value))
# 'aspect_sentiment': tensor scalar (torch.tensor(label_value))
return { return {
'input_ids': encoding['input_ids'].squeeze(0), 'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0),
# Labels for all 4 tasks, converted to tensors
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long), 'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long), 'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long), 'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
@@ -65,18 +49,23 @@ class InferenceDataset(Dataset):
return len(self.df) return len(self.df)
def __getitem__(self, idx): def __getitem__(self, idx):
#review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews
review = str(self.df.iloc[idx][self.text_column]) review = str(self.df.iloc[idx][self.text_column])
if review == 'nan' or review.strip() == '': if review == 'nan' or review.strip() == '':
review = ' ' review = ' '
# Same as training dataset but without labels, for inference on test sets
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
return { return {
'input_ids': encoding['input_ids'].squeeze(0), 'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0), 'attention_mask': encoding['attention_mask'].squeeze(0),
} }
if __name__ == "__main__": if __name__ == "__main__":
# Quick test
dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")) dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
print(dataset.__getitem__(1)) print(dataset.__getitem__(1))

View File

@@ -1,4 +1,6 @@
# evaluate.py # evaluate.py
# Evaluate MTL or STL models on the test split
import os import os
import torch import torch
import time import time
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
from dataset import ReviewDataset from dataset import ReviewDataset
from model import Model, SingleTaskModel from model import Model, SingleTaskModel
# TODO: load checkpoint, produce tables of evaluation figures
SEED = 4321 SEED = 4321
torch.manual_seed(SEED) torch.manual_seed(SEED)
np.random.seed(SEED) np.random.seed(SEED)
@@ -31,6 +32,7 @@ label_names = {
} }
def parse_args(): def parse_args():
"""Parse command line arguments for evaluation"""
parser = argparse.ArgumentParser(description="RECLASS Evaluation Script") parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl") parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"]) parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
@@ -47,11 +49,13 @@ def main():
os.makedirs("outputs/figures", exist_ok=True) os.makedirs("outputs/figures", exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load test dataset and model
test = f"data/processed/{args.dataset}_test.csv" test = f"data/processed/{args.dataset}_test.csv"
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
test_dataset = ReviewDataset(test, tokenizer) test_dataset = ReviewDataset(test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size) test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
# MTL evaluates all tasks, STL needs to know a single task to evaluate on
if args.mode == "mtl": if args.mode == "mtl":
model = Model().to(device) model = Model().to(device)
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -86,6 +90,7 @@ def main():
logits = outputs[task] logits = outputs[task]
preds = torch.argmax(logits, dim=1) preds = torch.argmax(logits, dim=1)
# Keep the max softmax probability as the confidence estimate
probs = F.softmax(logits, dim=1) probs = F.softmax(logits, dim=1)
confidence = probs.max(dim=1).values confidence = probs.max(dim=1).values
@@ -93,6 +98,7 @@ def main():
all_preds[task].extend(preds.cpu().numpy()) all_preds[task].extend(preds.cpu().numpy())
all_confidences[task].extend(confidence.cpu().numpy()) all_confidences[task].extend(confidence.cpu().numpy())
# Detailed JSON summary along with printed results
summary = { summary = {
"mode": args.mode, "mode": args.mode,
"dataset": args.dataset, "dataset": args.dataset,
@@ -137,7 +143,7 @@ def main():
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}") print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}") print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
# save summary to JSON # Store main metrics and full per class report to JSON
summary["results"][task] = { summary["results"][task] = {
"macro_f1": float(report_dict["macro avg"]["f1-score"]), "macro_f1": float(report_dict["macro avg"]["f1-score"]),
"macro_precision": float(report_dict["macro avg"]["precision"]), "macro_precision": float(report_dict["macro avg"]["precision"]),
@@ -150,8 +156,7 @@ def main():
"per_class": report_dict "per_class": report_dict
} }
# Confusion matrix # Confusion matrix for each evaluated task
cm = confusion_matrix(labels_arr, preds_arr) cm = confusion_matrix(labels_arr, preds_arr)
fig, ax = plt.subplots(figsize=(8, 6)) fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap( sns.heatmap(
@@ -172,7 +177,6 @@ def main():
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
test_df[f'{task}_confidence'] = conf_arr test_df[f'{task}_confidence'] = conf_arr
# to JSON
run_name = args.task if args.mode == "stl" else "mtl" run_name = args.task if args.mode == "stl" else "mtl"
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json" json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
with open(json_path, "w") as f: with open(json_path, "w") as f:

View File

@@ -1,4 +1,6 @@
# infer.py # infer.py
# Run inference using MTL or STL on various inputs (CSV or User)
from datetime import datetime from datetime import datetime
import os import os
import torch import torch
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
from dataset import InferenceDataset from dataset import InferenceDataset
from model import Model, SingleTaskModel from model import Model, SingleTaskModel
label_names = { label_names = {
'bug_report': ['No', 'Yes'], 'bug_report': ['No', 'Yes'],
'feature_request': ['No', 'Yes'], 'feature_request': ['No', 'Yes'],
@@ -33,9 +33,6 @@ SEED = 4321
torch.manual_seed(SEED) torch.manual_seed(SEED)
np.random.seed(SEED) np.random.seed(SEED)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/") parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
@@ -55,7 +52,7 @@ def main():
os.makedirs("outputs/inference", exist_ok=True) os.makedirs("outputs/inference", exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# this section is nearly identical to the first part of evaluate.py # Mirrors the evaluation script with addition of interactive modes
args = parse_args() args = parse_args()
print(f'{"="*50}') print(f'{"="*50}')
print(f'{"Starting inference"}') print(f'{"Starting inference"}')
@@ -70,7 +67,7 @@ def main():
print("Loading model, tokenizer and datasets ...") print("Loading model, tokenizer and datasets ...")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# Let the user decide if they want to run inference on the whole dataset or via the shell input # Support CSV and interactive input
if not args.interactive and not args.text: if not args.interactive and not args.text:
infer = f"data/processed/{args.dataset}.csv" infer = f"data/processed/{args.dataset}.csv"
infer_df = pd.read_csv(infer) infer_df = pd.read_csv(infer)

View File

@@ -1,27 +1,13 @@
# model.py # model.py
# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment) # Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison
# 12 transformer layers, 12 attention heads
from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
import torch.nn as nn import torch.nn as nn
# Using dropout, This has proven to be an effective technique # Using dropout before classification to reduce overfitting
# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
# Each hidden representation is size 768
class SingleTaskModel(nn.Module): class SingleTaskModel(nn.Module):
"""Single task model to compare MTL approach to review classification """Single task model with one head to compare MTL approach to review classification"""
Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
just different args
Args:
task_name: which of the 4 tasks are we training for
num_classes: number of output classes for the task
dropout_rate: probability applied to cls representation, randomly drops tokens for better results
"""
def __init__(self, task_name, num_classes, dropout_rate=0.2): def __init__(self, task_name, num_classes, dropout_rate=0.2):
super().__init__() super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
return {self.task_name: logits} return {self.task_name: logits}
class Model(nn.Module): class Model(nn.Module):
""" Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads """ Multitask model with shared encoder and 4 task specific heads."""
Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through
shared dropout then ito four linear classification heads. Shared training optimises all tasks simultaneously,
allowing the encoder to learn from the shared representations / generalisations
Args:
dropout_rate: probability applied to preven co-adaptation of neurons across heads 0.2 is standard default
"""
def __init__(self, dropout_rate=0.2): def __init__(self, dropout_rate=0.2):
super().__init__() super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -58,8 +36,7 @@ class Model(nn.Module):
self.aspect_head = nn.Linear(hidden_size, 6) self.aspect_head = nn.Linear(hidden_size, 6)
self.aspect_sentiment_head = nn.Linear(hidden_size, 3) self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
# Pass through encoder then extract the token representation through [batch_size, 768] # Pass through encoder once then extract the token representation, then reuse the shared representation across all tasks
# Apply droupout to it, take scores for each head, return them in a dictionary
def forward(self, input_ids, attention_mask): def forward(self, input_ids, attention_mask):
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
# index 0 from [batch_size, 768] # index 0 from [batch_size, 768]
@@ -67,7 +44,6 @@ class Model(nn.Module):
output = self.dropout(output) output = self.dropout(output)
# Logits for each head:
bug_logits = self.bug_head(output) bug_logits = self.bug_head(output)
feature_logits = self.feature_head(output) feature_logits = self.feature_head(output)
aspect_logits = self.aspect_head(output) aspect_logits = self.aspect_head(output)

View File

@@ -1,13 +1,9 @@
# multitag.py # multitag.py
# This app enables manual annotation of reviews in the Uber dataset, for training with # Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
# to achieve review classifications with multi task deep learning
# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling),
#at least that is my prediction of why this may not be as good as I wanted
import tkinter as tk import tkinter as tk
from tkinter import ttk from tkinter import ttk
import pandas as pd import pandas as pd
# import langdetect
import os import os
class MultiTag: class MultiTag:
@@ -41,9 +37,6 @@ class MultiTag:
self.number_of_aspects = 6 # number of aspect buttons self.number_of_aspects = 6 # number of aspect buttons
self.root.title("MultiTag") self.root.title("MultiTag")
#self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
#self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
# Colors for active label # Colors for active label
self.color_incomplete = "#003366" self.color_incomplete = "#003366"
self.color_complete = "#00AA00" self.color_complete = "#00AA00"
@@ -51,8 +44,7 @@ class MultiTag:
# Paths # Paths
tagged_path = "data/uber_reviews_tagged.csv" tagged_path = "data/uber_reviews_tagged.csv"
sampled_path = "data/uber_reviews_sampled.csv" sampled_path = "data/uber_reviews_sampled.csv"
# self.load_review_data("data/uber_reviews_sampled.csv")
# self.load_review_data("data/uber_reviews_tagged.csv")
if not os.path.exists(tagged_path): if not os.path.exists(tagged_path):
print(f"Tagged file did not exist, making one at: {sampled_path}") print(f"Tagged file did not exist, making one at: {sampled_path}")
sampled_df = pd.read_csv(sampled_path, low_memory=False) sampled_df = pd.read_csv(sampled_path, low_memory=False)
@@ -89,13 +81,13 @@ class MultiTag:
self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5)) self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
# Labels ROW 3 # ROW 3: Field labels
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2)) ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2)) ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2)) ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2)) ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
# ROW 4 |Buttons| # ROW 4: Input buttons
# Feature Requests # Feature Requests
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2) self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2) self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
@@ -132,20 +124,15 @@ class MultiTag:
self.root.bind("f", self.handle_key) self.root.bind("f", self.handle_key)
self.root.bind("g", self.handle_key) self.root.bind("g", self.handle_key)
self.root.bind("h", self.handle_key) self.root.bind("h", self.handle_key)
# self.root.bind("j", self.handle_key)
# self.root.bind("k", self.handle_key)
# self.root.bind("l", self.handle_key)
self.display_next_review() self.display_next_review()
# self.save_tags("data/uber_reviews_tagged.csv")
self.root.mainloop() self.root.mainloop()
def handle_key(self, event): def handle_key(self, event):
key = event.char key = event.char
# Column 0 or 1: feature/bug (1 and 0) # Feature Request and Bug Report are binary input (1 and 0 keys)
if key in ['1', '0']: if key in ['1', '0']:
if self.active_column == 0: if self.active_column == 0:
self.feature_pressed(key) self.feature_pressed(key)
@@ -159,7 +146,7 @@ class MultiTag:
self.sentiment_pressed(key.upper()) self.sentiment_pressed(key.upper())
def update_status(self): def update_status(self):
"""Update status label and highlight color based on completion state""" """Update status label and highlight"""
if self.all_labels_complete(): if self.all_labels_complete():
self.highlight.configure(bg=self.color_complete) self.highlight.configure(bg=self.color_complete)
self.status_label.configure( self.status_label.configure(
@@ -212,22 +199,22 @@ class MultiTag:
def load_review_data(self, data_path): def load_review_data(self, data_path):
"""Load review data from a CSV file.""" """Load review data from a CSV file. Adds annotation columns if they don't exist."""
self.review_data = pd.read_csv(data_path, low_memory=False) self.review_data = pd.read_csv(data_path, low_memory=False)
if "tagged" not in self.review_data.columns: if "tagged" not in self.review_data.columns:
self.review_data["tagged"] = 0 # Initialize tagged column if not present self.review_data["tagged"] = 0
if "feature_request" not in self.review_data.columns: if "feature_request" not in self.review_data.columns:
self.review_data["feature_request"] = "" # Initialize feature_request column if not present self.review_data["feature_request"] = ""
if "bug_report" not in self.review_data.columns: if "bug_report" not in self.review_data.columns:
self.review_data["bug_report"] = "" # Initialize bug_report column if not present self.review_data["bug_report"] = ""
if "aspect" not in self.review_data.columns: if "aspect" not in self.review_data.columns:
self.review_data["aspect"] = "" # Initialize aspect column if not present self.review_data["aspect"] = ""
if "aspect_sentiment" not in self.review_data.columns: if "aspect_sentiment" not in self.review_data.columns:
self.review_data["aspect_sentiment"] = "" # Initialize aspect_sentiment column if not present self.review_data["aspect_sentiment"] = ""
print(f"Loaded {len(self.review_data)} reviews from {data_path}") print(f"Loaded {len(self.review_data)} reviews from {data_path}")
def display_next_review(self): def display_next_review(self):
"""Display the next review in the text box.""" """Display the next unlabelled review in the text box."""
self.current_review_index = self.get_current_review_index() self.current_review_index = self.get_current_review_index()
if self.current_review_index < len(self.review_data): if self.current_review_index < len(self.review_data):
review = self.review_data.iloc[self.current_review_index] review = self.review_data.iloc[self.current_review_index]
@@ -283,9 +270,8 @@ class MultiTag:
row["aspect_sentiment"] != "") row["aspect_sentiment"] != "")
def save_tags(self, save_path): def save_tags(self, save_path):
"""Save the tagged data to a CSV file.""" """Save the current tagged data to a CSV file."""
self.review_data.to_csv(save_path, index=False) self.review_data.to_csv(save_path, index=False)
# print(f"Tagged data saved to {save_path}")
def quit_app(self, event): def quit_app(self, event):
tagged_count = (self.review_data['tagged'] == 1).sum() tagged_count = (self.review_data['tagged'] == 1).sum()

View File

@@ -1,19 +1,13 @@
# preprocess.py # preprocess.py
# Text cleaning and preprocessing for the Uber Reviews Dataset
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually # langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
import pandas as pd import pandas as pd
import re import re
def clean_text(text) -> str: def clean_text(text) -> str:
"""Clean review text by removing URLs, emails and excessive whitespace """Normalise review text by removing URLs, emails and excessive whitespace"""
Input:
text - the review text to clean
Outputs:
str: the cleaned review text
"""
if pd.isna(text): if pd.isna(text):
return "" return ""
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
6. Removes less than 5 word reviews 6. Removes less than 5 word reviews
7. Saves the cleaned dataset to uber_reviews_cleaned.csv 7. Saves the cleaned dataset to uber_reviews_cleaned.csv
Inputs:
input_path (str): Path to uber_reviews.csv
output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
Outputs:
pd.df_clean: the dataframe of cleaned processed reviews
""" """
print("="*50) print("="*50)
print("PREPROCESSING UBER REVIEWS") print("PREPROCESSING UBER REVIEWS")
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
print("="*50) print("="*50)
print(f"\nFinal dataset: {len(df_clean):,} reviews") print(f"\nFinal dataset: {len(df_clean):,} reviews")
print(f"Quality filters: word_count >= 5, duplicates removed") print(f"Quality filters: word_count >= 5, duplicates removed")
# while this does remove a some legitimate reviews which would provide use in classification
# it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5
# it showed the most amount of formative reviews without seeming excessive in data removal
print("\nRating distribution:") print("\nRating distribution:")
rating_dist = df_clean['rating'].value_counts().sort_index() rating_dist = df_clean['rating'].value_counts().sort_index()
for rating, count in rating_dist.items(): for rating, count in rating_dist.items():
@@ -137,10 +121,7 @@ def preprocess_uber_reviews(input_path, output_path):
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}") print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
print(f" Null values: {df_clean.isnull().sum().to_dict()}") print(f" Null values: {df_clean.isnull().sum().to_dict()}")
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}") print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
# lang detection takes 5+ mins so leaving it commented for now
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
# Sample reviews from each rating # Sample reviews from each rating
print("\n" + "="*50) print("\n" + "="*50)
print("SAMPLE CLEANED REVIEWS") print("SAMPLE CLEANED REVIEWS")
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
for index, row in sample.iterrows(): for index, row in sample.iterrows():
print(f" • ({row['word_count']} words) {row['review'][:100]}") print(f" • ({row['word_count']} words) {row['review'][:100]}")
# Note about language
print("Language detection not applied due to unreliability on short")
print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
print(" ...Manual annotation phase will identify any non-English reviews")
return df_clean return df_clean
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -11,11 +11,12 @@ class Sampler:
def __init__(self, data_path, target_samples): def __init__(self, data_path, target_samples):
self.data_path = data_path self.data_path = data_path
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers) # Default stratification method is based on original rating distribution
self.stratify_column = "rating"
self.original_data = pd.read_csv(original_path, low_memory=False) self.original_data = pd.read_csv(original_path, low_memory=False)
self.data = pd.read_csv(self.data_path, low_memory=False) self.data = pd.read_csv(self.data_path, low_memory=False)
self.total = len(self.data) # total number of records in the dataset self.total = len(self.data) # total number of records in the working dataset
print("="*50) print("="*50)
print("SAMPLER INITIALIZED") print("SAMPLER INITIALIZED")
@@ -35,25 +36,10 @@ class Sampler:
print((_origdist*100).round(1),"\n") print((_origdist*100).round(1),"\n")
self.data.info(verbose=True) self.data.info(verbose=True)
"""
Kept for reference with later sampling methods
# add sampling method here Samples from current processed data rather than matching the original distribution
# random sample 5000 entries with stratifiying by rating
"""
rating
5 57.1% (611133)
1 26.5% (283895)
4 7.8% (82953)
3 4.7% (49928)
2 3.9% (41707)
Name: proportion, dtype: object
"""
"""
IGNORE --- Left in just in case
Sample randomly
Redundant calculation
Doesn't factor that the distribution changed greatly after preprocessing
""" """
def get_stratified_sample(self) -> pd.DataFrame: def get_stratified_sample(self) -> pd.DataFrame:
stratified_sample = ( stratified_sample = (
@@ -67,9 +53,8 @@ class Sampler:
def sample_col(self, column) -> pd.DataFrame: def sample_col(self, column) -> pd.DataFrame:
""" """
IGNORE --- Left in just in case Samples a proportional number of rows from one column
Deprecated: Not used in final pipeline, kept for reference
Randomly sample, including conflicting math, I guess I was going to stratify
""" """
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000 samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
samples_per_column = max(samples_per_column,1) # also pointless samples_per_column = max(samples_per_column,1) # also pointless
@@ -77,24 +62,9 @@ class Sampler:
""" """
original_distribution_sample() Main sampling method to annotate
The main sampling method for our labelling as it Samples reviews matching the original raw dataset distribution, so the labelled set
keeps composition of the original uber dataset, verified in better represents the original data and is more comparable to the unlabelled set.
which is a fairer comparison, may also work better in general
verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
manually coded distributions taken from notebooks
for ratings and actual number of samples
rating data is the whole data for a rating as we iterate
has error handling if totals doesn't match the required amount of samples per the orig distrib
randomise the indexes (samples) and appends to the new dataset
""" """
def original_distribution_sample(self): def original_distribution_sample(self):
original_dist = { original_dist = {
@@ -117,21 +87,14 @@ class Sampler:
return original_sample return original_sample
""" """
sample_with_keywords() Build a sample with more likely bugs and feature reviews
In order to train on more bugs and features data in
future this method was created
- 2000 balanced by rating (400 per) - 2000 balanced by rating (400 per)
- 1500 likely bugs using bug_keywords list - 1500 likely bugs using bug_keywords list
- 1500 likely features using feature_keywords list - 1500 likely features using feature_keywords list
inputs:
outputs:
""" """
def sample_with_keywords(self): def sample_with_keywords(self):
#TODO add keywords for feature classification # Keyword lists for oversampling likely bug reports and feature requests
print(f"\n{'='*50}") print(f"\n{'='*50}")
print("Keyword influenced / rating stratified set") print("Keyword influenced / rating stratified set")
print(f"\n{'='*50}") print(f"\n{'='*50}")
@@ -181,18 +144,16 @@ class Sampler:
# Drop helper columns # Drop helper columns
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature']) keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
print(f"\n Total samples: {len(keyword_sample):,}") print(f"\n Total samples: {len(keyword_sample):,}")
return keyword_sample return keyword_sample
def sample_tiny_size(self): def sample_tiny_size(self):
mini_sample = self.data.sample(200) # reading some samples manually mini_sample = self.data.sample(200) # for reading some samples manually
return mini_sample return mini_sample
def save_sample(self, sample_df,output_path): def save_sample(self, sample_df,output_path):
"""Save sample and display statistics""" """Save sample and display summary statistics"""
sample_df.to_csv(output_path, index=False) sample_df.to_csv(output_path, index=False)
print(f"\n{'='*50}") print(f"\n{'='*50}")

View File

@@ -1,5 +1,6 @@
# train.py # train.py
# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html # Training script for both MTL and STL setups
# Structure adapted and adjusted from standard PyTorch training loops
import argparse import argparse
import os import os
from datetime import datetime from datetime import datetime
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import classification_report, f1_score from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight from sklearn.utils.class_weight import compute_class_weight
from dataset import ReviewDataset from dataset import ReviewDataset
from model import Model, SingleTaskModel from model import Model, SingleTaskModel
# Fixed seed for near-reproducible runs
# =======================================================================
# Training script for MTL and STL training configurations
# =======================================================================
# NFR5, reproducibility
SEED = 4321 SEED = 4321
torch.manual_seed(SEED) torch.manual_seed(SEED)
np.random.seed(SEED) np.random.seed(SEED)
random.seed(SEED) random.seed(SEED)
# ------------------- Class weights -------------------
# Using weights inversely proportional to class frequencies to avoid majority class bias,
# prioritize useful bug reports / feature requests
def compute_weights(df, column, device): def compute_weights(df, column, device):
"""Copmutes inverse frequency class weights for a label column """Computes inverse frequency class weights for a label column"""
Uses sklearns balanced mode
Rare classes receive higher weights to penalise so it can learn more from less
"""
classes = np.unique(df[column]) classes = np.unique(df[column])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column]) weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
return torch.tensor(weights, dtype=torch.float).to(device) return torch.tensor(weights, dtype=torch.float).to(device)
# parse_args() - NFR7 and NFR9
# Example Usages: python src/train.py --dataset boosted
# python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).") parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
@@ -67,23 +54,25 @@ def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Starting training...", flush=True) print("Starting training...", flush=True)
print("Using device:", device) print("Using device:", device)
# Set cuda seeds for reproducibility
# Set cuda seeds for reproducibility on GPU
if torch.cuda.is_available(): if torch.cuda.is_available():
print("GPU:", torch.cuda.get_device_name(0)) print("GPU:", torch.cuda.get_device_name(0))
torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED) torch.cuda.manual_seed(SEED)
print(f"Using dataset: {args.dataset.upper()}") print(f"Using dataset: {args.dataset.upper()}")
# Force deterministic for reproducibility at a slight performance cost # Force deterministic for reproducibility at a slight performance cost
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False torch.backends.cudnn.benchmark = False
# load data # load data into train/val splits
train = f"data/processed/{args.dataset}_train.csv" train = f"data/processed/{args.dataset}_train.csv"
val = f"data/processed/{args.dataset}_val.csv" val = f"data/processed/{args.dataset}_val.csv"
os.makedirs("outputs", exist_ok=True) os.makedirs("outputs", exist_ok=True)
os.makedirs("runs", exist_ok=True) os.makedirs("runs", exist_ok=True)
# FR1, FR2, Multilingual tokenizer initialization # Tokenizer initialization
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
train_dataset = ReviewDataset(train, tokenizer) train_dataset = ReviewDataset(train, tokenizer)
@@ -92,7 +81,7 @@ def main():
training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True) training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False) validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
# FR3, shared multilingual model with task-specific heads # MTL model shares one encoder across all tasks; STL model trains one task at a time
if args.mode == "mtl": if args.mode == "mtl":
model = Model().to(device) model = Model().to(device)
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
@@ -113,7 +102,7 @@ def main():
train_df = pd.read_csv(train) train_df = pd.read_csv(train)
# Class weights # Compute per-task weights from the training split
print("\n Computing class weights...") print("\n Computing class weights...")
bug_weights = compute_weights(train_df, 'bug_report', device) bug_weights = compute_weights(train_df, 'bug_report', device)
feature_weights = compute_weights(train_df, 'feature_request', device) feature_weights = compute_weights(train_df, 'feature_request', device)
@@ -151,7 +140,7 @@ def main():
num_training_steps=total_steps num_training_steps=total_steps
) )
# ------------------- Training loop ------------------- # Entry point for training loop, with TensorBoard logging and early stopping based on validation macro F1 score
start_time = time.time() start_time = time.time()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}') writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
@@ -175,7 +164,7 @@ def main():
input_ids = batch["input_ids"].to(device) input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device) attention_mask = batch["attention_mask"].to(device)
# FR8, Multitask forward pass # Multitask forward pass
outputs = model(input_ids, attention_mask) outputs = model(input_ids, attention_mask)
loss = 0 loss = 0
@@ -199,7 +188,7 @@ def main():
writer.add_scalar("Loss/train", avg_train_loss, epoch) writer.add_scalar("Loss/train", avg_train_loss, epoch)
print(f"Average training loss: {avg_train_loss:.4f}") print(f"Average training loss: {avg_train_loss:.4f}")
# -------------------- Validation loop ------------------- # Validation phase
model.eval() model.eval()
total_val_loss = 0.0 total_val_loss = 0.0
@@ -226,7 +215,7 @@ def main():
avg_vloss = total_val_loss / len(validation_loader) avg_vloss = total_val_loss / len(validation_loader)
writer.add_scalar("Loss/val", avg_vloss, epoch) writer.add_scalar("Loss/val", avg_vloss, epoch)
# FR11, Performance evaluation # Performance evaluation summary
print("\nValidation Metrics (MACRO F1):") print("\nValidation Metrics (MACRO F1):")
epoch_f1 = [] epoch_f1 = []
for task in active_tasks: for task in active_tasks:
@@ -239,7 +228,7 @@ def main():
writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch) writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
print(f" Average Macro F1: {avg_macro_f1:.4f}") print(f" Average Macro F1: {avg_macro_f1:.4f}")
# NFR4, Early stopping # Early stopping
if avg_macro_f1 > best_f1: if avg_macro_f1 > best_f1:
best_f1 = avg_macro_f1 best_f1 = avg_macro_f1
patience_counter = 0 patience_counter = 0