added further documentation across all files
This commit is contained in:
127
README.md
127
README.md
@@ -4,9 +4,11 @@
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Project Overview
|
# README not finished
|
||||||
|
|
||||||
RECLASS is a multi-task learning system which uses a shared BERT encoder with task-specific classification heads.
|
## Overview
|
||||||
|
|
||||||
|
RECLASS is a multitask learning system which uses a shared multilingual transformer encoder with task-specific heads and single-task implementations for optional comparison.
|
||||||
|
|
||||||
| Task | Output | Classes |
|
| Task | Output | Classes |
|
||||||
|------|--------|---------|
|
|------|--------|---------|
|
||||||
@@ -18,57 +20,104 @@ RECLASS is a multi-task learning system which uses a shared BERT encoder with ta
|
|||||||
## Dataset
|
## Dataset
|
||||||
|
|
||||||
- **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
|
- **Source**: [Uber Customer Reviews (Kaggle)](https://www.kaggle.com/datasets/khushipitroda/ola-vs-uber-play-store-reviews)
|
||||||
- **Original size**: 1,069,616 reviews
|
- **Original size**: ~1.07M Reviews
|
||||||
- **Cleaned size**: 495,036 reviews (after removing short/duplicate reviews)
|
- **After Preprocessing**: ~495K Reviews
|
||||||
- **Annotation target**: 5,000 manually labelled reviews
|
- **Annotation subsets**: 5,000 from the original distribution, 5,000 from a keyword-boosted sample
|
||||||
|
|
||||||
|
## Preprocessing Steps
|
||||||
|
|
||||||
|
- Removed URLs and emails
|
||||||
|
- Normalised text and punctuation
|
||||||
|
- Removed duplicate reviews
|
||||||
|
- Filtered out reviews with fewer than 5 words
|
||||||
|
|
||||||
|
- Output sets
|
||||||
|
- Original: matches the original distribution of the raw dataset
|
||||||
|
- Boosted: oversamples bug reports and feature requests using keyword heuristics
|
||||||
|
|
||||||
|
## Model
|
||||||
|
|
||||||
|
- Encoder: XLM-RoBERTa base (`FacebookAI/xlm-roberta-base`), a multilingual transformer model
|
||||||
|
- Architecture:
|
||||||
|
- Shared encoder
|
||||||
|
- Task-specific classification heads
|
||||||
|
- Training setups:
|
||||||
|
- MTL (Multitask learning)
|
||||||
|
- STL (Single-task learning)
|
||||||
|
|
||||||
|
Class weights are applied to reduce imbalance effects.
|
||||||
|
|
||||||
## Repository Structure
|
## Repository Structure
|
||||||
|
|
||||||
```
|
.
|
||||||
6013/
|
├── data
|
||||||
README.md
|
│ └── processed
|
||||||
.gitignore
|
│ ├── boosted_test.csv
|
||||||
data/
|
│ ├── boosted_train.csv
|
||||||
uber_reviews.csv # Raw dataset
|
│ ├── boosted_val.csv
|
||||||
uber_reviews_cleaned.csv # Preprocessed reviews
|
│ ├── original_test.csv
|
||||||
uber_reviews_sampled.csv # Stratified sample for annotation
|
│ ├── original_train.csv
|
||||||
uber_reviews_tagged.csv # Annotated reviews (in progress)
|
│ ├── original_val.csv
|
||||||
notebooks/
|
│ └── review.csv
|
||||||
preprocessing_uber.ipynb # Preprocessing analysis
|
├── notebooks/
|
||||||
uber_cleaned.ipynb # Cleaned data verification
|
│
|
||||||
src/
|
├── outputs
|
||||||
preprocess.py # Text cleaning and filtering pipeline
|
│ └── figures/
|
||||||
sampler.py # Stratified sampling strategies
|
├── README.md
|
||||||
multitag.py # GUI annotation tool
|
├── architecture.png
|
||||||
train.py # Model training (in progress)
|
└── src
|
||||||
infer.py # Inference pipeline (in progress)
|
├── dataset.py
|
||||||
outputs/
|
├── evaluate.py
|
||||||
figures/
|
├── infer.py
|
||||||
```
|
├── model.py
|
||||||
|
├── multitag.py
|
||||||
|
├── preprocess.py
|
||||||
|
├── sampler.py
|
||||||
|
└── train.py
|
||||||
|
|
||||||
## Current Progress
|
## Results
|
||||||
|
|
||||||
- Manual annotation of 5,000 reviews
|
Evaluation includes precision, recall, macro F1, confusion matrices and confidence analysis.
|
||||||
- BERT baseline implementation
|
|
||||||
- Multi-task model architecture
|
Results and summaries are found in `outputs/*.json` and `outputs/figures/`.
|
||||||
- Training and evaluation
|
|
||||||
- Comparative analysis (MTL vs single-task)
|
|
||||||
- Final report and presentation
|
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```
|
```
|
||||||
# Clone repository
|
|
||||||
...
|
|
||||||
# Create conda environment
|
# Create conda environment
|
||||||
...
|
conda create -n reclass python=3.11
|
||||||
|
conda activate reclass
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
...requirements.txt
|
conda install --file requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
## References
|
|
||||||
## Licenses
|
#### Train Model
|
||||||
|
|
||||||
|
```
|
||||||
|
python src/train.py --mode mtl --dataset original
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Evaluate Model
|
||||||
|
|
||||||
|
```
|
||||||
|
python src/evaluate.py --mode mtl --dataset original --model_path <model>.pt
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Run Inference
|
||||||
|
|
||||||
|
```
|
||||||
|
python src/infer.py --mode mtl --model_path <model>.pt --dataset review
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The same tokenizer is used across training, evaluation and inference to ensure consistency
|
||||||
|
- Sampling and preprocessing choices are documented further in src files and dissertation
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -1,22 +1,17 @@
|
|||||||
# dataset.py
|
# dataset.py
|
||||||
# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer
|
# Takes a row from the csv, tokenizes the review and returns a tensor ready for the model
|
||||||
# Takes a row from the csv, tokenizes the review and returns a tensor
|
|
||||||
import torch
|
import torch
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from torch.utils.data import Dataset
|
from torch.utils.data import Dataset
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
class ReviewDataset(Dataset):
|
class ReviewDataset(Dataset):
|
||||||
"""Pytorch Dataset for loading tokenized reviews
|
"""
|
||||||
|
Dataset for tokenized reviews with labels for all 4 tasks.
|
||||||
|
|
||||||
Dataset is for map style datasets like here, instead of using IterableDataset (better for data streams).
|
Dataset is for map style datasets like here, instead of using IterableDataset (better for data streams).
|
||||||
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary with of
|
Expects a csv and tokenizes reviews using XLM-RoBERTa (SentencePiece), returning a dictionary of
|
||||||
input tensors and integer labels for all 4 tasks.
|
input tensors and integer labels for all 4 tasks.
|
||||||
|
|
||||||
Args:
|
|
||||||
path (str): Path to the csv file containing the reviews and labels.
|
|
||||||
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
|
|
||||||
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, path, tokenizer, max_length=256):
|
def __init__(self, path, tokenizer, max_length=256):
|
||||||
@@ -30,25 +25,14 @@ class ReviewDataset(Dataset):
|
|||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
review = self.df.iloc[idx]['review']
|
review = self.df.iloc[idx]['review']
|
||||||
|
|
||||||
# encoding['input_ids'] 1D tensor of token ids, shape [max_length]
|
# Tokenize with padding and truncation to max_length, returning PyTorch tensors
|
||||||
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
|
|
||||||
# Both have shape [1, max_length] because of return_tensors='pt'
|
|
||||||
# Squeeze them to [max_length] with .squeeze(0)
|
|
||||||
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||||
|
|
||||||
# Returns a dictionary with:
|
|
||||||
# 'input_ids': tensor of shape [max_length]
|
|
||||||
|
|
||||||
# 'attention_mask': tensor of shape [max_length]
|
|
||||||
|
|
||||||
# MTL structure labels as tensor scalars:
|
|
||||||
# 'bug_report': tensor scalar (torch.tensor(label_value))
|
|
||||||
# 'feature_request': tensor scalar (torch.tensor(label_value))
|
|
||||||
# 'aspect': tensor scalar (torch.tensor(label_value))
|
|
||||||
# 'aspect_sentiment': tensor scalar (torch.tensor(label_value))
|
|
||||||
return {
|
return {
|
||||||
'input_ids': encoding['input_ids'].squeeze(0),
|
'input_ids': encoding['input_ids'].squeeze(0),
|
||||||
'attention_mask': encoding['attention_mask'].squeeze(0),
|
'attention_mask': encoding['attention_mask'].squeeze(0),
|
||||||
|
|
||||||
|
# Labels for all 4 tasks, converted to tensors
|
||||||
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
|
'bug_report': torch.tensor(self.df.iloc[idx]['bug_report'], dtype=torch.long),
|
||||||
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
|
'feature_request': torch.tensor(self.df.iloc[idx]['feature_request'], dtype=torch.long),
|
||||||
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
|
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
|
||||||
@@ -65,10 +49,12 @@ class InferenceDataset(Dataset):
|
|||||||
return len(self.df)
|
return len(self.df)
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
#review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews
|
|
||||||
review = str(self.df.iloc[idx][self.text_column])
|
review = str(self.df.iloc[idx][self.text_column])
|
||||||
|
|
||||||
if review == 'nan' or review.strip() == '':
|
if review == 'nan' or review.strip() == '':
|
||||||
review = ' '
|
review = ' '
|
||||||
|
|
||||||
|
# Same as training dataset but without labels, for inference on test sets
|
||||||
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||||
return {
|
return {
|
||||||
'input_ids': encoding['input_ids'].squeeze(0),
|
'input_ids': encoding['input_ids'].squeeze(0),
|
||||||
@@ -76,7 +62,10 @@ class InferenceDataset(Dataset):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# Quick test
|
||||||
dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
|
dataset = ReviewDataset("data/processed/original_train.csv", AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"))
|
||||||
print(dataset.__getitem__(1))
|
print(dataset.__getitem__(1))
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
# evaluate.py
|
# evaluate.py
|
||||||
|
# Evaluate MTL or STL models on the test split
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
import time
|
import time
|
||||||
@@ -17,7 +19,6 @@ from sklearn.metrics import classification_report, confusion_matrix, f1_score
|
|||||||
from dataset import ReviewDataset
|
from dataset import ReviewDataset
|
||||||
from model import Model, SingleTaskModel
|
from model import Model, SingleTaskModel
|
||||||
|
|
||||||
# TODO: load checkpoint, produce tables of evaluation figures
|
|
||||||
SEED = 4321
|
SEED = 4321
|
||||||
torch.manual_seed(SEED)
|
torch.manual_seed(SEED)
|
||||||
np.random.seed(SEED)
|
np.random.seed(SEED)
|
||||||
@@ -31,6 +32,7 @@ label_names = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
|
"""Parse command line arguments for evaluation"""
|
||||||
parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
|
parser = argparse.ArgumentParser(description="RECLASS Evaluation Script")
|
||||||
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
|
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
|
||||||
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
|
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
|
||||||
@@ -47,11 +49,13 @@ def main():
|
|||||||
os.makedirs("outputs/figures", exist_ok=True)
|
os.makedirs("outputs/figures", exist_ok=True)
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
|
# Load test dataset and model
|
||||||
test = f"data/processed/{args.dataset}_test.csv"
|
test = f"data/processed/{args.dataset}_test.csv"
|
||||||
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
test_dataset = ReviewDataset(test, tokenizer)
|
test_dataset = ReviewDataset(test, tokenizer)
|
||||||
test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
|
test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
|
||||||
|
|
||||||
|
# MTL evaluates all tasks, STL needs to know a single task to evaluate on
|
||||||
if args.mode == "mtl":
|
if args.mode == "mtl":
|
||||||
model = Model().to(device)
|
model = Model().to(device)
|
||||||
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
||||||
@@ -86,6 +90,7 @@ def main():
|
|||||||
logits = outputs[task]
|
logits = outputs[task]
|
||||||
preds = torch.argmax(logits, dim=1)
|
preds = torch.argmax(logits, dim=1)
|
||||||
|
|
||||||
|
# Keep max softmax as confidence estimate
|
||||||
probs = F.softmax(logits, dim=1)
|
probs = F.softmax(logits, dim=1)
|
||||||
confidence = probs.max(dim=1).values
|
confidence = probs.max(dim=1).values
|
||||||
|
|
||||||
@@ -93,6 +98,7 @@ def main():
|
|||||||
all_preds[task].extend(preds.cpu().numpy())
|
all_preds[task].extend(preds.cpu().numpy())
|
||||||
all_confidences[task].extend(confidence.cpu().numpy())
|
all_confidences[task].extend(confidence.cpu().numpy())
|
||||||
|
|
||||||
|
# Detailed JSON summary along with printed results
|
||||||
summary = {
|
summary = {
|
||||||
"mode": args.mode,
|
"mode": args.mode,
|
||||||
"dataset": args.dataset,
|
"dataset": args.dataset,
|
||||||
@@ -137,7 +143,7 @@ def main():
|
|||||||
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
|
print(f"Mean confidence for correct predictions: {mean_conf_correct:.4f}")
|
||||||
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
|
print(f"Incorrect Predictions confidence: {mean_conf_incorrect:.4f}")
|
||||||
|
|
||||||
# save summary to JSON
|
# Store main metrics and full per class report to JSON
|
||||||
summary["results"][task] = {
|
summary["results"][task] = {
|
||||||
"macro_f1": float(report_dict["macro avg"]["f1-score"]),
|
"macro_f1": float(report_dict["macro avg"]["f1-score"]),
|
||||||
"macro_precision": float(report_dict["macro avg"]["precision"]),
|
"macro_precision": float(report_dict["macro avg"]["precision"]),
|
||||||
@@ -150,8 +156,7 @@ def main():
|
|||||||
"per_class": report_dict
|
"per_class": report_dict
|
||||||
}
|
}
|
||||||
|
|
||||||
# Confusion matrix
|
# Confusion matrix for each evaluated task
|
||||||
|
|
||||||
cm = confusion_matrix(labels_arr, preds_arr)
|
cm = confusion_matrix(labels_arr, preds_arr)
|
||||||
fig, ax = plt.subplots(figsize=(8, 6))
|
fig, ax = plt.subplots(figsize=(8, 6))
|
||||||
sns.heatmap(
|
sns.heatmap(
|
||||||
@@ -172,7 +177,6 @@ def main():
|
|||||||
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
|
test_df[f'{task}_pred'] = [label_names[task][p] for p in preds_arr] # Map to human readable
|
||||||
test_df[f'{task}_confidence'] = conf_arr
|
test_df[f'{task}_confidence'] = conf_arr
|
||||||
|
|
||||||
# to JSON
|
|
||||||
run_name = args.task if args.mode == "stl" else "mtl"
|
run_name = args.task if args.mode == "stl" else "mtl"
|
||||||
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
|
json_path = f"outputs/eval_summary_{args.mode}_{run_name}_{args.dataset}.json"
|
||||||
with open(json_path, "w") as f:
|
with open(json_path, "w") as f:
|
||||||
|
|||||||
11
src/infer.py
11
src/infer.py
@@ -1,4 +1,6 @@
|
|||||||
# infer.py
|
# infer.py
|
||||||
|
# Run inference using MTL or STL on various inputs (CSV file or interactive user input)
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
@@ -20,8 +22,6 @@ from torch.utils.data import Dataset
|
|||||||
from dataset import InferenceDataset
|
from dataset import InferenceDataset
|
||||||
from model import Model, SingleTaskModel
|
from model import Model, SingleTaskModel
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
label_names = {
|
label_names = {
|
||||||
'bug_report': ['No', 'Yes'],
|
'bug_report': ['No', 'Yes'],
|
||||||
'feature_request': ['No', 'Yes'],
|
'feature_request': ['No', 'Yes'],
|
||||||
@@ -33,9 +33,6 @@ SEED = 4321
|
|||||||
torch.manual_seed(SEED)
|
torch.manual_seed(SEED)
|
||||||
np.random.seed(SEED)
|
np.random.seed(SEED)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
||||||
parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
|
parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
|
||||||
@@ -55,7 +52,7 @@ def main():
|
|||||||
os.makedirs("outputs/inference", exist_ok=True)
|
os.makedirs("outputs/inference", exist_ok=True)
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
# this section is nearly identical to the first part of evaluate.py
|
# Mirrors the evaluation script with addition of interactive modes
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
print(f'{"="*50}')
|
print(f'{"="*50}')
|
||||||
print(f'{"Starting inference"}')
|
print(f'{"Starting inference"}')
|
||||||
@@ -70,7 +67,7 @@ def main():
|
|||||||
print("Loading model, tokenizer and datasets ...")
|
print("Loading model, tokenizer and datasets ...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
|
|
||||||
# Let the user decide if they want to run inference on the whole dataset or via the shell input
|
# Support CSV and interactive input
|
||||||
if not args.interactive and not args.text:
|
if not args.interactive and not args.text:
|
||||||
infer = f"data/processed/{args.dataset}.csv"
|
infer = f"data/processed/{args.dataset}.csv"
|
||||||
infer_df = pd.read_csv(infer)
|
infer_df = pd.read_csv(infer)
|
||||||
|
|||||||
34
src/model.py
34
src/model.py
@@ -1,27 +1,13 @@
|
|||||||
# model.py
|
# model.py
|
||||||
# One encoder, four shared heads(bug report, feature request, aspect, aspect sentiment)
|
# Shared encoder (XLM-RoBERTa) with either multitask heads for all 4 tasks or single task head for comparison
|
||||||
# 12 transformer layers, 12 attention heads
|
|
||||||
|
|
||||||
from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
|
from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
# Using dropout, This has proven to be an effective technique
|
# Using dropout before classification to reduce overfitting
|
||||||
# for regularization and preventing the co-adaptation of neurons as described in https://arxiv.org/abs/1207.0580
|
|
||||||
|
|
||||||
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
|
|
||||||
# Each hidden representation is size 768
|
|
||||||
|
|
||||||
class SingleTaskModel(nn.Module):
|
class SingleTaskModel(nn.Module):
|
||||||
"""Single task model to compare MTL approach to review classification
|
"""Single task model with one head to compare MTL approach to review classification"""
|
||||||
|
|
||||||
Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
|
|
||||||
just different args
|
|
||||||
|
|
||||||
Args:
|
|
||||||
task_name: which of the 4 tasks are we training for
|
|
||||||
num_classes: number of output classes for the task
|
|
||||||
dropout_rate: probability applied to cls representation, randomly drops tokens for better results
|
|
||||||
"""
|
|
||||||
def __init__(self, task_name, num_classes, dropout_rate=0.2):
|
def __init__(self, task_name, num_classes, dropout_rate=0.2):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
@@ -35,15 +21,7 @@ class SingleTaskModel(nn.Module):
|
|||||||
return {self.task_name: logits}
|
return {self.task_name: logits}
|
||||||
|
|
||||||
class Model(nn.Module):
|
class Model(nn.Module):
|
||||||
""" Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
|
""" Multitask model with shared encoder and 4 task specific heads."""
|
||||||
|
|
||||||
Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through
|
|
||||||
shared dropout then ito four linear classification heads. Shared training optimises all tasks simultaneously,
|
|
||||||
allowing the encoder to learn from the shared representations / generalisations
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dropout_rate: probability applied to preven co-adaptation of neurons across heads 0.2 is standard default
|
|
||||||
"""
|
|
||||||
def __init__(self, dropout_rate=0.2):
|
def __init__(self, dropout_rate=0.2):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
@@ -58,8 +36,7 @@ class Model(nn.Module):
|
|||||||
self.aspect_head = nn.Linear(hidden_size, 6)
|
self.aspect_head = nn.Linear(hidden_size, 6)
|
||||||
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
|
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
|
||||||
|
|
||||||
# Pass through encoder then extract the token representation through [batch_size, 768]
|
# Pass through encoder once then extract the token representation, then reuse the shared representation across all tasks
|
||||||
# Apply droupout to it, take scores for each head, return them in a dictionary
|
|
||||||
def forward(self, input_ids, attention_mask):
|
def forward(self, input_ids, attention_mask):
|
||||||
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
||||||
# index 0 from [batch_size, 768]
|
# index 0 from [batch_size, 768]
|
||||||
@@ -67,7 +44,6 @@ class Model(nn.Module):
|
|||||||
|
|
||||||
output = self.dropout(output)
|
output = self.dropout(output)
|
||||||
|
|
||||||
# Logits for each head:
|
|
||||||
bug_logits = self.bug_head(output)
|
bug_logits = self.bug_head(output)
|
||||||
feature_logits = self.feature_head(output)
|
feature_logits = self.feature_head(output)
|
||||||
aspect_logits = self.aspect_head(output)
|
aspect_logits = self.aspect_head(output)
|
||||||
|
|||||||
@@ -1,13 +1,9 @@
|
|||||||
# multitag.py
|
# multitag.py
|
||||||
# This app enables manual annotation of reviews in the Uber dataset, for training with
|
# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
|
||||||
# to achieve review classifications with multi task deep learning
|
|
||||||
|
|
||||||
# In another time I would have had much more tasks / classifications so mtl can perform better (that would mean better labelling),
|
|
||||||
#at least that is my prediction of why this may not be as good as I wanted
|
|
||||||
import tkinter as tk
|
import tkinter as tk
|
||||||
from tkinter import ttk
|
from tkinter import ttk
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
# import langdetect
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
class MultiTag:
|
class MultiTag:
|
||||||
@@ -41,9 +37,6 @@ class MultiTag:
|
|||||||
self.number_of_aspects = 6 # number of aspect buttons
|
self.number_of_aspects = 6 # number of aspect buttons
|
||||||
self.root.title("MultiTag")
|
self.root.title("MultiTag")
|
||||||
|
|
||||||
#self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
|
|
||||||
#self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
|
|
||||||
|
|
||||||
# Colors for active label
|
# Colors for active label
|
||||||
self.color_incomplete = "#003366"
|
self.color_incomplete = "#003366"
|
||||||
self.color_complete = "#00AA00"
|
self.color_complete = "#00AA00"
|
||||||
@@ -51,8 +44,7 @@ class MultiTag:
|
|||||||
# Paths
|
# Paths
|
||||||
tagged_path = "data/uber_reviews_tagged.csv"
|
tagged_path = "data/uber_reviews_tagged.csv"
|
||||||
sampled_path = "data/uber_reviews_sampled.csv"
|
sampled_path = "data/uber_reviews_sampled.csv"
|
||||||
# self.load_review_data("data/uber_reviews_sampled.csv")
|
|
||||||
# self.load_review_data("data/uber_reviews_tagged.csv")
|
|
||||||
if not os.path.exists(tagged_path):
|
if not os.path.exists(tagged_path):
|
||||||
print(f"Tagged file did not exist, making one at: {sampled_path}")
|
print(f"Tagged file did not exist, making one at: {sampled_path}")
|
||||||
sampled_df = pd.read_csv(sampled_path, low_memory=False)
|
sampled_df = pd.read_csv(sampled_path, low_memory=False)
|
||||||
@@ -89,13 +81,13 @@ class MultiTag:
|
|||||||
self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
|
self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
|
||||||
|
|
||||||
|
|
||||||
# Labels ROW 3
|
# ROW 3: Field labels
|
||||||
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
|
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
|
||||||
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
|
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
|
||||||
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
|
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
|
||||||
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
|
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
|
||||||
|
|
||||||
# ROW 4 |Buttons|
|
# ROW 4: Input buttons
|
||||||
# Feature Requests
|
# Feature Requests
|
||||||
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
|
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
|
||||||
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
|
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
|
||||||
@@ -132,20 +124,15 @@ class MultiTag:
|
|||||||
self.root.bind("f", self.handle_key)
|
self.root.bind("f", self.handle_key)
|
||||||
self.root.bind("g", self.handle_key)
|
self.root.bind("g", self.handle_key)
|
||||||
self.root.bind("h", self.handle_key)
|
self.root.bind("h", self.handle_key)
|
||||||
# self.root.bind("j", self.handle_key)
|
|
||||||
# self.root.bind("k", self.handle_key)
|
|
||||||
# self.root.bind("l", self.handle_key)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
self.display_next_review()
|
self.display_next_review()
|
||||||
# self.save_tags("data/uber_reviews_tagged.csv")
|
|
||||||
self.root.mainloop()
|
self.root.mainloop()
|
||||||
|
|
||||||
def handle_key(self, event):
|
def handle_key(self, event):
|
||||||
key = event.char
|
key = event.char
|
||||||
|
|
||||||
# Column 0 or 1: feature/bug (1 and 0)
|
# Feature Request and Bug Report are binary input (1 and 0 keys)
|
||||||
if key in ['1', '0']:
|
if key in ['1', '0']:
|
||||||
if self.active_column == 0:
|
if self.active_column == 0:
|
||||||
self.feature_pressed(key)
|
self.feature_pressed(key)
|
||||||
@@ -159,7 +146,7 @@ class MultiTag:
|
|||||||
self.sentiment_pressed(key.upper())
|
self.sentiment_pressed(key.upper())
|
||||||
|
|
||||||
def update_status(self):
|
def update_status(self):
|
||||||
"""Update status label and highlight color based on completion state"""
|
"""Update status label and highlight"""
|
||||||
if self.all_labels_complete():
|
if self.all_labels_complete():
|
||||||
self.highlight.configure(bg=self.color_complete)
|
self.highlight.configure(bg=self.color_complete)
|
||||||
self.status_label.configure(
|
self.status_label.configure(
|
||||||
@@ -212,22 +199,22 @@ class MultiTag:
|
|||||||
|
|
||||||
|
|
||||||
def load_review_data(self, data_path):
|
def load_review_data(self, data_path):
|
||||||
"""Load review data from a CSV file."""
|
"""Load review data from a CSV file. Adds annotation columns if they don't exist."""
|
||||||
self.review_data = pd.read_csv(data_path, low_memory=False)
|
self.review_data = pd.read_csv(data_path, low_memory=False)
|
||||||
if "tagged" not in self.review_data.columns:
|
if "tagged" not in self.review_data.columns:
|
||||||
self.review_data["tagged"] = 0 # Initialize tagged column if not present
|
self.review_data["tagged"] = 0
|
||||||
if "feature_request" not in self.review_data.columns:
|
if "feature_request" not in self.review_data.columns:
|
||||||
self.review_data["feature_request"] = "" # Initialize feature_request column if not present
|
self.review_data["feature_request"] = ""
|
||||||
if "bug_report" not in self.review_data.columns:
|
if "bug_report" not in self.review_data.columns:
|
||||||
self.review_data["bug_report"] = "" # Initialize bug_report column if not present
|
self.review_data["bug_report"] = ""
|
||||||
if "aspect" not in self.review_data.columns:
|
if "aspect" not in self.review_data.columns:
|
||||||
self.review_data["aspect"] = "" # Initialize aspect column if not present
|
self.review_data["aspect"] = ""
|
||||||
if "aspect_sentiment" not in self.review_data.columns:
|
if "aspect_sentiment" not in self.review_data.columns:
|
||||||
self.review_data["aspect_sentiment"] = "" # Initialize aspect_sentiment column if not present
|
self.review_data["aspect_sentiment"] = ""
|
||||||
print(f"Loaded {len(self.review_data)} reviews from {data_path}")
|
print(f"Loaded {len(self.review_data)} reviews from {data_path}")
|
||||||
|
|
||||||
def display_next_review(self):
|
def display_next_review(self):
|
||||||
"""Display the next review in the text box."""
|
"""Display the next unlabelled review in the text box."""
|
||||||
self.current_review_index = self.get_current_review_index()
|
self.current_review_index = self.get_current_review_index()
|
||||||
if self.current_review_index < len(self.review_data):
|
if self.current_review_index < len(self.review_data):
|
||||||
review = self.review_data.iloc[self.current_review_index]
|
review = self.review_data.iloc[self.current_review_index]
|
||||||
@@ -283,9 +270,8 @@ class MultiTag:
|
|||||||
row["aspect_sentiment"] != "")
|
row["aspect_sentiment"] != "")
|
||||||
|
|
||||||
def save_tags(self, save_path):
|
def save_tags(self, save_path):
|
||||||
"""Save the tagged data to a CSV file."""
|
"""Save the current tagged data to a CSV file."""
|
||||||
self.review_data.to_csv(save_path, index=False)
|
self.review_data.to_csv(save_path, index=False)
|
||||||
# print(f"Tagged data saved to {save_path}")
|
|
||||||
|
|
||||||
def quit_app(self, event):
|
def quit_app(self, event):
|
||||||
tagged_count = (self.review_data['tagged'] == 1).sum()
|
tagged_count = (self.review_data['tagged'] == 1).sum()
|
||||||
|
|||||||
@@ -1,19 +1,13 @@
|
|||||||
# preprocess.py
|
# preprocess.py
|
||||||
|
|
||||||
|
# Text cleaning and preprocessing for the Uber Reviews Dataset
|
||||||
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
|
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
def clean_text(text) -> str:
|
||||||
"""Clean review text by removing URLs, emails and excessive whitespace
|
"""Normalise review text by removing URLs, emails and excessive whitespace"""
|
||||||
|
|
||||||
Input:
|
|
||||||
text - the review text to clean
|
|
||||||
|
|
||||||
Outputs:
|
|
||||||
str: the cleaned review text
|
|
||||||
"""
|
|
||||||
if pd.isna(text):
|
if pd.isna(text):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@@ -53,12 +47,6 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
6. Removes less than 5 word reviews
|
6. Removes less than 5 word reviews
|
||||||
7. Saves the cleaned dataset to uber_reviews_cleaned.csv
|
7. Saves the cleaned dataset to uber_reviews_cleaned.csv
|
||||||
|
|
||||||
Inputs:
|
|
||||||
input_path (str): Path to uber_reviews.csv
|
|
||||||
output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
|
|
||||||
|
|
||||||
Outputs:
|
|
||||||
pd.df_clean: the dataframe of cleaned processed reviews
|
|
||||||
"""
|
"""
|
||||||
print("="*50)
|
print("="*50)
|
||||||
print("PREPROCESSING UBER REVIEWS")
|
print("PREPROCESSING UBER REVIEWS")
|
||||||
@@ -117,10 +105,6 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
print("="*50)
|
print("="*50)
|
||||||
print(f"\nFinal dataset: {len(df_clean):,} reviews")
|
print(f"\nFinal dataset: {len(df_clean):,} reviews")
|
||||||
print(f"Quality filters: word_count >= 5, duplicates removed")
|
print(f"Quality filters: word_count >= 5, duplicates removed")
|
||||||
# while this does remove some legitimate reviews which would be useful in classification
|
|
||||||
# it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5
|
|
||||||
# it showed the most amount of formative reviews without seeming excessive in data removal
|
|
||||||
|
|
||||||
print("\nRating distribution:")
|
print("\nRating distribution:")
|
||||||
rating_dist = df_clean['rating'].value_counts().sort_index()
|
rating_dist = df_clean['rating'].value_counts().sort_index()
|
||||||
for rating, count in rating_dist.items():
|
for rating, count in rating_dist.items():
|
||||||
@@ -137,9 +121,6 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
|
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
|
||||||
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
|
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
|
||||||
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
|
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
|
||||||
# lang detection takes 5+ mins so leaving it commented for now
|
|
||||||
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
|
|
||||||
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
|
|
||||||
|
|
||||||
# Sample reviews from each rating
|
# Sample reviews from each rating
|
||||||
print("\n" + "="*50)
|
print("\n" + "="*50)
|
||||||
@@ -152,11 +133,6 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
for index, row in sample.iterrows():
|
for index, row in sample.iterrows():
|
||||||
print(f" • ({row['word_count']} words) {row['review'][:100]}")
|
print(f" • ({row['word_count']} words) {row['review'][:100]}")
|
||||||
|
|
||||||
# Note about language
|
|
||||||
print("Language detection not applied due to unreliability on short")
|
|
||||||
print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
|
|
||||||
print(" ...Manual annotation phase will identify any non-English reviews")
|
|
||||||
|
|
||||||
return df_clean
|
return df_clean
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -11,11 +11,12 @@ class Sampler:
|
|||||||
def __init__(self, data_path, target_samples):
|
def __init__(self, data_path, target_samples):
|
||||||
|
|
||||||
self.data_path = data_path
|
self.data_path = data_path
|
||||||
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
|
# Default stratification method is based on original rating distribution
|
||||||
|
self.stratify_column = "rating"
|
||||||
|
|
||||||
self.original_data = pd.read_csv(original_path, low_memory=False)
|
self.original_data = pd.read_csv(original_path, low_memory=False)
|
||||||
self.data = pd.read_csv(self.data_path, low_memory=False)
|
self.data = pd.read_csv(self.data_path, low_memory=False)
|
||||||
self.total = len(self.data) # total number of records in the dataset
|
self.total = len(self.data) # total number of records in the working dataset
|
||||||
|
|
||||||
print("="*50)
|
print("="*50)
|
||||||
print("SAMPLER INITIALIZED")
|
print("SAMPLER INITIALIZED")
|
||||||
@@ -35,25 +36,10 @@ class Sampler:
|
|||||||
print((_origdist*100).round(1),"\n")
|
print((_origdist*100).round(1),"\n")
|
||||||
|
|
||||||
self.data.info(verbose=True)
|
self.data.info(verbose=True)
|
||||||
|
|
||||||
# add sampling method here
|
|
||||||
# random sample 5000 entries, stratifying by rating
|
|
||||||
"""
|
|
||||||
rating
|
|
||||||
5 57.1% (611133)
|
|
||||||
1 26.5% (283895)
|
|
||||||
4 7.8% (82953)
|
|
||||||
3 4.7% (49928)
|
|
||||||
2 3.9% (41707)
|
|
||||||
Name: proportion, dtype: object
|
|
||||||
"""
|
"""
|
||||||
|
Kept for reference with later sampling methods
|
||||||
|
|
||||||
"""
|
Samples from current processed data rather than matching the original distribution
|
||||||
IGNORE --- Left in just in case
|
|
||||||
|
|
||||||
Sample randomly
|
|
||||||
Redundant calculation
|
|
||||||
Doesn't factor that the distribution changed greatly after preprocessing
|
|
||||||
"""
|
"""
|
||||||
def get_stratified_sample(self) -> pd.DataFrame:
|
def get_stratified_sample(self) -> pd.DataFrame:
|
||||||
stratified_sample = (
|
stratified_sample = (
|
||||||
@@ -67,9 +53,8 @@ class Sampler:
|
|||||||
|
|
||||||
def sample_col(self, column) -> pd.DataFrame:
|
def sample_col(self, column) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
IGNORE --- Left in just in case
|
Samples a proportional number of rows from one column
|
||||||
|
Deprecated: Not used in final pipeline, kept for reference
|
||||||
Randomly sample, including conflicting math, I guess I was going to stratify
|
|
||||||
"""
|
"""
|
||||||
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
|
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
|
||||||
samples_per_column = max(samples_per_column,1) # also pointless
|
samples_per_column = max(samples_per_column,1) # also pointless
|
||||||
@@ -77,24 +62,9 @@ class Sampler:
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
original_distribution_sample()
|
Main sampling method to annotate
|
||||||
The main sampling method for our labelling as it
|
Samples reviews matching the original raw dataset distribution, so the labelled set
|
||||||
keeps composition of the original uber dataset, verified in
|
better represents the original data and is more comparable to the unlabelled set.
|
||||||
which is a fairer comparison, may also work better in general
|
|
||||||
|
|
||||||
verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
|
|
||||||
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
|
|
||||||
|
|
||||||
|
|
||||||
manually coded distributions taken from notebooks
|
|
||||||
|
|
||||||
for ratings and actual number of samples
|
|
||||||
rating data is the whole data for a rating as we iterate
|
|
||||||
has error handling if totals doesn't match the required amount of samples per the orig distrib
|
|
||||||
randomise the indexes (samples) and appends to the new dataset
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def original_distribution_sample(self):
|
def original_distribution_sample(self):
|
||||||
original_dist = {
|
original_dist = {
|
||||||
@@ -117,21 +87,14 @@ class Sampler:
|
|||||||
return original_sample
|
return original_sample
|
||||||
|
|
||||||
"""
|
"""
|
||||||
sample_with_keywords()
|
Build a sample enriched with likely bug reports and feature requests
|
||||||
|
|
||||||
In order to train on more bugs and features data in
|
|
||||||
future this method was created
|
|
||||||
- 2000 balanced by rating (400 per)
|
- 2000 balanced by rating (400 per)
|
||||||
- 1500 likely bugs using bug_keywords list
|
- 1500 likely bugs using bug_keywords list
|
||||||
- 1500 likely features using feature_keywords list
|
- 1500 likely features using feature_keywords list
|
||||||
|
|
||||||
inputs:
|
|
||||||
outputs:
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def sample_with_keywords(self):
|
def sample_with_keywords(self):
|
||||||
#TODO add keywords for feature classification
|
# Keyword lists for oversampling likely bug reports and feature requests
|
||||||
print(f"\n{'='*50}")
|
print(f"\n{'='*50}")
|
||||||
print("Keyword influenced / rating stratified set")
|
print("Keyword influenced / rating stratified set")
|
||||||
print(f"\n{'='*50}")
|
print(f"\n{'='*50}")
|
||||||
@@ -181,18 +144,16 @@ class Sampler:
|
|||||||
# Drop helper columns
|
# Drop helper columns
|
||||||
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
|
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(f"\n Total samples: {len(keyword_sample):,}")
|
print(f"\n Total samples: {len(keyword_sample):,}")
|
||||||
return keyword_sample
|
return keyword_sample
|
||||||
|
|
||||||
def sample_tiny_size(self):
|
def sample_tiny_size(self):
|
||||||
mini_sample = self.data.sample(200) # reading some samples manually
|
mini_sample = self.data.sample(200) # for reading some samples manually
|
||||||
return mini_sample
|
return mini_sample
|
||||||
|
|
||||||
|
|
||||||
def save_sample(self, sample_df,output_path):
|
def save_sample(self, sample_df,output_path):
|
||||||
"""Save sample and display statistics"""
|
"""Save sample and display summary statistics"""
|
||||||
sample_df.to_csv(output_path, index=False)
|
sample_df.to_csv(output_path, index=False)
|
||||||
|
|
||||||
print(f"\n{'='*50}")
|
print(f"\n{'='*50}")
|
||||||
|
|||||||
47
src/train.py
47
src/train.py
@@ -1,5 +1,6 @@
|
|||||||
# train.py
|
# train.py
|
||||||
# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
|
# Training script for both MTL and STL setups
|
||||||
|
# Structure adapted and adjusted from standard PyTorch training loops
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -17,40 +18,26 @@ from transformers import get_linear_schedule_with_warmup
|
|||||||
from sklearn.metrics import classification_report, f1_score
|
from sklearn.metrics import classification_report, f1_score
|
||||||
from sklearn.utils.class_weight import compute_class_weight
|
from sklearn.utils.class_weight import compute_class_weight
|
||||||
|
|
||||||
|
|
||||||
from dataset import ReviewDataset
|
from dataset import ReviewDataset
|
||||||
from model import Model, SingleTaskModel
|
from model import Model, SingleTaskModel
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Fixed seed for near-reproducible runs
|
||||||
# =======================================================================
|
|
||||||
# Training script for MTL and STL training configurations
|
|
||||||
# =======================================================================
|
|
||||||
|
|
||||||
# NFR5, reproducibility
|
|
||||||
SEED = 4321
|
SEED = 4321
|
||||||
torch.manual_seed(SEED)
|
torch.manual_seed(SEED)
|
||||||
np.random.seed(SEED)
|
np.random.seed(SEED)
|
||||||
random.seed(SEED)
|
random.seed(SEED)
|
||||||
|
|
||||||
|
|
||||||
# ------------------- Class weights -------------------
|
|
||||||
# Using weights inversely proportional to class frequencies to avoid majority class bias,
|
|
||||||
# prioritize useful bug reports / feature requests
|
|
||||||
def compute_weights(df, column, device):
|
|
||||||
"""Computes inverse frequency class weights for a label column
|
|
||||||
|
|
||||||
Uses sklearns balanced mode
|
def compute_weights(df, column, device):
|
||||||
Rare classes receive higher weights, so errors on them are penalised more heavily and the model learns more from fewer examples
|
"""Computes inverse frequency class weights for a label column"""
|
||||||
"""
|
|
||||||
classes = np.unique(df[column])
|
classes = np.unique(df[column])
|
||||||
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
|
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
|
||||||
return torch.tensor(weights, dtype=torch.float).to(device)
|
return torch.tensor(weights, dtype=torch.float).to(device)
|
||||||
|
|
||||||
# parse_args() - NFR7 and NFR9
|
|
||||||
# Example Usages: python src/train.py --dataset boosted
|
|
||||||
# python src/train.py --epochs 15 NOTE: 8 - 12 epochs has seen best results so far
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
||||||
parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
|
parser.add_argument("--mode", type=str, default="mtl", choices=["mtl", "stl"], help="Choose between 'mtl' (multitask learning) and 'stl' (single task learning).")
|
||||||
@@ -67,23 +54,25 @@ def main():
|
|||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
print("Starting training...", flush=True)
|
print("Starting training...", flush=True)
|
||||||
print("Using device:", device)
|
print("Using device:", device)
|
||||||
# Set cuda seeds for reproducibility
|
|
||||||
|
# Set cuda seeds for reproducibility on GPU
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
print("GPU:", torch.cuda.get_device_name(0))
|
print("GPU:", torch.cuda.get_device_name(0))
|
||||||
torch.cuda.manual_seed_all(SEED)
|
torch.cuda.manual_seed_all(SEED)
|
||||||
torch.cuda.manual_seed(SEED)
|
torch.cuda.manual_seed(SEED)
|
||||||
print(f"Using dataset: {args.dataset.upper()}")
|
print(f"Using dataset: {args.dataset.upper()}")
|
||||||
|
|
||||||
# Force deterministic for reproducibility at a slight performance cost
|
# Force deterministic for reproducibility at a slight performance cost
|
||||||
torch.backends.cudnn.deterministic = True
|
torch.backends.cudnn.deterministic = True
|
||||||
torch.backends.cudnn.benchmark = False
|
torch.backends.cudnn.benchmark = False
|
||||||
|
|
||||||
# load data
|
# load data into train/val splits
|
||||||
train = f"data/processed/{args.dataset}_train.csv"
|
train = f"data/processed/{args.dataset}_train.csv"
|
||||||
val = f"data/processed/{args.dataset}_val.csv"
|
val = f"data/processed/{args.dataset}_val.csv"
|
||||||
os.makedirs("outputs", exist_ok=True)
|
os.makedirs("outputs", exist_ok=True)
|
||||||
os.makedirs("runs", exist_ok=True)
|
os.makedirs("runs", exist_ok=True)
|
||||||
|
|
||||||
# FR1, FR2, Multilingual tokenizer initialization
|
# Tokenizer initialization
|
||||||
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
|
|
||||||
train_dataset = ReviewDataset(train, tokenizer)
|
train_dataset = ReviewDataset(train, tokenizer)
|
||||||
@@ -92,7 +81,7 @@ def main():
|
|||||||
training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
|
training_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
|
||||||
validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
|
validation_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
|
||||||
|
|
||||||
# FR3, shared multilingual model with task-specific heads
|
# Shared model uses encoder across all tasks, STL model trains one task at a time
|
||||||
if args.mode == "mtl":
|
if args.mode == "mtl":
|
||||||
model = Model().to(device)
|
model = Model().to(device)
|
||||||
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
||||||
@@ -113,7 +102,7 @@ def main():
|
|||||||
|
|
||||||
train_df = pd.read_csv(train)
|
train_df = pd.read_csv(train)
|
||||||
|
|
||||||
# Class weights
|
# Compute per-task weights from the training split
|
||||||
print("\n Computing class weights...")
|
print("\n Computing class weights...")
|
||||||
bug_weights = compute_weights(train_df, 'bug_report', device)
|
bug_weights = compute_weights(train_df, 'bug_report', device)
|
||||||
feature_weights = compute_weights(train_df, 'feature_request', device)
|
feature_weights = compute_weights(train_df, 'feature_request', device)
|
||||||
@@ -151,7 +140,7 @@ def main():
|
|||||||
num_training_steps=total_steps
|
num_training_steps=total_steps
|
||||||
)
|
)
|
||||||
|
|
||||||
# ------------------- Training loop -------------------
|
# Entry point for training loop, with TensorBoard logging and early stopping based on validation macro F1 score
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||||
writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
|
writer = SummaryWriter(f'runs/reclass_{run_name}_{timestamp}')
|
||||||
@@ -175,7 +164,7 @@ def main():
|
|||||||
input_ids = batch["input_ids"].to(device)
|
input_ids = batch["input_ids"].to(device)
|
||||||
attention_mask = batch["attention_mask"].to(device)
|
attention_mask = batch["attention_mask"].to(device)
|
||||||
|
|
||||||
# FR8, Multitask forward pass
|
# Multitask forward pass
|
||||||
outputs = model(input_ids, attention_mask)
|
outputs = model(input_ids, attention_mask)
|
||||||
|
|
||||||
loss = 0
|
loss = 0
|
||||||
@@ -199,7 +188,7 @@ def main():
|
|||||||
writer.add_scalar("Loss/train", avg_train_loss, epoch)
|
writer.add_scalar("Loss/train", avg_train_loss, epoch)
|
||||||
print(f"Average training loss: {avg_train_loss:.4f}")
|
print(f"Average training loss: {avg_train_loss:.4f}")
|
||||||
|
|
||||||
# -------------------- Validation loop -------------------
|
# Validation phase
|
||||||
model.eval()
|
model.eval()
|
||||||
total_val_loss = 0.0
|
total_val_loss = 0.0
|
||||||
|
|
||||||
@@ -226,7 +215,7 @@ def main():
|
|||||||
avg_vloss = total_val_loss / len(validation_loader)
|
avg_vloss = total_val_loss / len(validation_loader)
|
||||||
writer.add_scalar("Loss/val", avg_vloss, epoch)
|
writer.add_scalar("Loss/val", avg_vloss, epoch)
|
||||||
|
|
||||||
# FR11, Performance evaluation
|
# Performance evaluation summary
|
||||||
print("\nValidation Metrics (MACRO F1):")
|
print("\nValidation Metrics (MACRO F1):")
|
||||||
epoch_f1 = []
|
epoch_f1 = []
|
||||||
for task in active_tasks:
|
for task in active_tasks:
|
||||||
@@ -239,7 +228,7 @@ def main():
|
|||||||
writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
|
writer.add_scalar("F1/val_macro_avg", avg_macro_f1, epoch)
|
||||||
print(f" Average Macro F1: {avg_macro_f1:.4f}")
|
print(f" Average Macro F1: {avg_macro_f1:.4f}")
|
||||||
|
|
||||||
# NFR4, Early stopping
|
# Early stopping
|
||||||
if avg_macro_f1 > best_f1:
|
if avg_macro_f1 > best_f1:
|
||||||
best_f1 = avg_macro_f1
|
best_f1 = avg_macro_f1
|
||||||
patience_counter = 0
|
patience_counter = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user