Added some comments and readability

This commit is contained in:
2026-03-24 18:11:31 +00:00
parent afe61eaaa2
commit 753723694b
5 changed files with 103 additions and 84 deletions

View File

@@ -1,5 +1,5 @@
# dataset.py # dataset.py
# tokenize data using (sentencepiece) XLM-RoBERTa
# Takes a row from the csv, tokenizes the review and returns a tensor # Takes a row from the csv, tokenizes the review and returns a tensor
import torch import torch
import pandas as pd import pandas as pd
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
from transformers import AutoTokenizer from transformers import AutoTokenizer
class ReviewDataset(Dataset): class ReviewDataset(Dataset):
"""Pytorch Dataset for loading tokenized reviews
Dataset is for map-style datasets like this one, instead of using IterableDataset (better for data streams).
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
input tensors and integer labels for all 4 tasks.
Args:
path (str): Path to the csv file containing the reviews and labels.
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
"""
def __init__(self, path, tokenizer, max_length=256): def __init__(self, path, tokenizer, max_length=256):
self.df = pd.read_csv(path) self.df = pd.read_csv(path)
self.tokenizer = tokenizer self.tokenizer = tokenizer
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length] # encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
# Both have shape [1, max_length] because of return_tensors='pt' # Both have shape [1, max_length] because of return_tensors='pt'
# Squeeze them to [max_length] with .squeeze(0) # Squeeze them to [max_length] with .squeeze(0)
encoding = self.tokenizer( encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
review,
max_length=self.max_length,
padding='max_length',
truncation=True,
return_tensors='pt'
)
# Returns a dictionary with: # Returns a dictionary with:
# 'input_ids': tensor of shape [max_length] # 'input_ids': tensor of shape [max_length]

View File

@@ -11,7 +11,17 @@ import torch.nn as nn
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head # Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
# Each hidden representation is size 768 # Each hidden representation is size 768
class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE class SingleTaskModel(nn.Module):
"""Single-task model used to compare against the MTL approach to review classification
Same XLM-RoBERTa encoder but with only one head; returns the same dictionary format so the training loop is the same,
just with different args
Args:
task_name: which of the 4 tasks are we training for
num_classes: number of output classes for the task
dropout_rate: dropout probability applied to the cls representation; randomly zeroes elements of that vector (not whole tokens) as regularisation
"""
def __init__(self, task_name, num_classes, dropout_rate=0.2): def __init__(self, task_name, num_classes, dropout_rate=0.2):
super().__init__() super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
logits = self.head(output) logits = self.head(output)
return {self.task_name: logits} return {self.task_name: logits}
class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE class Model(nn.Module):
def __init__(self, dropout_rate=0.2): # Try other p values """ Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
Architecture: XLM-RoBERTa base (12 layers, 768 hidden size); the cls token representation is passed through
a shared dropout and then into four linear classification heads. Shared training optimises all tasks simultaneously,
allowing the encoder to learn shared representations / generalisations across tasks
Args:
dropout_rate: dropout probability, applied to prevent co-adaptation of neurons across heads; 0.2 is a standard default
"""
def __init__(self, dropout_rate=0.2):
super().__init__() super().__init__()
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
hidden_size = self.encoder.config.hidden_size hidden_size = self.encoder.config.hidden_size
# Applied across whole output, shared # Applied across shared cls token, before all task heads
self.dropout = nn.Dropout(dropout_rate) self.dropout = nn.Dropout(dropout_rate)
self.bug_head = nn.Linear(hidden_size, 2) self.bug_head = nn.Linear(hidden_size, 2)
@@ -39,10 +58,11 @@ class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE
self.aspect_head = nn.Linear(hidden_size, 6) self.aspect_head = nn.Linear(hidden_size, 6)
self.aspect_sentiment_head = nn.Linear(hidden_size, 3) self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
# Pass through encoder then extract the token representation # Pass through encoder then extract the token representation through [batch_size, 768]
# Apply dropout to it, take scores for each head, return them in a dictionary # Apply dropout to it, take scores for each head, return them in a dictionary
def forward(self, input_ids, attention_mask): def forward(self, input_ids, attention_mask):
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
# index 0 along the sequence dimension (the cls token), reducing [batch_size, seq_len, 768] to [batch_size, 768]
output = outputs.last_hidden_state[:, 0, :] output = outputs.last_hidden_state[:, 0, :]
output = self.dropout(output) output = self.dropout(output)

View File

@@ -1,10 +1,11 @@
# preprocess.py # preprocess.py
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
import pandas as pd import pandas as pd
import re import re
from langdetect import detect, LangDetectException
def clean_text(text): def clean_text(text) -> str:
"""Clean review text by removing URLS, emails, excessive whitespace """Clean review text by removing URLS, emails, excessive whitespace
Input: Input:
@@ -19,31 +20,22 @@ def clean_text(text):
# Convert to lower for uniformity # Convert to lower for uniformity
text = str(text).lower() text = str(text).lower()
# Remove URLs using regex # Remove URLs using regex: match http followed by one or more (+) non-whitespace chars (\S), or ( | ) www followed by the same
text = re.sub(r'http\S+|www\S+', '', text) text = re.sub(r'http\S+|www\S+', '', text)
# Remove emails # Remove emails: one or more (+) non-whitespace chars (\S) on each side of "@", replaced with '' for each text (review)
text = re.sub(r'\S+@\S+', '', text) text = re.sub(r'\S+@\S+', '', text)
# Normalize punctuation # Normalize punctuation: a literal . (\.), ! or ? repeated at least 2 times {2,} is replaced with a single one
text = re.sub(r'\.{2,}', '.', text) text = re.sub(r'\.{2,}', '.', text)
text = re.sub(r'!{2,}', '!', text) text = re.sub(r'!{2,}', '!', text)
text = re.sub(r'\?{2,}', '?', text) text = re.sub(r'\?{2,}', '?', text)
# Remove excessive whitespace by replacing with single whitespace where there is trailing spaces # Remove excessive whitespace (\s) by replacing with single whitespace where there is trailing spaces
text = re.sub(r'\s+', ' ', text).strip() text = re.sub(r'\s+', ' ', text).strip()
return text return text
def detect_language(text):
"""Detect language of text"""
try:
if pd.isna(text) or len(str(text).strip()) < 10:
return 'unknown'
return detect(str(text))
except LangDetectException:
return 'unknown'
def preprocess_uber_reviews(input_path, output_path): def preprocess_uber_reviews(input_path, output_path):
""" """
preprocess_uber_reviews by loading, cleaning, and filtering the data. preprocess_uber_reviews by loading, cleaning, and filtering the data.

View File

@@ -1,21 +1,16 @@
# TODO: Add verification comparison between ratings
# TODO: Clean up the logging print statements
import pandas as pd import pandas as pd
import numpy as np import numpy as np
print(pd.__version__) print(pd.__version__)
print(np.__version__) print(np.__version__)
path = "multitag/data/uber_reviews_cleaned.csv" path = "data/raw/uber_reviews_cleaned.csv"
sampled_path = "multitag/data/uber_reviews_sampled.csv" sampled_path = "data/raw/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
class Sampler: class Sampler:
def __init__(self, data_path, target_samples): def __init__(self, data_path, target_samples):
self.data_path = data_path self.data_path = data_path
self.target_samples = 5000 # target number of samples
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers) self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
self.original_data = pd.read_csv(original_path, low_memory=False) self.original_data = pd.read_csv(original_path, low_memory=False)
@@ -39,7 +34,7 @@ class Sampler:
print(f"Original Distribution from {original_path}:") print(f"Original Distribution from {original_path}:")
print((_origdist*100).round(1),"\n") print((_origdist*100).round(1),"\n")
self.data.info() self.data.info(verbose=True)
# add sampling method here # add sampling method here
# random sample 5000 entries with stratifiying by rating # random sample 5000 entries with stratifiying by rating
@@ -52,43 +47,53 @@ class Sampler:
2 3.9% (41707) 2 3.9% (41707)
Name: proportion, dtype: object Name: proportion, dtype: object
""" """
""" """
IGNORE --- Left in just in case
Sample size by rating Sample randomly
Redundant calculation, kept for clarity Redundant calculation
Doesn't factor that the distribution changed greatly after preprocessing Doesn't factor that the distribution changed greatly after preprocessing
""" """
def get_stratified_sample(self) -> pd.DataFrame: def get_stratified_sample(self) -> pd.DataFrame:
stratified_sample = ( stratified_sample = (
self.data self.data
.reset_index(drop=True) .reset_index(drop=True) # remove messy indexes
.apply(self.x) .apply(self.sample_col) # applies to each column
.sample(n=self.target_samples, random_state=42) .sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
) )
return stratified_sample return stratified_sample
def sample_col(self, column) -> pd.DataFrame:
"""
IGNORE --- Left in just in case
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX= Randomly sample, including conflicting math, I guess I was going to stratify
def x(self, x): """
n = int(len(x) / self.total * self.target_samples) samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
n = max(n,1) samples_per_column = max(samples_per_column,1) # also pointless
return x.sample(n=n, random_state=42) return column.sample(n=samples_per_column, random_state=42)
"""
get_proportional_sample()
"""
""" """
original_distribution_sample() original_distribution_sample()
The main sampling method for our labelling as it The main sampling method for our labelling as it
keeps composition of the original uber dataset keeps composition of the original uber dataset, verified in
which is a fairer comparison, may also work better in general which is a fairer comparison, may also work better in general
inputs: verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
manually coded distributions taken from notebooks
for ratings and actual number of samples
rating_data is all of the rows for one rating as we iterate
if a rating has fewer rows than the original distribution requires, all available rows for it are used instead
randomly samples the rows for each rating and appends them to the new dataset
outputs:
""" """
def original_distribution_sample(self): def original_distribution_sample(self):
@@ -102,8 +107,8 @@ class Sampler:
print("Target Distribution =", original_dist) print("Target Distribution =", original_dist)
samples = [] samples = []
for rating, num_samples in original_dist.items(): for rating, num_samples in original_dist.items():
rating_data = self.data[self.data[self.stratify_column] == rating] rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
if len(rating_data) < num_samples: if len(rating_data) < num_samples: # data is a pd.dataframe of the set
print("Missing samples available for rating") print("Missing samples available for rating")
num_samples = len(rating_data) num_samples = len(rating_data)
sample = rating_data.sample(n = num_samples,random_state=42) sample = rating_data.sample(n = num_samples,random_state=42)
@@ -127,9 +132,9 @@ class Sampler:
def sample_with_keywords(self): def sample_with_keywords(self):
#TODO add keywords for feature classification #TODO add keywords for feature classification
print(f"\n{"="*50}") print(f"\n{'='*50}")
print("Keyword influenced / rating stratified set") print("Keyword influenced / rating stratified set")
print(f"\n{"="*50}") print(f"\n{'='*50}")
bug_keywords = ["crash","freeze", "error", bug_keywords = ["crash","freeze", "error",
"stop", "doesnt work", "doesn't work","loading", "stop", "doesnt work", "doesn't work","loading",
@@ -204,7 +209,7 @@ class Sampler:
def main(): def main():
sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000) sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)
# Choose sampling strategy # Choose sampling strategy
print(f"\n{'='*50}") print(f"\n{'='*50}")
@@ -218,19 +223,19 @@ def main():
if choice == '1': if choice == '1':
sample = sampler.get_stratified_sample() sample = sampler.get_stratified_sample()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
elif choice == '2': elif choice == '2':
sample = sampler.original_distribution_sample() sample = sampler.original_distribution_sample()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
elif choice == '3': elif choice == '3':
sample = sampler.sample_with_keywords() sample = sampler.sample_with_keywords()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
elif choice == '4': elif choice == '4':
sample = sampler.sample_tiny_size() sample = sampler.sample_tiny_size()
sampler.save_sample(sample,"multitag/data/uber_review_temp.csv") sampler.save_sample(sample,"data/raw/uber_review_temp.csv")

View File

@@ -1,6 +1,6 @@
# train.py # train.py
# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html # structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
import argparse # argparse for later switching to boosted data import argparse
import os import os
from datetime import datetime from datetime import datetime
import time import time
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel
# ======================================================================= # =======================================================================
# Multitask implementation # Training script for MTL and STL training configurations
# ======================================================================= # =======================================================================
# NFR5, reproducibility # NFR5, reproducibility
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
np.random.seed(SEED) np.random.seed(SEED)
random.seed(SEED) random.seed(SEED)
# class weights, training loop and early stopping
# ------------------- Class weights ------------------- # ------------------- Class weights -------------------
# Using weights inversely proportional to class frequencies to avoid majority class bias, # Using weights inversely proportional to class frequencies to avoid majority class bias,
# prioritize useful bug reports / feature requests # prioritize useful bug reports / feature requests
def compute_weights(df, column, device): def compute_weights(df, column, device):
"""Computes inverse-frequency class weights for a label column
Uses sklearn's 'balanced' mode
Rare classes receive higher weights so their errors are penalised more, letting the model learn more from fewer examples
"""
classes = np.unique(df[column]) classes = np.unique(df[column])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column]) weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
return torch.tensor(weights, dtype=torch.float).to(device) return torch.tensor(weights, dtype=torch.float).to(device)
@@ -63,19 +67,17 @@ def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Starting training...", flush=True) print("Starting training...", flush=True)
print("Using device:", device) print("Using device:", device)
# Remove randomness # Set cuda seeds for reproducibility
if torch.cuda.is_available(): if torch.cuda.is_available():
print("GPU:", torch.cuda.get_device_name(0)) print("GPU:", torch.cuda.get_device_name(0))
torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED) torch.cuda.manual_seed(SEED)
print(f"Using dataset: {args.dataset.upper()}") print(f"Using dataset: {args.dataset.upper()}")
# Force deterministic for reproducibility at a slight performance cost
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False torch.backends.cudnn.benchmark = False
""" # load data
Data loading:
"""
train = f"data/processed/{args.dataset}_train.csv" train = f"data/processed/{args.dataset}_train.csv"
val = f"data/processed/{args.dataset}_val.csv" val = f"data/processed/{args.dataset}_val.csv"
os.makedirs("outputs", exist_ok=True) os.makedirs("outputs", exist_ok=True)
@@ -117,19 +119,13 @@ def main():
feature_weights = compute_weights(train_df, 'feature_request', device) feature_weights = compute_weights(train_df, 'feature_request', device)
aspect_weights = compute_weights(train_df, 'aspect', device) aspect_weights = compute_weights(train_df, 'aspect', device)
aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device) aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
# Move tensors to cpu and conver to numpy for usage with sklearn classification report
# Use detatch() later for predictions
print("Bug report class weights:", bug_weights.cpu().numpy()) print("Bug report class weights:", bug_weights.cpu().numpy())
print("Feature request class weights:", feature_weights.cpu().numpy()) print("Feature request class weights:", feature_weights.cpu().numpy())
print("Aspect class weights:", aspect_weights.cpu().numpy()) print("Aspect class weights:", aspect_weights.cpu().numpy())
print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy()) print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
# for later # equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
# 1.0 * bug_loss +
# 1.0 * feature_loss +
# 0.5 * aspect_loss +
# 0.5 * sentiment_loss
# FR4-FR7: Task specific loss functions
criterions = { criterions = {
'bug_report': nn.CrossEntropyLoss(weight=bug_weights), 'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
'feature_request': nn.CrossEntropyLoss(weight=feature_weights), 'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
@@ -140,7 +136,7 @@ def main():
# -------------------- Optimizer and scheduler ------------------- # -------------------- Optimizer and scheduler -------------------
optimizer = torch.optim.AdamW( optimizer = torch.optim.AdamW(
model.parameters(), model.parameters(),
lr=args.lr, # change lr=args.lr,
weight_decay=0.01 weight_decay=0.01
) )