Added some comments and readability
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# dataset.py
|
||||
|
||||
# tokenize data using (sentencepiece) XLM-RoBERTa
|
||||
# Takes a row from the csv, tokenizes the review and returns a tensor
|
||||
import torch
|
||||
import pandas as pd
|
||||
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
class ReviewDataset(Dataset):
|
||||
"""Pytorch Dataset for loading tokenized reviews
|
||||
|
||||
Dataset is used for map-style datasets like this one, instead of IterableDataset (which is better suited to data streams).
|
||||
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
|
||||
input tensors and integer labels for all 4 tasks.
|
||||
|
||||
Args:
|
||||
path (str): Path to the csv file containing the reviews and labels.
|
||||
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
|
||||
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
|
||||
"""
|
||||
|
||||
def __init__(self, path, tokenizer, max_length=256):
|
||||
self.df = pd.read_csv(path)
|
||||
self.tokenizer = tokenizer
|
||||
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
|
||||
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
|
||||
# Both have shape [1, max_length] because of return_tensors='pt'
|
||||
# Squeeze them to [max_length] with .squeeze(0)
|
||||
encoding = self.tokenizer(
|
||||
review,
|
||||
max_length=self.max_length,
|
||||
padding='max_length',
|
||||
truncation=True,
|
||||
return_tensors='pt'
|
||||
)
|
||||
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||
|
||||
# Returns a dictionary with:
|
||||
# 'input_ids': tensor of shape [max_length]
|
||||
|
||||
30
src/model.py
30
src/model.py
@@ -11,7 +11,17 @@ import torch.nn as nn
|
||||
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
|
||||
# Each hidden representation is size 768
|
||||
|
||||
class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
|
||||
class SingleTaskModel(nn.Module):
|
||||
"""Single task model to compare MTL approach to review classification
|
||||
|
||||
Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
|
||||
just different args
|
||||
|
||||
Args:
|
||||
task_name: which of the 4 tasks are we training for
|
||||
num_classes: number of output classes for the task
|
||||
dropout_rate: dropout probability applied to the cls representation; randomly zeroes features (not tokens) to reduce overfitting
|
||||
"""
|
||||
def __init__(self, task_name, num_classes, dropout_rate=0.2):
|
||||
super().__init__()
|
||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
|
||||
logits = self.head(output)
|
||||
return {self.task_name: logits}
|
||||
|
||||
class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE
|
||||
def __init__(self, dropout_rate=0.2): # Try other p values
|
||||
class Model(nn.Module):
|
||||
""" Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
|
||||
|
||||
Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through
|
||||
shared dropout then into four linear classification heads. Shared training optimises all tasks simultaneously,
|
||||
allowing the encoder to learn from the shared representations / generalisations
|
||||
|
||||
Args:
|
||||
dropout_rate: dropout probability applied to prevent co-adaptation of neurons across heads; 0.2 is the default used here
|
||||
"""
|
||||
def __init__(self, dropout_rate=0.2):
|
||||
super().__init__()
|
||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||
|
||||
hidden_size = self.encoder.config.hidden_size
|
||||
|
||||
# Applied across whole output, shared
|
||||
# Applied across shared cls token, before all task heads
|
||||
self.dropout = nn.Dropout(dropout_rate)
|
||||
|
||||
self.bug_head = nn.Linear(hidden_size, 2)
|
||||
@@ -39,10 +58,11 @@ class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE
|
||||
self.aspect_head = nn.Linear(hidden_size, 6)
|
||||
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
|
||||
|
||||
# Pass through encoder then extract the token representation
|
||||
# Pass through encoder then extract the cls token representation, shape [batch_size, 768]
|
||||
# Apply dropout to it, take scores for each head, return them in a dictionary
|
||||
def forward(self, input_ids, attention_mask):
|
||||
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
||||
# take sequence position 0 (the cls token): [batch_size, seq_len, 768] -> [batch_size, 768]
|
||||
output = outputs.last_hidden_state[:, 0, :]
|
||||
|
||||
output = self.dropout(output)
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
# preprocess.py
|
||||
|
||||
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
|
||||
|
||||
import pandas as pd
|
||||
import re
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
def clean_text(text):
|
||||
def clean_text(text) -> str:
|
||||
"""Clean review text by removing URLS, emails, excessive whitespace
|
||||
|
||||
Input:
|
||||
@@ -19,31 +20,22 @@ def clean_text(text):
|
||||
# Convert to lower for uniformity
|
||||
text = str(text).lower()
|
||||
|
||||
# Remove URLs using regex
|
||||
# Remove URLs using regex: match "http" or (|) "www" followed by one or more (+) non-whitespace characters (\S)
|
||||
text = re.sub(r'http\S+|www\S+', '', text)
|
||||
|
||||
# Remove emails
|
||||
# Remove emails: one or more (+) non-whitespace characters (\S) on each side of "@", replaced with '' in each text (review)
|
||||
text = re.sub(r'\S+@\S+', '', text)
|
||||
|
||||
# Normalize punctuation
|
||||
# Normalize punctuation: a run of two or more ({2,}) literal dots (\.), "!" or "?" is replaced with a single one
|
||||
text = re.sub(r'\.{2,}', '.', text)
|
||||
text = re.sub(r'!{2,}', '!', text)
|
||||
text = re.sub(r'\?{2,}', '?', text)
|
||||
|
||||
# Remove excessive whitespace by replacing with single whitespace where there is trailing spaces
|
||||
# Collapse each run of whitespace (\s+) into a single space, then strip leading and trailing whitespace
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return text
|
||||
|
||||
def detect_language(text) -> str:
    """Detect the language of a piece of text.

    Input:
        text: the review text; missing values (None / NaN) are accepted.

    Output:
        The language code returned by langdetect's ``detect``, or
        ``'unknown'`` when the text is missing, shorter than 10 characters
        after stripping, or langdetect raises ``LangDetectException``.
    """
    # Missing or very short text does not carry enough signal for a guess
    if pd.isna(text) or len(str(text).strip()) < 10:
        return 'unknown'

    # langdetect samples randomly, so the same text can yield different
    # languages between runs unless the factory seed is fixed.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(str(text))
    except LangDetectException:
        return 'unknown'
|
||||
|
||||
def preprocess_uber_reviews(input_path, output_path):
|
||||
"""
|
||||
preprocess_uber_reviews by loading, cleaning, and filtering the data.
|
||||
|
||||
@@ -1,21 +1,16 @@
|
||||
# TODO: Add verification comparison between ratings
|
||||
# TODO: Clean up the logging print statements
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
print(pd.__version__)
|
||||
print(np.__version__)
|
||||
|
||||
path = "multitag/data/uber_reviews_cleaned.csv"
|
||||
sampled_path = "multitag/data/uber_reviews_sampled.csv"
|
||||
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
|
||||
path = "data/raw/uber_reviews_cleaned.csv"
|
||||
sampled_path = "data/raw/uber_reviews_sampled.csv"
|
||||
original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
|
||||
class Sampler:
|
||||
def __init__(self, data_path, target_samples):
|
||||
|
||||
self.data_path = data_path
|
||||
self.target_samples = 5000 # target number of samples
|
||||
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
|
||||
|
||||
self.original_data = pd.read_csv(original_path, low_memory=False)
|
||||
@@ -39,7 +34,7 @@ class Sampler:
|
||||
print(f"Original Distribution from {original_path}:")
|
||||
print((_origdist*100).round(1),"\n")
|
||||
|
||||
self.data.info()
|
||||
self.data.info(verbose=True)
|
||||
|
||||
# add sampling method here
|
||||
# random sample 5000 entries, stratifying by rating
|
||||
@@ -52,43 +47,53 @@ class Sampler:
|
||||
2 3.9% (41707)
|
||||
Name: proportion, dtype: object
|
||||
"""
|
||||
|
||||
"""
|
||||
IGNORE --- Left in just in case
|
||||
|
||||
Sample size by rating
|
||||
Redundant calculation, kept for clarity
|
||||
Sample randomly
|
||||
Redundant calculation
|
||||
Doesn't factor that the distribution changed greatly after preprocessing
|
||||
|
||||
"""
|
||||
def get_stratified_sample(self) -> pd.DataFrame:
|
||||
stratified_sample = (
|
||||
self.data
|
||||
.reset_index(drop=True)
|
||||
.apply(self.x)
|
||||
.sample(n=self.target_samples, random_state=42)
|
||||
.reset_index(drop=True) # remove messy indexes
|
||||
.apply(self.sample_col) # applies to each column
|
||||
.sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
|
||||
)
|
||||
return stratified_sample
|
||||
|
||||
|
||||
|
||||
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
||||
def x(self, x):
|
||||
n = int(len(x) / self.total * self.target_samples)
|
||||
n = max(n,1)
|
||||
return x.sample(n=n, random_state=42)
|
||||
def sample_col(self, column) -> pd.DataFrame:
|
||||
"""
|
||||
get_proportional_sample()
|
||||
IGNORE --- Left in just in case
|
||||
|
||||
Randomly sample, including conflicting math, I guess I was going to stratify
|
||||
"""
|
||||
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
|
||||
samples_per_column = max(samples_per_column,1) # also pointless
|
||||
return column.sample(n=samples_per_column, random_state=42)
|
||||
|
||||
|
||||
"""
|
||||
original_distribution_sample()
|
||||
The main sampling method for our labelling as it
|
||||
keeps composition of the original uber dataset
|
||||
keeps composition of the original uber dataset, verified in
|
||||
which is a fairer comparison, may also work better in general
|
||||
|
||||
inputs:
|
||||
verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
|
||||
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
|
||||
|
||||
|
||||
manually coded distributions taken from notebooks
|
||||
|
||||
for ratings and actual number of samples
|
||||
rating data is the whole data for a rating as we iterate
|
||||
has error handling if totals doesn't match the required amount of samples per the orig distrib
|
||||
randomise the indexes (samples) and appends to the new dataset
|
||||
|
||||
|
||||
outputs:
|
||||
|
||||
"""
|
||||
def original_distribution_sample(self):
|
||||
@@ -102,8 +107,8 @@ class Sampler:
|
||||
print("Target Distribution =", original_dist)
|
||||
samples = []
|
||||
for rating, num_samples in original_dist.items():
|
||||
rating_data = self.data[self.data[self.stratify_column] == rating]
|
||||
if len(rating_data) < num_samples:
|
||||
rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
|
||||
if len(rating_data) < num_samples: # data is a pd.dataframe of the set
|
||||
print("Missing samples available for rating")
|
||||
num_samples = len(rating_data)
|
||||
sample = rating_data.sample(n = num_samples,random_state=42)
|
||||
@@ -127,9 +132,9 @@ class Sampler:
|
||||
|
||||
def sample_with_keywords(self):
|
||||
#TODO add keywords for feature classification
|
||||
print(f"\n{"="*50}")
|
||||
print(f"\n{'='*50}")
|
||||
print("Keyword influenced / rating stratified set")
|
||||
print(f"\n{"="*50}")
|
||||
print(f"\n{'='*50}")
|
||||
|
||||
bug_keywords = ["crash","freeze", "error",
|
||||
"stop", "doesnt work", "doesn't work","loading",
|
||||
@@ -204,7 +209,7 @@ class Sampler:
|
||||
|
||||
def main():
|
||||
|
||||
sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
|
||||
sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)
|
||||
|
||||
# Choose sampling strategy
|
||||
print(f"\n{'='*50}")
|
||||
@@ -218,19 +223,19 @@ def main():
|
||||
|
||||
if choice == '1':
|
||||
sample = sampler.get_stratified_sample()
|
||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
||||
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||
|
||||
elif choice == '2':
|
||||
sample = sampler.original_distribution_sample()
|
||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
||||
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||
|
||||
elif choice == '3':
|
||||
sample = sampler.sample_with_keywords()
|
||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
||||
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||
|
||||
elif choice == '4':
|
||||
sample = sampler.sample_tiny_size()
|
||||
sampler.save_sample(sample,"multitag/data/uber_review_temp.csv")
|
||||
sampler.save_sample(sample,"data/raw/uber_review_temp.csv")
|
||||
|
||||
|
||||
|
||||
|
||||
32
src/train.py
32
src/train.py
@@ -1,6 +1,6 @@
|
||||
# train.py
|
||||
# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
|
||||
import argparse # argparse for later switching to boosted data
|
||||
# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
|
||||
import argparse
|
||||
import os
|
||||
from datetime import datetime
|
||||
import time
|
||||
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel
|
||||
|
||||
|
||||
# =======================================================================
|
||||
# Multitask implementation
|
||||
# Training script for MTL and STL training configurations
|
||||
# =======================================================================
|
||||
|
||||
# NFR5, reproducibility
|
||||
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
|
||||
np.random.seed(SEED)
|
||||
random.seed(SEED)
|
||||
|
||||
# class weights, training loop and early stopping
|
||||
|
||||
# ------------------- Class weights -------------------
|
||||
# Using weights inversely proportional to class frequencies to avoid majority class bias,
|
||||
# prioritize useful bug reports / feature requests
|
||||
def compute_weights(df, column, device):
    """Compute inverse-frequency class weights for a label column.

    Uses sklearn's 'balanced' mode, so rarer classes receive larger weights
    and contribute more to the loss.

    Args:
        df: dataframe holding the label column.
        column: name of the label column to weight.
        device: torch device the weight tensor is moved to.

    Returns:
        A float tensor with one weight per distinct label found in the
        column, ordered as returned by ``np.unique`` (sorted).
    """
    labels = df[column]
    present_classes = np.unique(labels)
    balanced = compute_class_weight(
        class_weight='balanced',
        classes=present_classes,
        y=labels,
    )
    weight_tensor = torch.tensor(balanced, dtype=torch.float)
    return weight_tensor.to(device)
|
||||
@@ -63,19 +67,17 @@ def main():
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print("Starting training...", flush=True)
|
||||
print("Using device:", device)
|
||||
# Remove randomness
|
||||
# Set cuda seeds for reproducibility
|
||||
if torch.cuda.is_available():
|
||||
print("GPU:", torch.cuda.get_device_name(0))
|
||||
torch.cuda.manual_seed_all(SEED)
|
||||
torch.cuda.manual_seed(SEED)
|
||||
print(f"Using dataset: {args.dataset.upper()}")
|
||||
# Force deterministic for reproducibility at a slight performance cost
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
|
||||
"""
|
||||
Data loading:
|
||||
|
||||
"""
|
||||
# load data
|
||||
train = f"data/processed/{args.dataset}_train.csv"
|
||||
val = f"data/processed/{args.dataset}_val.csv"
|
||||
os.makedirs("outputs", exist_ok=True)
|
||||
@@ -117,19 +119,13 @@ def main():
|
||||
feature_weights = compute_weights(train_df, 'feature_request', device)
|
||||
aspect_weights = compute_weights(train_df, 'aspect', device)
|
||||
aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
|
||||
# Move tensors to cpu and convert to numpy for usage with sklearn classification report
|
||||
# Use detach() later for predictions
|
||||
|
||||
print("Bug report class weights:", bug_weights.cpu().numpy())
|
||||
print("Feature request class weights:", feature_weights.cpu().numpy())
|
||||
print("Aspect class weights:", aspect_weights.cpu().numpy())
|
||||
print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
|
||||
|
||||
# for later
|
||||
# 1.0 * bug_loss +
|
||||
# 1.0 * feature_loss +
|
||||
# 0.5 * aspect_loss +
|
||||
# 0.5 * sentiment_loss
|
||||
# FR4-FR7: Task specific loss functions
|
||||
# equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
|
||||
criterions = {
|
||||
'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
|
||||
'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
|
||||
@@ -140,7 +136,7 @@ def main():
|
||||
# -------------------- Optimizer and scheduler -------------------
|
||||
optimizer = torch.optim.AdamW(
|
||||
model.parameters(),
|
||||
lr=args.lr, # change
|
||||
lr=args.lr,
|
||||
weight_decay=0.01
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user