Added some comments and readability
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
# dataset.py
|
# dataset.py
|
||||||
|
# tokenize data using (sentencepiece) XLM-RoBERTa
|
||||||
# Takes a row from the csv, tokenizes the review and returns a tensor
|
# Takes a row from the csv, tokenizes the review and returns a tensor
|
||||||
import torch
|
import torch
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -7,6 +7,18 @@ from torch.utils.data import Dataset
|
|||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
class ReviewDataset(Dataset):
|
class ReviewDataset(Dataset):
|
||||||
|
"""Pytorch Dataset for loading tokenized reviews
|
||||||
|
|
||||||
|
Dataset is for map style datasets like here, instead of using IterableDataset (better for data streams).
|
||||||
|
Expects a csv and tokenizes reviews using XLM-RoBERTa, returning a dictionary of
|
||||||
|
input tensors and integer labels for all 4 tasks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (str): Path to the csv file containing the reviews and labels.
|
||||||
|
tokenizer (transformers.PreTrainedTokenizer): Tokenizer to use for encoding the reviews.
|
||||||
|
max_length (int, optional): Maximum length for tokenized sequences. Defaults to 256. 128 would have dropped about half of minority classes
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, path, tokenizer, max_length=256):
|
def __init__(self, path, tokenizer, max_length=256):
|
||||||
self.df = pd.read_csv(path)
|
self.df = pd.read_csv(path)
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
@@ -22,13 +34,7 @@ class ReviewDataset(Dataset):
|
|||||||
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
|
# encoding['attention_mask'] 1D tensor of 1s 0s showing real tokens vs padding, shape [max_length]
|
||||||
# Both have shape [1, max_length] because of return_tensors='pt'
|
# Both have shape [1, max_length] because of return_tensors='pt'
|
||||||
# Squeeze them to [max_length] with .squeeze(0)
|
# Squeeze them to [max_length] with .squeeze(0)
|
||||||
encoding = self.tokenizer(
|
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||||
review,
|
|
||||||
max_length=self.max_length,
|
|
||||||
padding='max_length',
|
|
||||||
truncation=True,
|
|
||||||
return_tensors='pt'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Returns a dictionary with:
|
# Returns a dictionary with:
|
||||||
# 'input_ids': tensor of shape [max_length]
|
# 'input_ids': tensor of shape [max_length]
|
||||||
|
|||||||
30
src/model.py
30
src/model.py
@@ -11,7 +11,17 @@ import torch.nn as nn
|
|||||||
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
|
# Each nn.linear is used to map RoBERTa's hidden representation onto the output space of each task head
|
||||||
# Each hidden representation is size 768
|
# Each hidden representation is size 768
|
||||||
|
|
||||||
class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTURE
|
class SingleTaskModel(nn.Module):
|
||||||
|
"""Single task model to compare MTL approach to review classification
|
||||||
|
|
||||||
|
Same XLM-RoBERTa only with one head, returns same dictionary format so training loop is the same
|
||||||
|
just different args
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task_name: which of the 4 tasks are we training for
|
||||||
|
num_classes: number of output classes for the task
|
||||||
|
dropout_rate: dropout probability applied to the cls representation, randomly zeroes its elements for regularisation
|
||||||
|
"""
|
||||||
def __init__(self, task_name, num_classes, dropout_rate=0.2):
|
def __init__(self, task_name, num_classes, dropout_rate=0.2):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
@@ -24,14 +34,23 @@ class SingleTaskModel(nn.Module): # TASK-SPECIFIC/SINGLE-TASK MODEL ARCHITECTU
|
|||||||
logits = self.head(output)
|
logits = self.head(output)
|
||||||
return {self.task_name: logits}
|
return {self.task_name: logits}
|
||||||
|
|
||||||
class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE
|
class Model(nn.Module):
|
||||||
def __init__(self, dropout_rate=0.2): # Try other p values
|
""" Multitask model with shared encoder (XLM-RoBERTa) and four task specific heads
|
||||||
|
|
||||||
|
Architecture: XLM-RoBERTa base (12 layers 768 hidden size), cls token representation is processed through
|
||||||
|
shared dropout then into four linear classification heads. Shared training optimises all tasks simultaneously,
|
||||||
|
allowing the encoder to learn from the shared representations / generalisations
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dropout_rate: dropout probability, applied to prevent co-adaptation of neurons across heads; 0.2 is a standard default
|
||||||
|
"""
|
||||||
|
def __init__(self, dropout_rate=0.2):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
|
|
||||||
hidden_size = self.encoder.config.hidden_size
|
hidden_size = self.encoder.config.hidden_size
|
||||||
|
|
||||||
# Applied across whole output, shared
|
# Applied across shared cls token, before all task heads
|
||||||
self.dropout = nn.Dropout(dropout_rate)
|
self.dropout = nn.Dropout(dropout_rate)
|
||||||
|
|
||||||
self.bug_head = nn.Linear(hidden_size, 2)
|
self.bug_head = nn.Linear(hidden_size, 2)
|
||||||
@@ -39,10 +58,11 @@ class Model(nn.Module): # MULTITASK MODEL ARCHITECTURE
|
|||||||
self.aspect_head = nn.Linear(hidden_size, 6)
|
self.aspect_head = nn.Linear(hidden_size, 6)
|
||||||
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
|
self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
|
||||||
|
|
||||||
# Pass through encoder then extract the token representation
|
# Pass through encoder then extract the cls token representation, shape [batch_size, 768]
|
||||||
# Apply droupout to it, take scores for each head, return them in a dictionary
|
# Apply droupout to it, take scores for each head, return them in a dictionary
|
||||||
def forward(self, input_ids, attention_mask):
|
def forward(self, input_ids, attention_mask):
|
||||||
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
||||||
|
# take index 0 (cls token) along the sequence dimension -> [batch_size, 768]
|
||||||
output = outputs.last_hidden_state[:, 0, :]
|
output = outputs.last_hidden_state[:, 0, :]
|
||||||
|
|
||||||
output = self.dropout(output)
|
output = self.dropout(output)
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
# preprocess.py
|
# preprocess.py
|
||||||
|
|
||||||
|
# langdetect was experimented with but wasn't consistent enough to be a better choice than translating manually
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
from langdetect import detect, LangDetectException
|
|
||||||
|
|
||||||
def clean_text(text):
|
def clean_text(text) -> str:
|
||||||
"""Clean review text by removing URLS, emails, excessive whitespace
|
"""Clean review text by removing URLS, emails, excessive whitespace
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
@@ -19,31 +20,22 @@ def clean_text(text):
|
|||||||
# Convert to lower for uniformity
|
# Convert to lower for uniformity
|
||||||
text = str(text).lower()
|
text = str(text).lower()
|
||||||
|
|
||||||
# Remove URLs using regex
|
# Remove URLs using regex: match 'http' followed by one or more (+) non-whitespace chars (\S), or (|) the same for 'www'
|
||||||
text = re.sub(r'http\S+|www\S+', '', text)
|
text = re.sub(r'http\S+|www\S+', '', text)
|
||||||
|
|
||||||
# Remove emails
|
# Remove emails: one or more (+) non-whitespace chars (\S) before "@" and one or more after it, replaced with ''
|
||||||
text = re.sub(r'\S+@\S+', '', text)
|
text = re.sub(r'\S+@\S+', '', text)
|
||||||
|
|
||||||
# Normalize punctuation
|
# Normalize punctuation: a literal dot (\.), '!' or '?' repeated at least 2 times ({2,}) is replaced with a single one
|
||||||
text = re.sub(r'\.{2,}', '.', text)
|
text = re.sub(r'\.{2,}', '.', text)
|
||||||
text = re.sub(r'!{2,}', '!', text)
|
text = re.sub(r'!{2,}', '!', text)
|
||||||
text = re.sub(r'\?{2,}', '?', text)
|
text = re.sub(r'\?{2,}', '?', text)
|
||||||
|
|
||||||
# Remove excessive whitespace by replacing with single whitespace where there is trailing spaces
|
# Collapse each run of whitespace (\s+) into a single space, then strip leading/trailing spaces
|
||||||
text = re.sub(r'\s+', ' ', text).strip()
|
text = re.sub(r'\s+', ' ', text).strip()
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def detect_language(text):
|
|
||||||
"""Detect language of text"""
|
|
||||||
try:
|
|
||||||
if pd.isna(text) or len(str(text).strip()) < 10:
|
|
||||||
return 'unknown'
|
|
||||||
return detect(str(text))
|
|
||||||
except LangDetectException:
|
|
||||||
return 'unknown'
|
|
||||||
|
|
||||||
def preprocess_uber_reviews(input_path, output_path):
|
def preprocess_uber_reviews(input_path, output_path):
|
||||||
"""
|
"""
|
||||||
preprocess_uber_reviews by loading, cleaning, and filtering the data.
|
preprocess_uber_reviews by loading, cleaning, and filtering the data.
|
||||||
|
|||||||
@@ -1,21 +1,16 @@
|
|||||||
# TODO: Add verification comparison between ratings
|
|
||||||
# TODO: Clean up the logging print statements
|
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
print(pd.__version__)
|
print(pd.__version__)
|
||||||
print(np.__version__)
|
print(np.__version__)
|
||||||
|
|
||||||
path = "multitag/data/uber_reviews_cleaned.csv"
|
path = "data/raw/uber_reviews_cleaned.csv"
|
||||||
sampled_path = "multitag/data/uber_reviews_sampled.csv"
|
sampled_path = "data/raw/uber_reviews_sampled.csv"
|
||||||
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
|
original_path = "data/raw/uber_reviews.csv" ### only for distribution comparison
|
||||||
class Sampler:
|
class Sampler:
|
||||||
def __init__(self, data_path, target_samples):
|
def __init__(self, data_path, target_samples):
|
||||||
|
|
||||||
self.data_path = data_path
|
self.data_path = data_path
|
||||||
self.target_samples = 5000 # target number of samples
|
|
||||||
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
|
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
|
||||||
|
|
||||||
self.original_data = pd.read_csv(original_path, low_memory=False)
|
self.original_data = pd.read_csv(original_path, low_memory=False)
|
||||||
@@ -39,7 +34,7 @@ class Sampler:
|
|||||||
print(f"Original Distribution from {original_path}:")
|
print(f"Original Distribution from {original_path}:")
|
||||||
print((_origdist*100).round(1),"\n")
|
print((_origdist*100).round(1),"\n")
|
||||||
|
|
||||||
self.data.info()
|
self.data.info(verbose=True)
|
||||||
|
|
||||||
# add sampling method here
|
# add sampling method here
|
||||||
# random sample 5000 entries with stratifiying by rating
|
# random sample 5000 entries with stratifiying by rating
|
||||||
@@ -52,43 +47,53 @@ class Sampler:
|
|||||||
2 3.9% (41707)
|
2 3.9% (41707)
|
||||||
Name: proportion, dtype: object
|
Name: proportion, dtype: object
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
IGNORE --- Left in just in case
|
||||||
|
|
||||||
Sample size by rating
|
Sample randomly
|
||||||
Redundant calculation, kept for clarity
|
Redundant calculation
|
||||||
Doesn't factor that the distribution changed greatly after preprocessing
|
Doesn't factor that the distribution changed greatly after preprocessing
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def get_stratified_sample(self) -> pd.DataFrame:
|
def get_stratified_sample(self) -> pd.DataFrame:
|
||||||
stratified_sample = (
|
stratified_sample = (
|
||||||
self.data
|
self.data
|
||||||
.reset_index(drop=True)
|
.reset_index(drop=True) # remove messy indexes
|
||||||
.apply(self.x)
|
.apply(self.sample_col) # applies to each column
|
||||||
.sample(n=self.target_samples, random_state=42)
|
.sample(n=self.target_samples, random_state=42) # 42 on sampler 4321 on any other file
|
||||||
)
|
)
|
||||||
return stratified_sample
|
return stratified_sample
|
||||||
|
|
||||||
|
|
||||||
|
def sample_col(self, column) -> pd.DataFrame:
|
||||||
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
|
||||||
def x(self, x):
|
|
||||||
n = int(len(x) / self.total * self.target_samples)
|
|
||||||
n = max(n,1)
|
|
||||||
return x.sample(n=n, random_state=42)
|
|
||||||
"""
|
"""
|
||||||
get_proportional_sample()
|
IGNORE --- Left in just in case
|
||||||
|
|
||||||
|
Randomly sample, including conflicting math, I guess I was going to stratify
|
||||||
"""
|
"""
|
||||||
|
samples_per_column = int(len(column) / self.total * self.target_samples) # pointless 1 *5000
|
||||||
|
samples_per_column = max(samples_per_column,1) # also pointless
|
||||||
|
return column.sample(n=samples_per_column, random_state=42)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
original_distribution_sample()
|
original_distribution_sample()
|
||||||
The main sampling method for our labelling as it
|
The main sampling method for our labelling as it
|
||||||
keeps composition of the original uber dataset
|
keeps composition of the original uber dataset (verification notes below),
|
||||||
which is a fairer comparison, may also work better in general
|
which is a fairer comparison, may also work better in general
|
||||||
|
|
||||||
inputs:
|
verified post preprocessing in rating_distribution.ipynb and verify_tagged_distributions.ipynb
|
||||||
|
and raw data distribution verified at the bottom of verify_tagged_distributions.ipynb
|
||||||
|
|
||||||
|
|
||||||
|
manually coded distributions taken from notebooks
|
||||||
|
|
||||||
|
for ratings and actual number of samples
|
||||||
|
rating data is the whole data for a rating as we iterate
|
||||||
|
has error handling if totals doesn't match the required amount of samples per the orig distrib
|
||||||
|
randomise the indexes (samples) and appends to the new dataset
|
||||||
|
|
||||||
|
|
||||||
outputs:
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def original_distribution_sample(self):
|
def original_distribution_sample(self):
|
||||||
@@ -102,8 +107,8 @@ class Sampler:
|
|||||||
print("Target Distribution =", original_dist)
|
print("Target Distribution =", original_dist)
|
||||||
samples = []
|
samples = []
|
||||||
for rating, num_samples in original_dist.items():
|
for rating, num_samples in original_dist.items():
|
||||||
rating_data = self.data[self.data[self.stratify_column] == rating]
|
rating_data = self.data[self.data[self.stratify_column] == rating] # stratify_column = "rating"
|
||||||
if len(rating_data) < num_samples:
|
if len(rating_data) < num_samples: # data is a pd.dataframe of the set
|
||||||
print("Missing samples available for rating")
|
print("Missing samples available for rating")
|
||||||
num_samples = len(rating_data)
|
num_samples = len(rating_data)
|
||||||
sample = rating_data.sample(n = num_samples,random_state=42)
|
sample = rating_data.sample(n = num_samples,random_state=42)
|
||||||
@@ -127,9 +132,9 @@ class Sampler:
|
|||||||
|
|
||||||
def sample_with_keywords(self):
|
def sample_with_keywords(self):
|
||||||
#TODO add keywords for feature classification
|
#TODO add keywords for feature classification
|
||||||
print(f"\n{"="*50}")
|
print(f"\n{'='*50}")
|
||||||
print("Keyword influenced / rating stratified set")
|
print("Keyword influenced / rating stratified set")
|
||||||
print(f"\n{"="*50}")
|
print(f"\n{'='*50}")
|
||||||
|
|
||||||
bug_keywords = ["crash","freeze", "error",
|
bug_keywords = ["crash","freeze", "error",
|
||||||
"stop", "doesnt work", "doesn't work","loading",
|
"stop", "doesnt work", "doesn't work","loading",
|
||||||
@@ -204,7 +209,7 @@ class Sampler:
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
|
sampler = Sampler("data/raw/uber_reviews_cleaned.csv", target_samples=5000)
|
||||||
|
|
||||||
# Choose sampling strategy
|
# Choose sampling strategy
|
||||||
print(f"\n{'='*50}")
|
print(f"\n{'='*50}")
|
||||||
@@ -218,19 +223,19 @@ def main():
|
|||||||
|
|
||||||
if choice == '1':
|
if choice == '1':
|
||||||
sample = sampler.get_stratified_sample()
|
sample = sampler.get_stratified_sample()
|
||||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||||
|
|
||||||
elif choice == '2':
|
elif choice == '2':
|
||||||
sample = sampler.original_distribution_sample()
|
sample = sampler.original_distribution_sample()
|
||||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||||
|
|
||||||
elif choice == '3':
|
elif choice == '3':
|
||||||
sample = sampler.sample_with_keywords()
|
sample = sampler.sample_with_keywords()
|
||||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
sampler.save_sample(sample, "data/raw/uber_reviews_sampled.csv")
|
||||||
|
|
||||||
elif choice == '4':
|
elif choice == '4':
|
||||||
sample = sampler.sample_tiny_size()
|
sample = sampler.sample_tiny_size()
|
||||||
sampler.save_sample(sample,"multitag/data/uber_review_temp.csv")
|
sampler.save_sample(sample,"data/raw/uber_review_temp.csv")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
32
src/train.py
32
src/train.py
@@ -1,6 +1,6 @@
|
|||||||
# train.py
|
# train.py
|
||||||
# some code directly from pytorch docs https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
|
# structure adapted from Pytorch introductory tutorials https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html
|
||||||
import argparse # argparse for later switching to boosted data
|
import argparse
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import time
|
import time
|
||||||
@@ -25,7 +25,7 @@ from model import Model, SingleTaskModel
|
|||||||
|
|
||||||
|
|
||||||
# =======================================================================
|
# =======================================================================
|
||||||
# Multitask implementation
|
# Training script for MTL and STL training configurations
|
||||||
# =======================================================================
|
# =======================================================================
|
||||||
|
|
||||||
# NFR5, reproducibility
|
# NFR5, reproducibility
|
||||||
@@ -34,12 +34,16 @@ torch.manual_seed(SEED)
|
|||||||
np.random.seed(SEED)
|
np.random.seed(SEED)
|
||||||
random.seed(SEED)
|
random.seed(SEED)
|
||||||
|
|
||||||
# class weights, training loop and early stopping
|
|
||||||
|
|
||||||
# ------------------- Class weights -------------------
|
# ------------------- Class weights -------------------
|
||||||
# Using weights inversely proportional to class frequencies to avoid majority class bias,
|
# Using weights inversely proportional to class frequencies to avoid majority class bias,
|
||||||
# prioritize useful bug reports / feature requests
|
# prioritize useful bug reports / feature requests
|
||||||
def compute_weights(df, column, device):
|
def compute_weights(df, column, device):
|
||||||
|
"""Computes inverse frequency class weights for a label column
|
||||||
|
|
||||||
|
Uses sklearns balanced mode
|
||||||
|
Rare classes receive higher weights, so errors on them are penalised more and the model learns more from fewer examples
|
||||||
|
"""
|
||||||
classes = np.unique(df[column])
|
classes = np.unique(df[column])
|
||||||
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
|
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df[column])
|
||||||
return torch.tensor(weights, dtype=torch.float).to(device)
|
return torch.tensor(weights, dtype=torch.float).to(device)
|
||||||
@@ -63,19 +67,17 @@ def main():
|
|||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
print("Starting training...", flush=True)
|
print("Starting training...", flush=True)
|
||||||
print("Using device:", device)
|
print("Using device:", device)
|
||||||
# Remove randomness
|
# Set cuda seeds for reproducibility
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
print("GPU:", torch.cuda.get_device_name(0))
|
print("GPU:", torch.cuda.get_device_name(0))
|
||||||
torch.cuda.manual_seed_all(SEED)
|
torch.cuda.manual_seed_all(SEED)
|
||||||
torch.cuda.manual_seed(SEED)
|
torch.cuda.manual_seed(SEED)
|
||||||
print(f"Using dataset: {args.dataset.upper()}")
|
print(f"Using dataset: {args.dataset.upper()}")
|
||||||
|
# Force deterministic for reproducibility at a slight performance cost
|
||||||
torch.backends.cudnn.deterministic = True
|
torch.backends.cudnn.deterministic = True
|
||||||
torch.backends.cudnn.benchmark = False
|
torch.backends.cudnn.benchmark = False
|
||||||
|
|
||||||
"""
|
# load data
|
||||||
Data loading:
|
|
||||||
|
|
||||||
"""
|
|
||||||
train = f"data/processed/{args.dataset}_train.csv"
|
train = f"data/processed/{args.dataset}_train.csv"
|
||||||
val = f"data/processed/{args.dataset}_val.csv"
|
val = f"data/processed/{args.dataset}_val.csv"
|
||||||
os.makedirs("outputs", exist_ok=True)
|
os.makedirs("outputs", exist_ok=True)
|
||||||
@@ -117,19 +119,13 @@ def main():
|
|||||||
feature_weights = compute_weights(train_df, 'feature_request', device)
|
feature_weights = compute_weights(train_df, 'feature_request', device)
|
||||||
aspect_weights = compute_weights(train_df, 'aspect', device)
|
aspect_weights = compute_weights(train_df, 'aspect', device)
|
||||||
aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
|
aspect_sentiment_weights = compute_weights(train_df, 'aspect_sentiment', device)
|
||||||
# Move tensors to cpu and conver to numpy for usage with sklearn classification report
|
|
||||||
# Use detatch() later for predictions
|
|
||||||
print("Bug report class weights:", bug_weights.cpu().numpy())
|
print("Bug report class weights:", bug_weights.cpu().numpy())
|
||||||
print("Feature request class weights:", feature_weights.cpu().numpy())
|
print("Feature request class weights:", feature_weights.cpu().numpy())
|
||||||
print("Aspect class weights:", aspect_weights.cpu().numpy())
|
print("Aspect class weights:", aspect_weights.cpu().numpy())
|
||||||
print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
|
print("Aspect sentiment class weights:", aspect_sentiment_weights.cpu().numpy())
|
||||||
|
|
||||||
# for later
|
# equal weighted task losses. unequal was considered but equal weights performed well without adding complexity
|
||||||
# 1.0 * bug_loss +
|
|
||||||
# 1.0 * feature_loss +
|
|
||||||
# 0.5 * aspect_loss +
|
|
||||||
# 0.5 * sentiment_loss
|
|
||||||
# FR4-FR7: Task specific loss functions
|
|
||||||
criterions = {
|
criterions = {
|
||||||
'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
|
'bug_report': nn.CrossEntropyLoss(weight=bug_weights),
|
||||||
'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
|
'feature_request': nn.CrossEntropyLoss(weight=feature_weights),
|
||||||
@@ -140,7 +136,7 @@ def main():
|
|||||||
# -------------------- Optimizer and scheduler -------------------
|
# -------------------- Optimizer and scheduler -------------------
|
||||||
optimizer = torch.optim.AdamW(
|
optimizer = torch.optim.AdamW(
|
||||||
model.parameters(),
|
model.parameters(),
|
||||||
lr=args.lr, # change
|
lr=args.lr,
|
||||||
weight_decay=0.01
|
weight_decay=0.01
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user