From cccd91a68098954c18158f463cf9b2bdfb4e563c Mon Sep 17 00:00:00 2001 From: charlie-rasberry Date: Fri, 20 Feb 2026 18:18:17 +0000 Subject: [PATCH] Small bit of progress towards model.py, now building forward() --- src/dataset.py | 5 +++-- src/model.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/dataset.py b/src/dataset.py index d3245f1..2be5173 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -18,8 +18,8 @@ class ReviewDataset(Dataset): def __getitem__(self, idx): review = self.df.iloc[idx]['review'] - # encoding['input_ids'] - # encoding['attention_mask'] + # encoding['input_ids'] 1D tensor of token ids, shape [max_length] after .squeeze(0) + # encoding['attention_mask'] 1D tensor of 1s and 0s marking real tokens vs padding, shape [max_length] after .squeeze(0) # Both have shape [1, max_length] because of return_tensors='pt' # Squeeze them to [max_length] with .squeeze(0) encoding = self.tokenizer( @@ -35,6 +35,7 @@ class ReviewDataset(Dataset): # 'attention_mask': tensor of shape [max_length] + # MTL structure labels as tensor scalars: # 'bug_report': tensor scalar (torch.tensor(label_value)) # 'feature_request': tensor scalar (torch.tensor(label_value)) # 'aspect': tensor scalar (torch.tensor(label_value)) diff --git a/src/model.py b/src/model.py index e69de29..1969b3d 100644 --- a/src/model.py +++ b/src/model.py @@ -0,0 +1,36 @@ +# model.py +# One shared encoder, four task-specific heads (bug report, feature request, aspect, aspect sentiment) +# 12 transformer layers, 12 attention heads + +from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel +import torch.nn as nn + +# Dropout is used because it has proven to be an effective technique +# for regularization and for preventing the co-adaptation of neurons, as described in https://arxiv.org/abs/1207.0580 + +# Each nn.Linear maps XLM-RoBERTa's hidden representation onto the output space of its task head +# Each hidden representation is size 768 +class Model(nn.Module): + def 
__init__(self, dropout_rate=0.2): # Try other p values + super().__init__() + self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base") + + hidden_size = self.encoder.config.hidden_size + + # Applied across whole output, shared + self.dropout = nn.Dropout(dropout_rate) + + self.bug_head = nn.Linear(hidden_size, 2) + self.feature_head = nn.Linear(hidden_size, 2) + self.aspect_head = nn.Linear(hidden_size, 6) + self.aspect_sentiment_head = nn.Linear(hidden_size, 3) + def forward(self, input_ids, attention_mask): + outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) + + + + + + +tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") +model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base") \ No newline at end of file