From cccd91a68098954c18158f463cf9b2bdfb4e563c Mon Sep 17 00:00:00 2001
From: charlie-rasberry <charlie.rasberry@outlook.com>
Date: Fri, 20 Feb 2026 18:18:17 +0000
Subject: [PATCH] Small bit of progress towards model.py, now building
 forward()

---
 src/dataset.py |  5 +++--
 src/model.py   | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/dataset.py b/src/dataset.py
index d3245f1..2be5173 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -18,8 +18,8 @@ class ReviewDataset(Dataset):
     def __getitem__(self, idx):
         review = self.df.iloc[idx]['review']
 
-        # encoding['input_ids'] 
-        # encoding['attention_mask']
+        # encoding['input_ids']: 1D tensor of token ids, shape [max_length] once squeezed
+        # encoding['attention_mask']: 1D tensor of 1s and 0s marking real tokens vs padding, shape [max_length] once squeezed
         # Both have shape [1, max_length] because of return_tensors='pt'
         # Squeeze them to [max_length] with .squeeze(0)
         encoding = self.tokenizer(
@@ -35,6 +35,7 @@ class ReviewDataset(Dataset):
         
         #   'attention_mask': tensor of shape [max_length]
 
+        # Multi-task learning (MTL) labels, each stored as a scalar tensor:
         #   'bug_report': tensor scalar (torch.tensor(label_value))
         #   'feature_request': tensor scalar (torch.tensor(label_value))
         #   'aspect': tensor scalar (torch.tensor(label_value))
diff --git a/src/model.py b/src/model.py
index e69de29..1969b3d 100644
--- a/src/model.py
+++ b/src/model.py
@@ -0,0 +1,36 @@
+# model.py
+# One shared encoder, four task-specific heads (bug report, feature request, aspect, aspect sentiment)
+# 12 transformer layers, 12 attention heads
+
+from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
+import torch.nn as nn
+
+# Dropout is used because it has proven to be an effective regularization technique
+# that prevents the co-adaptation of neurons, as described in https://arxiv.org/abs/1207.0580
+
+# Each nn.Linear maps XLM-RoBERTa's hidden representation onto the output space of its task head
+# Each hidden representation is size 768
+class Model(nn.Module):
+    def __init__(self, dropout_rate=0.2): # Try other p values
+        super().__init__()
+        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
+
+        hidden_size = self.encoder.config.hidden_size
+
+        # Applied across whole output, shared
+        self.dropout = nn.Dropout(dropout_rate)
+
+        self.bug_head = nn.Linear(hidden_size, 2)
+        self.feature_head = nn.Linear(hidden_size, 2)
+        self.aspect_head = nn.Linear(hidden_size, 6)
+        self.aspect_sentiment_head = nn.Linear(hidden_size, 3)
+    def forward(self, input_ids, attention_mask):
+        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+
+
+
+
+
+
+tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
+model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")
\ No newline at end of file