# model.py
# One shared encoder, four task-specific heads (bug report, feature request, aspect, aspect sentiment)
# 12 transformer layers, 12 attention heads

from transformers import AutoTokenizer, AutoModelForMaskedLM, XLMRobertaModel
import torch.nn as nn

# Dropout is used for regularization: it has proven to be an effective technique
# for preventing the co-adaptation of neurons, as described in https://arxiv.org/abs/1207.0580

# Each nn.Linear maps XLM-RoBERTa's hidden representation onto the output space of one task head
# Each hidden representation has size 768

class SingleTaskModel(nn.Module):  # SINGLE TASK MODEL ARCHITECTURE
    """XLM-RoBERTa encoder followed by a single linear classification head.

    Args:
        task_name: Key under which ``forward`` returns this task's logits.
        num_classes: Output width of the linear head.
        dropout_rate: Dropout probability applied to the first-position
            hidden state before it reaches the head.
    """

    def __init__(self, task_name, num_classes, dropout_rate=0.2):
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
        # Spelled `dropout` (was `droput`) so it matches the attribute name
        # used by `Model`. nn.Dropout holds no parameters or buffers, so the
        # state_dict keys of saved checkpoints are unaffected by the rename.
        self.dropout = nn.Dropout(dropout_rate)
        self.head = nn.Linear(self.encoder.config.hidden_size, num_classes)
        self.task_name = task_name

    def forward(self, input_ids, attention_mask):
        """Encode the batch and classify it with this task's head.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            A one-entry dict ``{task_name: logits}`` where ``logits`` is the
            head's output for the first-position hidden state of each sample.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state serves as the sequence summary.
        pooled = self.dropout(outputs.last_hidden_state[:, 0, :])
        logits = self.head(pooled)
        return {self.task_name: logits}

class Model(nn.Module):  # MULTITASK MODEL ARCHITECTURE
    """One XLM-RoBERTa encoder feeding four linear classification heads.

    Head output widths: bug report (2), feature request (2), aspect (6),
    aspect sentiment (3).

    Args:
        dropout_rate: Dropout probability applied once to the encoder's
            first-position hidden state; the result is shared by all heads.
    """

    def __init__(self, dropout_rate=0.2):  # Try other p values
        super().__init__()
        self.encoder = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
        width = self.encoder.config.hidden_size

        # A single dropout layer; its output is reused by every head.
        self.dropout = nn.Dropout(dropout_rate)

        # One linear projection per task, from encoder width to class count.
        self.bug_head = nn.Linear(width, 2)
        self.feature_head = nn.Linear(width, 2)
        self.aspect_head = nn.Linear(width, 6)
        self.aspect_sentiment_head = nn.Linear(width, 3)

    def forward(self, input_ids, attention_mask):
        """Run the encoder once and score its summary vector with every head.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            Dict of logits keyed by 'bug_report', 'feature_request',
            'aspect' and 'aspect_sentiment'.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Take position 0 of the last hidden state, then apply dropout to it.
        pooled = self.dropout(encoded.last_hidden_state[:, 0, :])

        return {
            'bug_report': self.bug_head(pooled),
            'feature_request': self.feature_head(pooled),
            'aspect': self.aspect_head(pooled),
            'aspect_sentiment': self.aspect_sentiment_head(pooled),
        }
    
if __name__ == "__main__":
    # Smoke test: push one small batch through the multitask model and print
    # the shape of the logits produced by each head.
    import torch
    from dataset import ReviewDataset
    from transformers import AutoTokenizer
    from torch.utils.data import DataLoader

    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    dataset = ReviewDataset("data/processed/original_train.csv", tokenizer)
    loader = DataLoader(dataset, batch_size=2)

    batch = next(iter(loader))

    model = Model()
    # This is an inference-only check: switch dropout off and skip autograd
    # bookkeeping so the forward pass is deterministic and cheaper.
    model.eval()
    with torch.no_grad():
        outputs = model(batch["input_ids"], batch["attention_mask"])

    for name, logits in outputs.items():
        print(name, logits.shape)