From 19bcf2aa18159133a8e8ad6baa86708dd6ccb7c2 Mon Sep 17 00:00:00 2001
From: charlie-rasberry
Date: Thu, 19 Feb 2026 18:41:37 +0000
Subject: [PATCH] Started dataset.py, added the ReviewDataset class and
 implemented the __init__, __len__ and __getitem__ methods. The __getitem__
 method currently just returns the review text, but will be updated to return
 the tokenized review as a tensor

---
 src/dataset.py | 36 ++++++++++++++++++++++++++++++++++++
 src/train.py   |  7 +++++++
 2 files changed, 43 insertions(+)

diff --git a/src/dataset.py b/src/dataset.py
index e69de29..43b3b1e 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -0,0 +1,36 @@
+# dataset.py
+
+# Takes a row from the csv, tokenizes the review and returns a tensor
+
+import torch
+import pandas as pd
+from torch.utils.data import Dataset
+from transformers import AutoTokenizer
+
+
+class ReviewDataset(Dataset):
+    """Dataset over the rows of a CSV file that has a 'review' column."""
+
+    def __init__(self, path, tokenizer, max_length=256):
+        self.df = pd.read_csv(path)
+        # Kept for the planned tokenization step; not used yet.
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        # Returns the raw review text for now; tokenization comes later.
+        review = self.df.iloc[idx]['review']
+        return review
+
+
+if __name__ == "__main__":
+    # Run the smoke test only as a script, so that importing this module
+    # neither reads the CSV nor loads the tokenizer.
+    uber = ReviewDataset(
+        "data/processed/original_train.csv",
+        AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"),
+    )
+    print(uber[1])
diff --git a/src/train.py b/src/train.py
index e69de29..46a59f7 100644
--- a/src/train.py
+++ b/src/train.py
@@ -0,0 +1,7 @@
+#train.py
+
+from transformers import AutoTokenizer
+
+
+class multiTaskModel():
+    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
\ No newline at end of file