Started dataset.py, added the ReviewDataset class and implemented the __init__, __len__ and __getitem__ methods. The __getitem__ method currently just returns the review text, but will be updated to return the tokenized review as a tensor.
This commit is contained in:
@@ -0,0 +1,29 @@
|
|||||||
|
# dataset.py
|
||||||
|
|
||||||
|
# Takes a row from the CSV and returns its review text (planned: tokenize the review and return a tensor)
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import pandas as pd
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
class ReviewDataset(Dataset):
    """Map-style dataset over the rows of a reviews CSV file.

    Each item is currently the raw value of the ``review`` column for the
    requested row. The tokenizer and ``max_length`` are stored on the
    instance but are not used yet.
    """

    def __init__(self, path, tokenizer, max_length: int = 256):
        """Load the CSV at ``path`` and remember the tokenizer settings.

        Args:
            path: Anything ``pandas.read_csv`` accepts (file path or buffer).
            tokenizer: Tokenizer object; stored as-is for later use.
            max_length: Stored as-is for later use.
        """
        super().__init__()
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``review`` value of the row at position ``idx``.

        Raises:
            KeyError: If the CSV has no ``review`` column.
            IndexError: If ``idx`` is outside the positional range.
        """
        # Select the column first, then the position: this avoids building a
        # throwaway whole-row Series just to read a single cell.
        return self.df["review"].iloc[idx]
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: only runs when this file is executed directly, so
    # importing the module no longer reads the CSV or loads the pretrained
    # tokenizer as a side effect.
    uber = ReviewDataset(
        "data/processed/original_train.csv",
        AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base"),
    )
    # Index through the sequence protocol rather than calling the dunder.
    print(uber[1])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
#train.py
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
class multiTaskModel:
    """Placeholder for the multi-task model; currently only holds a tokenizer."""

    # Class-level attribute: the pretrained tokenizer is loaded once, when the
    # class body is executed.
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
Reference in New Issue
Block a user