inference shouldn't need much to complete
This commit is contained in:
@@ -54,7 +54,25 @@ class ReviewDataset(Dataset):
|
||||
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
|
||||
'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'], dtype=torch.long)
|
||||
}
|
||||
class InferenceDataset(Dataset):
    """Label-free dataset that tokenizes one text column of a CSV file.

    Each item is a dict holding only ``input_ids`` and ``attention_mask``;
    no target fields are read or returned.
    """

    def __init__(self, path, tokenizer, text_column, max_length=256):
        """Load the CSV at *path* and remember how to tokenize it.

        Args:
            path: Location of the CSV file, passed to ``pd.read_csv``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding='max_length', truncation=True, return_tensors='pt')``
                whose result is indexable by ``'input_ids'`` and
                ``'attention_mask'``.
            text_column: Name of the column that contains the text.
            max_length: Length passed to the tokenizer for padding/truncation.

        Raises:
            KeyError: If *text_column* is not a column of the loaded CSV.
        """
        self.df = pd.read_csv(path)
        # Fail at construction time instead of on the first item access.
        if text_column not in self.df.columns:
            raise KeyError(
                f"column {text_column!r} not found in {path!r}; "
                f"available columns: {list(self.df.columns)}"
            )
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the text at positional row *idx*.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each with the
            leading dimension of the tokenizer output squeezed away
            (presumably giving shape ``(max_length,)`` -- confirm against the
            tokenizer in use).
        """
        # Select the column first so no full row Series is built per access.
        review = self.df[self.text_column].iloc[idx]
        # read_csv yields float NaN for empty cells and numbers for
        # numeric-looking cells; tokenizers only accept strings.
        if pd.isna(review):
            review = ""
        elif not isinstance(review, str):
            review = str(review)

        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: build the training dataset and print one sample.
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    dataset = ReviewDataset("data/processed/original_train.csv", tokenizer)
    sample = dataset[1]
    print(sample)
|
||||
|
||||
Reference in New Issue
Block a user