inference shouldn't need much to complete

This commit is contained in:
2026-03-29 00:17:05 +00:00
parent 72c27aca13
commit 1e8ea39287
2 changed files with 110 additions and 13 deletions

View File

@@ -54,7 +54,25 @@ class ReviewDataset(Dataset):
'aspect': torch.tensor(self.df.iloc[idx]['aspect'], dtype=torch.long),
'aspect_sentiment': torch.tensor(self.df.iloc[idx]['aspect_sentiment'], dtype=torch.long)
}
class InferenceDataset(Dataset):
    """Unlabelled dataset that yields tokenized text for inference.

    The CSV at ``path`` is read into memory with ``pd.read_csv`` and, for each
    row, the value in ``text_column`` is tokenized. Only tokenizer outputs are
    returned; no label columns are read.

    Args:
        path: CSV location, passed straight to ``pd.read_csv``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must support ``['input_ids']`` and
            ``['attention_mask']`` lookups whose values have ``squeeze``.
        text_column: Name of the CSV column holding the text to tokenize.
        max_length: Length handed to the tokenizer for padding/truncation.

    Raises:
        KeyError: If ``text_column`` is not a column of the loaded CSV.
    """

    def __init__(self, path, tokenizer, text_column, max_length=256):
        self.df = pd.read_csv(path)
        # Fail at construction rather than on the first item fetch, so a
        # mistyped column name is reported before any inference work starts.
        if text_column not in self.df.columns:
            raise KeyError(
                f"column {text_column!r} not found in {path!r}; "
                f"available columns: {list(self.df.columns)}"
            )
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the text of row ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, each being the
            tokenizer output with its leading dimension squeezed away.
        """
        # Select the column first: avoids building a whole-row Series (and the
        # dtype upcasting that comes with it) on every access.
        value = self.df[self.text_column].iloc[idx]
        if isinstance(value, str):
            review = value
        elif pd.isna(value):
            # pd.read_csv turns empty cells into NaN (a float). Tokenize an
            # empty string instead so every row still yields one item and
            # predictions stay aligned with the CSV rows.
            review = ""
        else:
            # Columns that pandas parsed as numbers/bools are not str either.
            review = str(value)

        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }
if __name__ == "__main__":
    # Manual check when run as a script: load the tokenizer, build the dataset
    # from the processed training CSV, and print the item at index 1.
    xlmr_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    train_set = ReviewDataset("data/processed/original_train.csv", xlmr_tokenizer)
    sample = train_set[1]
    print(sample)