Inference now works with CLI text and CSV input; only the timings remain to be added

This commit is contained in:
2026-04-01 01:28:14 +01:00
parent 1e8ea39287
commit 82e6277cc1
3 changed files with 63 additions and 40 deletions

View File

@@ -217,12 +217,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": null,
"id": "0b1c7f73", "id": "0b1c7f73",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df = df[['review_description']].reset_index(drop=True)" "df = df[['review_description']].reset_index(drop=True)\n"
] ]
}, },
{ {

View File

@@ -1,5 +1,5 @@
# dataset.py # dataset.py
# tokenize data using (sentencepiece) XLM-RoBERTa # tokenize data using (sentencepiece) XLM-RoBERTa's tokenizer
# Takes a row from the csv, tokenizes the review and returns a tensor # Takes a row from the csv, tokenizes the review and returns a tensor
import torch import torch
import pandas as pd import pandas as pd
@@ -65,7 +65,10 @@ class InferenceDataset(Dataset):
return len(self.df) return len(self.df)
def __getitem__(self, idx): def __getitem__(self, idx):
review = self.df.iloc[idx][self.text_column] #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews
review = str(self.df.iloc[idx][self.text_column])
if review == 'nan' or review.strip() == '':
review = ' '
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
return { return {
'input_ids': encoding['input_ids'].squeeze(0), 'input_ids': encoding['input_ids'].squeeze(0),

View File

@@ -1,4 +1,4 @@
# evauluate.py # infer.py
import os import os
import torch import torch
import time import time
@@ -33,14 +33,14 @@ np.random.seed(SEED)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
parser.add_argument("--model_path", type=str, required=True, help=".pt file path") parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"]) parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
parser.add_argument("--interactive", help="Loops reading input until exit()") parser.add_argument("--interactive", action="store_true", help="Loops reading input until exit()")
parser.add_argument("--text", help="Use command line text for input") parser.add_argument("--text", action="store_true", help="Use command line text for input")
parser.add_argument("--dataset", type=str, help="Enter a file for inference") parser.add_argument("--dataset", type=str, help="Enter a file name for inference (stored in data/processed/)", default="review")
parser.add_argument("--batch_size", type=int, default=16) parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl") parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
parser.add_argument("--text_column", type=str, required=True, default="review", help="Where is the text column") parser.add_argument("--text_column", type=str, default="review", help="Where is the text column")
return parser.parse_args() return parser.parse_args()
@@ -52,27 +52,47 @@ def main():
# this section is nearly identical to the first part of evaluate.py # this section is nearly identical to the first part of evaluate.py
args = parse_args() args = parse_args()
print(f'='*50) print(f'{"="*50}')
print(f' '*15 + "Starting inference") print(f'{"Starting inference"}')
if torch.cuda.is_available(): if torch.cuda.is_available():
print(f' '*15 + "GPU:", torch.cuda.get_device_name(0)) print(f"Using CUDA for inference: {torch.cuda.get_device_name(0)}")
torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED) torch.cuda.manual_seed(SEED)
else: else:
print(f' '*15 + "No GPUs available") print(f'{" "*15, "No GPUs available"}')
print(f'='*50 + "\n") print(f'{"="*50}\n')
print(f"Running inference on: {args.model_path.upper()} using data/processed/{args.dataset}.csv") print(f"Running inference on: outputs/{args.model_path} using data/processed/{args.dataset}.csv")
print("Loading model, tokenizer and datasets ...") print("Loading model, tokenizer and datasets ...")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
infer = f"data/processed/{args.dataset}.csv"
infer_df = pd.read_csv(infer) # Let the user decide if they want to run inference on the whole dataset or via the shell input
infer_data = InferenceDataset(infer, tokenizer, args.text_column) if not args.interactive and not args.text:
infer_loader = DataLoader(infer_data, batch_size=args.batch_size) infer = f"data/processed/{args.dataset}.csv"
infer_df = pd.read_csv(infer)
filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_{args.dataset}.csv"
else:
infer_df = pd.DataFrame(columns=[args.text_column])
print("Entering interactive mode. Type 'exit()' to quit.")
while True:
user_input = input("Enter text for inference: ")
if user_input.lower() == "exit()":
break
infer_df = pd.concat([infer_df, pd.DataFrame({args.text_column: [user_input]})], ignore_index=True)
filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_interactive.csv"
infer_df.to_csv(filename, index=False)
infer = filename
if infer is not None:
infer_data = InferenceDataset(infer, tokenizer, args.text_column)
infer_loader = DataLoader(infer_data, batch_size=args.batch_size)
else:
print("No dataset provided for inference. Exiting.")
return
if args.mode == "mtl": if args.mode == "mtl":
model = Model().to(device) model = Model().to(device)
print(f"Loading weights from {args.model_path}...") print(f"Loading weights from {args.model_path}...")
model.load_state_dict(torch.load(args.model_path, map_location=device)) model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device))
model.eval() model.eval()
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
else: else:
@@ -87,46 +107,46 @@ def main():
model = SingleTaskModel(args.task, task_classes[args.task]).to(device) model = SingleTaskModel(args.task, task_classes[args.task]).to(device)
active_tasks = [args.task] active_tasks = [args.task]
print(f"Loading weights from {args.model_path}...") print(f"Loading weights from {args.model_path}...")
model.load_state_dict(torch.load(args.model_path, map_location=device)) model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device))
model.eval() model.eval()
# the above section is nearly identical to the first part of evaluate.py
all_labels = {task: [] for task in active_tasks} all_preds = {task: [] for task in active_tasks}
all_preds = {task: [] for task in active_tasks} all_confidences = {task: [] for task in active_tasks}
all_confidences = {task: [] for task in active_tasks} print(f"Running inference on {args.dataset} dataset")
print("Running inference on test set")
with torch.no_grad(): with torch.no_grad():
for batch in infer_loader: for batch in infer_loader:
input_ids = batch['input_ids'].to(device) input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device) attention_mask = batch['attention_mask'].to(device)
outputs = model(input_ids, attention_mask) outputs = model(input_ids, attention_mask)
for task in active_tasks: for task in active_tasks:
# labels = batch[task].to(device)
logits = outputs[task] logits = outputs[task]
preds = torch.argmax(logits, dim=1) preds = torch.argmax(logits, dim=1)
probs = F.softmax(logits, dim=1) probs = F.softmax(logits, dim=1)
confidence = probs.max(dim=1).values confidence = probs.max(dim=1).values
# all_labels[task].extend(labels.cpu().numpy())
all_preds[task].extend(preds.cpu().numpy()) all_preds[task].extend(preds.cpu().numpy())
all_confidences[task].extend(confidence.cpu().numpy()) all_confidences[task].extend(confidence.cpu().numpy())
df = pd.DataFrame({ df = pd.DataFrame({"text": infer_df[args.text_column]})
"text": infer_df["review_description"]
})
for task in active_tasks: # ensures ALL tasks included for task in active_tasks: # ensures ALL tasks included
df[f"{task}_pred"] = [label_names[task][p] for p in all_preds[task]] df[f"{task}_pred"] = [label_names[task][p] for p in all_preds[task]]
df[f"{task}_confidence"] = all_confidences[task] df[f"{task}_confidence"] = all_confidences[task]
summary = { output_path = filename
"mode": args.mode, df.to_csv(output_path, index=False)
"dataset": args.dataset, if not args.text:
"task": args.task, print(f"Inference finished. Predictions saved to {output_path}")
"model_path": args.model_path, else:
"results": {} print(f"Inference finished.\n")
} print(df.to_string(index=False))
again = input("Do you want to enter another text for inference? (y/n): ")
if again.lower() == 'y':
main()
else:
print("Exiting interactive inference.")
if __name__ == "__main__": if __name__ == "__main__":