From 82e6277cc1a47a69938533a51371061c0b338e78 Mon Sep 17 00:00:00 2001 From: charlie-rasberry Date: Wed, 1 Apr 2026 01:28:14 +0100 Subject: [PATCH] Inference working with cli text, csv text, just need to add timings and I am done --- notebooks/getting_csv_for_inference.ipynb | 4 +- src/dataset.py | 7 +- src/infer.py | 92 ++++++++++++++--------- 3 files changed, 63 insertions(+), 40 deletions(-) diff --git a/notebooks/getting_csv_for_inference.ipynb b/notebooks/getting_csv_for_inference.ipynb index b1bdbef..82468d9 100644 --- a/notebooks/getting_csv_for_inference.ipynb +++ b/notebooks/getting_csv_for_inference.ipynb @@ -217,12 +217,12 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "0b1c7f73", "metadata": {}, "outputs": [], "source": [ - "df = df[['review_description']].reset_index(drop=True)" + "df = df[['review_description']].reset_index(drop=True)\n" ] }, { diff --git a/src/dataset.py b/src/dataset.py index e4809b3..3530a1a 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,5 +1,5 @@ # dataset.py -# tokenize data using (sentencepiece) XLM-RoBERTa +# tokenize data using (sentencepiece) XLM-RoBERTas tokenizer # Takes a row from the csv, tokenizes the review and returns a tensor import torch import pandas as pd @@ -65,7 +65,10 @@ class InferenceDataset(Dataset): return len(self.df) def __getitem__(self, idx): - review = self.df.iloc[idx][self.text_column] + #review = self.df.iloc[idx][self.text_column] no longer enough due to missing values as I kept all reviews + review = str(self.df.iloc[idx][self.text_column]) + if review == 'nan' or review.strip() == '': + review = ' ' encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt') return { 'input_ids': encoding['input_ids'].squeeze(0), diff --git a/src/infer.py b/src/infer.py index d8ad8ff..6076e7d 100644 --- a/src/infer.py +++ b/src/infer.py @@ -1,4 +1,4 @@ -# evauluate.py +# infer.py import os import torch import time @@ -33,14 +33,14 @@ np.random.seed(SEED) def parse_args(): parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.") - parser.add_argument("--model_path", type=str, required=True, help=".pt file path") + parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/") parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"]) - parser.add_argument("--interactive", help="Loops reading input until exit()") - parser.add_argument("--text", help="Use command line text for input") - parser.add_argument("--dataset", type=str, help="Enter a file for inference") + parser.add_argument("--interactive", action="store_true", help="Loops reading input until exit()") + parser.add_argument("--text", action="store_true", help="Use command line text for input") + parser.add_argument("--dataset", type=str, help="Enter a file name for inference (stored in data/processed/)", default="review") parser.add_argument("--batch_size", type=int, default=16) parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl") - parser.add_argument("--text_column", type=str, required=True, default="review", help="Where is the text column") + parser.add_argument("--text_column", type=str, default="review", help="Where is the text column") return parser.parse_args() @@ -52,27 +52,47 @@ def main(): # this section is nearly identical to the first part of evaluate.py args = parse_args() - print(f'='*50) - print(f' '*15 + "Starting inference") + print(f'{"="*50}') + print(f'{"Starting inference"}') if torch.cuda.is_available(): - print(f' '*15 + "GPU:", torch.cuda.get_device_name(0)) + print(f"Using CUDA for inference: {torch.cuda.get_device_name(0)}") torch.cuda.manual_seed_all(SEED) torch.cuda.manual_seed(SEED) else: - print(f' '*15 + "No GPUs available") - print(f'='*50 + "\n") - print(f"Running inference on: {args.model_path.upper()} using data/processed/{args.dataset}.csv") + print(f'{" "*15, "No GPUs available"}') + print(f'{"="*50}\n') + print(f"Running inference on: outputs/{args.model_path} using data/processed/{args.dataset}.csv") print("Loading model, tokenizer and datasets ...") tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") - infer = f"data/processed/{args.dataset}.csv" - infer_df = pd.read_csv(infer) - infer_data = InferenceDataset(infer, tokenizer, args.text_column) - infer_loader = DataLoader(infer_data, batch_size=args.batch_size) + + # Let the user decide if they want to run inference on the whole dataset or via the shell input + if not args.interactive and not args.text: + infer = f"data/processed/{args.dataset}.csv" + infer_df = pd.read_csv(infer) + filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_{args.dataset}.csv" + else: + infer_df = pd.DataFrame(columns=[args.text_column]) + print("Entering interactive mode. Type 'exit()' to quit.") + while True: + user_input = input("Enter text for inference: ") + if user_input.lower() == "exit()": + break + infer_df = pd.concat([infer_df, pd.DataFrame({args.text_column: [user_input]})], ignore_index=True) + filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_interactive.csv" + infer_df.to_csv(filename, index=False) + infer = filename + + if infer is not None: + infer_data = InferenceDataset(infer, tokenizer, args.text_column) + infer_loader = DataLoader(infer_data, batch_size=args.batch_size) + else: + print("No dataset provided for inference. Exiting.") + return if args.mode == "mtl": model = Model().to(device) print(f"Loading weights from {args.model_path}...") - model.load_state_dict(torch.load(args.model_path, map_location=device)) + model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device)) model.eval() active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment'] else: @@ -87,46 +107,46 @@ def main(): model = SingleTaskModel(args.task, task_classes[args.task]).to(device) active_tasks = [args.task] print(f"Loading weights from {args.model_path}...") - model.load_state_dict(torch.load(args.model_path, map_location=device)) + model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device)) model.eval() - # the above section is nearly identical to the first part of evaluate.py - all_labels = {task: [] for task in active_tasks} - all_preds = {task: [] for task in active_tasks} - all_confidences = {task: [] for task in active_tasks} - print("Running inference on test set") + + all_preds = {task: [] for task in active_tasks} + all_confidences = {task: [] for task in active_tasks} + print(f"Running inference on {args.dataset} dataset") + with torch.no_grad(): for batch in infer_loader: input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) outputs = model(input_ids, attention_mask) for task in active_tasks: - # labels = batch[task].to(device) logits = outputs[task] preds = torch.argmax(logits, dim=1) probs = F.softmax(logits, dim=1) confidence = probs.max(dim=1).values - # all_labels[task].extend(labels.cpu().numpy()) all_preds[task].extend(preds.cpu().numpy()) all_confidences[task].extend(confidence.cpu().numpy()) - df = pd.DataFrame({ - "text": infer_df["review_description"] - }) + df = pd.DataFrame({"text": infer_df[args.text_column]}) for task in active_tasks: # ensures ALL tasks included df[f"{task}_pred"] = [label_names[task][p] for p in all_preds[task]] df[f"{task}_confidence"] = all_confidences[task] - summary = { - "mode": args.mode, - "dataset": args.dataset, - "task": args.task, - "model_path": args.model_path, - "results": {} - } - + output_path = filename + df.to_csv(output_path, index=False) + if not args.text: + print(f"Inference finished. Predictions saved to {output_path}") + else: + print(f"Inference finished.\n") + print(df.to_string(index=False)) + again = input("Do you want to enter another text for inference? (y/n): ") + if again.lower() == 'y': + main() + else: + print("Exiting interactive inference.") if __name__ == "__main__":