Inference working with CLI text and CSV text; just need to add timings and I am done
This commit is contained in:
@@ -217,12 +217,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 36,
|
"execution_count": null,
|
||||||
"id": "0b1c7f73",
|
"id": "0b1c7f73",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"df = df[['review_description']].reset_index(drop=True)"
|
"df = df[['review_description']].reset_index(drop=True)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
# dataset.py
|
# dataset.py
|
||||||
# tokenize data using (sentencepiece) XLM-RoBERTa
|
# tokenize data using XLM-RoBERTa's (sentencepiece) tokenizer
|
||||||
# Takes a row from the csv, tokenizes the review and returns a tensor
|
# Takes a row from the csv, tokenizes the review and returns a tensor
|
||||||
import torch
|
import torch
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -65,7 +65,10 @@ class InferenceDataset(Dataset):
|
|||||||
return len(self.df)
|
return len(self.df)
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
review = self.df.iloc[idx][self.text_column]
|
# review = self.df.iloc[idx][self.text_column]  -- no longer enough: all reviews were kept, so some values are missing
|
||||||
|
review = str(self.df.iloc[idx][self.text_column])
|
||||||
|
if review == 'nan' or review.strip() == '':
|
||||||
|
review = ' '
|
||||||
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
encoding = self.tokenizer(review, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
|
||||||
return {
|
return {
|
||||||
'input_ids': encoding['input_ids'].squeeze(0),
|
'input_ids': encoding['input_ids'].squeeze(0),
|
||||||
|
|||||||
92
src/infer.py
92
src/infer.py
@@ -1,4 +1,4 @@
|
|||||||
# evauluate.py
|
# infer.py
|
||||||
import os
|
import os
|
||||||
import torch
|
import torch
|
||||||
import time
|
import time
|
||||||
@@ -33,14 +33,14 @@ np.random.seed(SEED)
|
|||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
parser = argparse.ArgumentParser(description="RECLASS, Multitask learning for review classification.")
|
||||||
parser.add_argument("--model_path", type=str, required=True, help=".pt file path")
|
parser.add_argument("--model_path", type=str, required=True, help=".pt file in outputs/")
|
||||||
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
|
parser.add_argument("--task", type=str, default="all", choices=["all", "bug_report", "feature_request", "aspect", "aspect_sentiment"])
|
||||||
parser.add_argument("--interactive", help="Loops reading input until exit()")
|
parser.add_argument("--interactive", action="store_true", help="Loops reading input until exit()")
|
||||||
parser.add_argument("--text", help="Use command line text for input")
|
parser.add_argument("--text", action="store_true", help="Use command line text for input")
|
||||||
parser.add_argument("--dataset", type=str, help="Enter a file for inference")
|
parser.add_argument("--dataset", type=str, help="Enter a file name for inference (stored in data/processed/)", default="review")
|
||||||
parser.add_argument("--batch_size", type=int, default=16)
|
parser.add_argument("--batch_size", type=int, default=16)
|
||||||
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
|
parser.add_argument("--mode", type=str, required=True, choices=["mtl", "stl"], help="mtl or stl")
|
||||||
parser.add_argument("--text_column", type=str, required=True, default="review", help="Where is the text column")
|
parser.add_argument("--text_column", type=str, default="review", help="Where is the text column")
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
@@ -52,27 +52,47 @@ def main():
|
|||||||
|
|
||||||
# this section is nearly identical to the first part of evaluate.py
|
# this section is nearly identical to the first part of evaluate.py
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
print(f'='*50)
|
print(f'{"="*50}')
|
||||||
print(f' '*15 + "Starting inference")
|
print(f'{"Starting inference"}')
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
print(f' '*15 + "GPU:", torch.cuda.get_device_name(0))
|
print(f"Using CUDA for inference: {torch.cuda.get_device_name(0)}")
|
||||||
torch.cuda.manual_seed_all(SEED)
|
torch.cuda.manual_seed_all(SEED)
|
||||||
torch.cuda.manual_seed(SEED)
|
torch.cuda.manual_seed(SEED)
|
||||||
else:
|
else:
|
||||||
print(f' '*15 + "No GPUs available")
|
print(f'{" "*15, "No GPUs available"}')
|
||||||
print(f'='*50 + "\n")
|
print(f'{"="*50}\n')
|
||||||
print(f"Running inference on: {args.model_path.upper()} using data/processed/{args.dataset}.csv")
|
print(f"Running inference on: outputs/{args.model_path} using data/processed/{args.dataset}.csv")
|
||||||
print("Loading model, tokenizer and datasets ...")
|
print("Loading model, tokenizer and datasets ...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
|
||||||
infer = f"data/processed/{args.dataset}.csv"
|
|
||||||
infer_df = pd.read_csv(infer)
|
# Let the user decide if they want to run inference on the whole dataset or via the shell input
|
||||||
infer_data = InferenceDataset(infer, tokenizer, args.text_column)
|
if not args.interactive and not args.text:
|
||||||
infer_loader = DataLoader(infer_data, batch_size=args.batch_size)
|
infer = f"data/processed/{args.dataset}.csv"
|
||||||
|
infer_df = pd.read_csv(infer)
|
||||||
|
filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_{args.dataset}.csv"
|
||||||
|
else:
|
||||||
|
infer_df = pd.DataFrame(columns=[args.text_column])
|
||||||
|
print("Entering interactive mode. Type 'exit()' to quit.")
|
||||||
|
while True:
|
||||||
|
user_input = input("Enter text for inference: ")
|
||||||
|
if user_input.lower() == "exit()":
|
||||||
|
break
|
||||||
|
infer_df = pd.concat([infer_df, pd.DataFrame({args.text_column: [user_input]})], ignore_index=True)
|
||||||
|
filename = f"outputs/inference/{args.model_path}_{args.task}_predictions_interactive.csv"
|
||||||
|
infer_df.to_csv(filename, index=False)
|
||||||
|
infer = filename
|
||||||
|
|
||||||
|
if infer is not None:
|
||||||
|
infer_data = InferenceDataset(infer, tokenizer, args.text_column)
|
||||||
|
infer_loader = DataLoader(infer_data, batch_size=args.batch_size)
|
||||||
|
else:
|
||||||
|
print("No dataset provided for inference. Exiting.")
|
||||||
|
return
|
||||||
|
|
||||||
if args.mode == "mtl":
|
if args.mode == "mtl":
|
||||||
model = Model().to(device)
|
model = Model().to(device)
|
||||||
print(f"Loading weights from {args.model_path}...")
|
print(f"Loading weights from {args.model_path}...")
|
||||||
model.load_state_dict(torch.load(args.model_path, map_location=device))
|
model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device))
|
||||||
model.eval()
|
model.eval()
|
||||||
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
active_tasks = ['bug_report', 'feature_request', 'aspect', 'aspect_sentiment']
|
||||||
else:
|
else:
|
||||||
@@ -87,46 +107,46 @@ def main():
|
|||||||
model = SingleTaskModel(args.task, task_classes[args.task]).to(device)
|
model = SingleTaskModel(args.task, task_classes[args.task]).to(device)
|
||||||
active_tasks = [args.task]
|
active_tasks = [args.task]
|
||||||
print(f"Loading weights from {args.model_path}...")
|
print(f"Loading weights from {args.model_path}...")
|
||||||
model.load_state_dict(torch.load(args.model_path, map_location=device))
|
model.load_state_dict(torch.load(f"outputs/{args.model_path}", map_location=device))
|
||||||
model.eval()
|
model.eval()
|
||||||
# the above section is nearly identical to the first part of evaluate.py
|
|
||||||
all_labels = {task: [] for task in active_tasks}
|
all_preds = {task: [] for task in active_tasks}
|
||||||
all_preds = {task: [] for task in active_tasks}
|
all_confidences = {task: [] for task in active_tasks}
|
||||||
all_confidences = {task: [] for task in active_tasks}
|
print(f"Running inference on {args.dataset} dataset")
|
||||||
print("Running inference on test set")
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
for batch in infer_loader:
|
for batch in infer_loader:
|
||||||
input_ids = batch['input_ids'].to(device)
|
input_ids = batch['input_ids'].to(device)
|
||||||
attention_mask = batch['attention_mask'].to(device)
|
attention_mask = batch['attention_mask'].to(device)
|
||||||
outputs = model(input_ids, attention_mask)
|
outputs = model(input_ids, attention_mask)
|
||||||
for task in active_tasks:
|
for task in active_tasks:
|
||||||
# labels = batch[task].to(device)
|
|
||||||
logits = outputs[task]
|
logits = outputs[task]
|
||||||
preds = torch.argmax(logits, dim=1)
|
preds = torch.argmax(logits, dim=1)
|
||||||
|
|
||||||
probs = F.softmax(logits, dim=1)
|
probs = F.softmax(logits, dim=1)
|
||||||
confidence = probs.max(dim=1).values
|
confidence = probs.max(dim=1).values
|
||||||
|
|
||||||
# all_labels[task].extend(labels.cpu().numpy())
|
|
||||||
all_preds[task].extend(preds.cpu().numpy())
|
all_preds[task].extend(preds.cpu().numpy())
|
||||||
all_confidences[task].extend(confidence.cpu().numpy())
|
all_confidences[task].extend(confidence.cpu().numpy())
|
||||||
|
|
||||||
df = pd.DataFrame({
|
df = pd.DataFrame({"text": infer_df[args.text_column]})
|
||||||
"text": infer_df["review_description"]
|
|
||||||
})
|
|
||||||
|
|
||||||
for task in active_tasks: # ensures ALL tasks included
|
for task in active_tasks: # ensures ALL tasks included
|
||||||
df[f"{task}_pred"] = [label_names[task][p] for p in all_preds[task]]
|
df[f"{task}_pred"] = [label_names[task][p] for p in all_preds[task]]
|
||||||
df[f"{task}_confidence"] = all_confidences[task]
|
df[f"{task}_confidence"] = all_confidences[task]
|
||||||
|
|
||||||
summary = {
|
output_path = filename
|
||||||
"mode": args.mode,
|
df.to_csv(output_path, index=False)
|
||||||
"dataset": args.dataset,
|
if not args.text:
|
||||||
"task": args.task,
|
print(f"Inference finished. Predictions saved to {output_path}")
|
||||||
"model_path": args.model_path,
|
else:
|
||||||
"results": {}
|
print(f"Inference finished.\n")
|
||||||
}
|
print(df.to_string(index=False))
|
||||||
|
again = input("Do you want to enter another text for inference? (y/n): ")
|
||||||
|
if again.lower() == 'y':
|
||||||
|
main()
|
||||||
|
else:
|
||||||
|
print("Exiting interactive inference.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user