Files
ReClass/src/multitag.py

296 lines
13 KiB
Python

# multitag.py
# Manual annotation tool for labelling reviews in the Uber reviews dataset, for multitask training
import tkinter as tk
from tkinter import ttk
import pandas as pd
import os
class MultiTag:
    """Keyboard-driven Tkinter tool for manually labelling app reviews.

    Every review receives four labels, entered left to right: feature
    request (Yes/No), bug report (Yes/No), aspect and aspect sentiment.
    Pressing Enter once all four are set marks the row as tagged, writes the
    CSV to disk and shows the next untagged review.

    Note: constructing an instance builds the window and enters the Tk main
    loop, so ``MultiTag()`` blocks until the window is closed.
    """

    DEFAULT_TAGGED_PATH = "data/uber_reviews_tagged.csv"
    DEFAULT_SAMPLED_PATH = "data/uber_reviews_sampled.csv"
    # Annotation columns, in the order they are filled in by the annotator.
    LABEL_COLUMNS = ("feature_request", "bug_report", "aspect", "aspect_sentiment")
    # Grid row of the first input button; later buttons stack below it.
    FIRST_BUTTON_ROW = 4

    def __init__(self, tagged_path=DEFAULT_TAGGED_PATH, sampled_path=DEFAULT_SAMPLED_PATH):
        """Load the review data, build the window and run the Tk main loop.

        Args:
            tagged_path: CSV that holds the annotations and is rewritten on
                every submit. Created from ``sampled_path`` if missing.
            sampled_path: CSV with the raw sampled reviews, only read when
                ``tagged_path`` does not exist yet.
        """
        self.binary_map = {
            '1': 'Yes',
            '0': 'No',
        }
        self.aspect_map = {
            'A': 'Driver',
            'S': 'App',
            'D': 'Pricing',
            'F': 'Service',
            'G': 'Payment',
            'H': 'General',
        }
        self.sentiment_map = {
            'A': 'Positive',
            'S': 'Neutral',
            'D': 'Negative',
        }
        self.active_column = 0  # column that the next key press applies to
        self.btn_width = 15  # button width
        self.number_of_aspects = len(self.aspect_map)  # number of aspect buttons
        # Colors for the highlight box
        self.color_incomplete = "#003366"
        self.color_complete = "#00AA00"
        self.tagged_path = tagged_path

        # Load the data before creating the window so that an unreadable CSV
        # fails without leaving an empty window behind.
        if not os.path.exists(tagged_path):
            print(f"Tagged file did not exist, making one at: {tagged_path}")
            sampled_df = pd.read_csv(sampled_path, low_memory=False)
            sampled_df.to_csv(tagged_path, index=False)
        self.load_review_data(tagged_path)

        self.root = tk.Tk()
        self.root.title("MultiTag")
        self._build_widgets()
        self._bind_keys()
        self.display_next_review()
        self.root.mainloop()

    # =============== GUI construction ====================
    def _make_button(self, text, command, row, column):
        """Create a button, place it on the grid and return the widget."""
        # grid() returns None, so the widget has to be kept before placing it.
        button = ttk.Button(self.root, text=text, command=command, width=self.btn_width)
        button.grid(row=row, column=column, pady=2)
        return button

    def _build_widgets(self):
        """Create all labels, the review text box, the buttons and the highlight."""
        first_row = self.FIRST_BUTTON_ROW
        # The highlight sits on the first row below the tallest button column.
        self.highlight_row = first_row + max(
            len(self.binary_map), len(self.aspect_map), len(self.sentiment_map)
        )

        # ROW 0: Progress indication
        self.progress_label = ttk.Label(
            self.root,
            text="Loading...",
            font=("Arial", 12, "bold"),
        )
        self.progress_label.grid(row=0, column=0, columnspan=4, pady=(5, 0))

        # ROW 1: Review display
        self.display_review = tk.Text(
            self.root, height=18, width=100, wrap='word', font=("Arial", 11)
        )
        self.display_review.grid(row=1, column=0, columnspan=4, padx=10, pady=10)

        # ROW 2: Status label
        self.status_label = ttk.Label(
            self.root,
            text="Fill in all fields...",
            font=("Arial", 10),
            foreground="gray",
        )
        self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))

        # ROW 3: Field labels (key hints are derived from the maps so they
        # always match the keys that are actually accepted)
        headings = (
            "Feature Request ? 1 (yes), 0 (no)",
            "Bug Report ? 1 (yes), 0 (no)",
            f"Aspect ? {'/'.join(self.aspect_map)}",
            f"Aspect Sentiment ? {'/'.join(self.sentiment_map)}",
        )
        for column, heading in enumerate(headings):
            ttk.Label(self.root, text=heading).grid(row=3, column=column, pady=(5, 2))

        # ROW 4+: Input buttons
        self.feature_true = self._make_button(
            "1", lambda: self.feature_pressed("1"), first_row, 0)
        self.feature_false = self._make_button(
            "0", lambda: self.feature_pressed("0"), first_row + 1, 0)
        self.bug_true = self._make_button(
            "1", lambda: self.bug_pressed("1"), first_row, 1)
        self.bug_false = self._make_button(
            "0", lambda: self.bug_pressed("0"), first_row + 1, 1)

        # Aspect buttons are exposed as self.aspect_a, self.aspect_s, ...
        for offset, (key, name) in enumerate(self.aspect_map.items()):
            button = self._make_button(
                f"{key}: {name}",
                lambda k=key: self.aspect_pressed(k),  # k=key avoids late binding
                first_row + offset,
                2,
            )
            setattr(self, f"aspect_{key.lower()}", button)

        # Sentiment buttons are exposed as self.aspect_positive, ...
        for offset, (key, name) in enumerate(self.sentiment_map.items()):
            button = self._make_button(
                f"{key}: {name}",
                lambda k=key: self.sentiment_pressed(k),
                first_row + offset,
                3,
            )
            setattr(self, f"aspect_{name.lower()}", button)

        # Highlight box - positioned below the column that is currently active
        self.highlight = tk.Frame(
            self.root, bg=self.color_incomplete, height=20, width=130
        )
        self.highlight.grid(row=self.highlight_row, column=0, pady=(5, 5))

    def _bind_keys(self):
        """Register keyboard shortcuts on the root window."""
        self.root.bind("<Return>", self.try_submit)
        for key in ("q", "Q"):
            self.root.bind(key, self.quit_app)
        # Closing the window behaves like pressing q, so the data is saved too.
        self.root.protocol("WM_DELETE_WINDOW", self.quit_app)

        # Bind both cases of every letter so Caps Lock does not break input.
        label_keys = set(self.binary_map)
        for key in (*self.aspect_map, *self.sentiment_map):
            label_keys.update((key.lower(), key.upper()))
        for key in sorted(label_keys):
            self.root.bind(key, self.handle_key)

    # =============== Event handling ====================
    def handle_key(self, event):
        """Route a label key press to the setter of the active column."""
        key = event.char
        if not key:
            return
        # Feature Request and Bug Report are binary input (1 and 0 keys)
        if key in self.binary_map:
            if self.active_column == 0:
                self.feature_pressed(key)
            elif self.active_column == 1:
                self.bug_pressed(key)
            return
        key = key.upper()
        # Column 2: aspects, column 3: sentiment
        if self.active_column == 2 and key in self.aspect_map:
            self.aspect_pressed(key)
        elif self.active_column == 3 and key in self.sentiment_map:
            self.sentiment_pressed(key)

    def _has_current_review(self):
        """Return True while ``current_review_index`` points at a real row."""
        return 0 <= self.current_review_index < len(self.review_data)

    def _show_finished(self):
        """Put the status widgets into the 'nothing left to tag' state."""
        self.highlight.configure(bg=self.color_complete)
        self.status_label.configure(
            text="All reviews tagged | Quit [q]",
            foreground="green",
            font=("Arial", 10, "bold"),
        )

    def update_status(self):
        """Update status label and highlight"""
        if not self._has_current_review():
            self._show_finished()
        elif self.all_labels_complete():
            self.highlight.configure(bg=self.color_complete)
            self.status_label.configure(
                text="Complete Tag [ENTER] | Quit [q]",
                foreground="green",
                font=("Arial", 10, "bold"),
            )
        else:
            self.highlight.configure(bg=self.color_incomplete)
            self.status_label.configure(
                text="Fill in all fields...",
                foreground="gray",
                font=("Arial", 10),
            )

    def update_progress(self):
        """Update progress counter"""
        tagged_count = (self.review_data['tagged'] == 1).sum()
        total_count = len(self.review_data)
        remaining = total_count - tagged_count
        progress_text = f"Progress: {tagged_count} / {total_count} tagged ({remaining} remaining)"
        self.progress_label.configure(text=progress_text)

    def move_highlight(self, col):
        """Move the highlight box directly under the button pressed."""
        self.highlight.grid(row=self.highlight_row, column=col, pady=(5, 5))
        self.update_status()

    # =============== Setters ====================
    # Each setter is a no-op when no review is displayed: writing through
    # DataFrame.at with an index past the end would append a new row.
    def feature_pressed(self, value):
        """Store the feature-request label and advance to the bug column."""
        if not self._has_current_review():
            return
        self.review_data.at[self.current_review_index, "feature_request"] = self.binary_map[value]
        self.active_column = 1
        self.move_highlight(1)

    def bug_pressed(self, value):
        """Store the bug-report label and advance to the aspect column."""
        if not self._has_current_review():
            return
        self.review_data.at[self.current_review_index, "bug_report"] = self.binary_map[value]
        self.active_column = 2
        self.move_highlight(2)

    def aspect_pressed(self, value):
        """Store the aspect label and advance to the sentiment column."""
        if not self._has_current_review():
            return
        self.review_data.at[self.current_review_index, "aspect"] = self.aspect_map[value]
        self.active_column = 3
        self.move_highlight(3)

    def sentiment_pressed(self, value):
        """Store the sentiment label and wrap around to the first column."""
        if not self._has_current_review():
            return
        self.review_data.at[self.current_review_index, "aspect_sentiment"] = self.sentiment_map[value]
        self.active_column = 0  # Reset for next review
        self.update_status()

    # =============== Data handling ====================
    def load_review_data(self, data_path):
        """Load review data from a CSV file. Adds annotation columns if they don't exist."""
        self.review_data = pd.read_csv(data_path, low_memory=False)
        if "tagged" not in self.review_data.columns:
            self.review_data["tagged"] = 0
        for name in self.LABEL_COLUMNS:
            if name not in self.review_data.columns:
                self.review_data[name] = ""
            # Empty cells are read back from CSV as NaN (often in a float
            # column). Normalise them to "" in an object column so labels can
            # be written as strings and "" reliably means "not set".
            column = self.review_data[name].astype(object)
            column[column.isna()] = ""
            self.review_data[name] = column
        print(f"Loaded {len(self.review_data)} reviews from {data_path}")

    def display_next_review(self):
        """Display the next unlabelled review in the text box."""
        self.current_review_index = self.get_current_review_index()
        self.active_column = 0  # reset to start at feature request
        self.display_review.delete("1.0", tk.END)  # Clear the text box
        self.highlight.grid(row=self.highlight_row, column=0, pady=(5, 5))
        self.update_progress()

        if not self._has_current_review():
            print("No more reviews to display. DONE ☉ ‿ ⚆")
            self._show_finished()
            return

        # Discard labels left over from an earlier, unfinished session.
        for name in self.LABEL_COLUMNS:
            self.review_data.at[self.current_review_index, name] = ""

        text = self.review_data.iloc[self.current_review_index]["review"]
        # A missing review text would otherwise be rendered as "nan".
        self.display_review.insert(tk.END, "" if pd.isna(text) else str(text))
        self.update_status()

    def submit_tag(self):
        """Mark the current review as tagged, save, and show the next one."""
        if not self._has_current_review():
            return
        self.review_data.at[self.current_review_index, "tagged"] = 1
        self.save_tags(self.tagged_path)
        self.display_next_review()

    def try_submit(self, event=None):
        """Try to submit current review if all labels complete."""
        if not self._has_current_review():
            return
        if self.all_labels_complete():
            # submit_tag() advances current_review_index, so remember which
            # review is being submitted for the log line.
            submitted_index = self.current_review_index
            self.submit_tag()
            print(f"Review {submitted_index + 1} tagged")
        else:
            print(" ☠  Complete all fields first! ☠ ")
            self.status_label.configure(
                text=" ☠  Complete all fields first! ☠ ",
                foreground="red",
                font=("Arial", 10, "bold"),
            )
            # Restore the regular status text after two seconds.
            self.root.after(2000, self.update_status)

    def all_labels_complete(self):
        """Return True when all four labels of the current review are set.

        Returns False when there is no current review (everything tagged).
        Missing values (NaN/None) count as "not set", just like "".
        """
        if not self._has_current_review():
            return False
        row = self.review_data.iloc[self.current_review_index]
        return all(
            bool(pd.notna(row[name])) and row[name] != ""
            for name in self.LABEL_COLUMNS
        )

    def save_tags(self, save_path):
        """Save the current tagged data to a CSV file."""
        self.review_data.to_csv(save_path, index=False)

    def quit_app(self, event=None):
        """Save the data, print a session summary and close the window."""
        self.save_tags(self.tagged_path)
        tagged_count = (self.review_data['tagged'] == 1).sum()
        print(f"\n{'='*50}")
        print("SESSION COMPLETE")
        print(f"{'='*50}")
        print(f"Total tagged: {tagged_count} / {len(self.review_data)}")
        print(f"Saved to: {self.tagged_path}")
        print("Bye (ʘ‿ʘ)╯")
        self.root.destroy()

    def get_current_review_index(self):
        """Return the position of the first untagged review.

        Returns ``len(self.review_data)`` when every review is tagged.
        """
        untagged = (self.review_data["tagged"] == 0).to_numpy()
        if untagged.any():
            return int(untagged.argmax())  # argmax gives the first True
        return len(self.review_data)  # all reviews tagged
# Launch the annotation tool. MultiTag.__init__ builds the window and enters the
# Tk main loop itself, so this call blocks until the window is closed.
# NOTE(review): this runs on import as well; consider an
# `if __name__ == "__main__":` guard if the module is ever imported elsewhere.
app = MultiTag()