data processing pipeline now finished just need to annotate reviews
This commit is contained in:
@@ -1,70 +1,125 @@
|
||||
# TODO: Refactor,especially change expected names as I jumped the gun when first making this without sampling properly
|
||||
# TODO: Add button labels and finalise the categories of aspects
|
||||
# TODO: Ensure there is persistent progress tracking implentation before labelling
|
||||
# TODO: Finalise keybinds
|
||||
# TODO: Display progress e.g. review 1020 of 5000
|
||||
# TODO: Validate saving progres
|
||||
# TODO: Loop instead of pressing enter
|
||||
# TODO: Autosave ? / confirm quit at least
|
||||
# TODO: More visual q's
|
||||
|
||||
# multitag.py
|
||||
# This app enables manual annotation of reviews in the Uber dataset, for training with
|
||||
# to achieve review classifications with multi task deep learning
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
import pandas as pd
|
||||
|
||||
"""
|
||||
app to classify / manually annotate reviews for ml training
|
||||
currently has hotkeys for each option 1 0 asdfghjkl
|
||||
path must be to tagged not sampled, it wont remember
|
||||
"""
|
||||
|
||||
# import langdetect
|
||||
import os
|
||||
|
||||
class MultiTag:
|
||||
def __init__(self):
|
||||
|
||||
self.binary_map = {
|
||||
'1': 'Yes',
|
||||
'0': 'No'
|
||||
}
|
||||
|
||||
self.aspect_map = {
|
||||
'A': 'Driver',
|
||||
'S': 'App',
|
||||
'D': 'Pricing',
|
||||
'F': 'Service',
|
||||
'G': 'Payment',
|
||||
'H': 'General'
|
||||
}
|
||||
|
||||
self.sentiment_map = {
|
||||
'A': 'Positive',
|
||||
'S': 'Neutral',
|
||||
'D': 'Negative'
|
||||
}
|
||||
|
||||
|
||||
self.root = tk.Tk()
|
||||
# root.geometry("400x300")
|
||||
self.active_column = 0 # used for highlighting the current column
|
||||
self.btn_width = 15 # button width
|
||||
self.number_of_aspects = 9 # number of aspect buttons
|
||||
self.number_of_aspects = 6 # number of aspect buttons
|
||||
self.root.title("MultiTag")
|
||||
|
||||
self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
|
||||
self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
|
||||
#self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
|
||||
#self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
|
||||
|
||||
# Colors for active label
|
||||
self.color_incomplete = "#003366"
|
||||
self.color_complete = "#00AA00"
|
||||
|
||||
# Paths
|
||||
tagged_path = "multitag/data/uber_reviews_tagged.csv"
|
||||
sampled_path = "multitag/data/uber_reviews_sampled.csv"
|
||||
# self.load_review_data("data/uber_reviews_sampled.csv")
|
||||
# self.load_review_data("data/uber_reviews_tagged.csv")
|
||||
if not os.path.exists(tagged_path):
|
||||
print(f"Tagged file did not exist, making one at: {sampled_path}")
|
||||
sampled_df = pd.read_csv(sampled_path, low_memory=False)
|
||||
sampled_df.to_csv(tagged_path, index=False)
|
||||
self.load_review_data(tagged_path)
|
||||
|
||||
|
||||
# =============== GUI Elements ====================
|
||||
|
||||
# highlight for the current box
|
||||
self.highlight = tk.Frame(self.root, bg="#003366", height=20, width=130)
|
||||
self.highlight.grid(row=11, column=0)
|
||||
|
||||
# ROW 0: Progress indication
|
||||
self.progress_label = ttk.Label(
|
||||
self.root,
|
||||
text="Loading...",
|
||||
font=("Arial", 12, "bold")
|
||||
)
|
||||
self.progress_label.grid(row=0, column=0, columnspan=4, pady=(5, 0))
|
||||
|
||||
# Labels
|
||||
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row= 1, column=0)
|
||||
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 1, column=1)
|
||||
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 1, column=2)
|
||||
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 1, column=3)
|
||||
# ROW 1: Review display
|
||||
self.display_review = tk.Text(self.root, height=18, width=100, wrap='word', font=("Arial", 11))
|
||||
self.display_review.grid(row=1, column=0, columnspan=4, padx=10, pady=10)
|
||||
|
||||
# ROW 2: Status label
|
||||
self.status_label = ttk.Label(
|
||||
self.root,
|
||||
text="Fill in all fields...",
|
||||
font=("Arial", 10),
|
||||
foreground="gray"
|
||||
)
|
||||
|
||||
self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
|
||||
|
||||
|
||||
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=2, column=0)
|
||||
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=3, column=0)
|
||||
# Labels ROW 3
|
||||
ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
|
||||
ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row= 3, column=1, pady=(5, 2))
|
||||
ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H/J/K/L ").grid(row= 3, column=2, pady=(5, 2))
|
||||
ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row= 3, column=3, pady=(5, 2))
|
||||
|
||||
self.bug_true = ttk.Button(self.root, text="1",command=lambda: self.bug_pressed("1"), width= self.btn_width).grid(row = 2, column=1)
|
||||
self.bug_false = ttk.Button(self.root, text="0",command=lambda: self.bug_pressed("0"), width= self.btn_width).grid(row = 3, column=1)
|
||||
# ROW 4 |Buttons|
|
||||
# Feature Requests
|
||||
self.feature_true = ttk.Button(self.root, text="1",command=lambda: self.feature_pressed("1"), width= self.btn_width).grid(row=4, column=0, pady=2)
|
||||
self.feature_false = ttk.Button(self.root, text="0",command=lambda: self.feature_pressed("0"), width= self.btn_width).grid(row=5, column=0, pady=2)
|
||||
# Bug Reports
|
||||
self.bug_true = ttk.Button(self.root, text="1",command=lambda: self.bug_pressed("1"), width= self.btn_width).grid(row=4, column=1, pady=2)
|
||||
self.bug_false = ttk.Button(self.root, text="0",command=lambda: self.bug_pressed("0"), width= self.btn_width).grid(row=5, column=1, pady=2)
|
||||
# Aspect Buttons
|
||||
self.aspect_a = ttk.Button(self.root, text="A: Driver",command=lambda: self.aspect_pressed("A"), width= self.btn_width).grid(row=4, column=2, pady=2)
|
||||
self.aspect_s = ttk.Button(self.root, text="S: App", command=lambda: self.aspect_pressed("S"), width= self.btn_width).grid(row=5, column=2, pady=2)
|
||||
self.aspect_d = ttk.Button(self.root, text="D: Pricing", command=lambda: self.aspect_pressed("D"), width= self.btn_width).grid(row=6, column=2, pady=2)
|
||||
self.aspect_f = ttk.Button(self.root, text="F: Service", command=lambda: self.aspect_pressed("F"), width= self.btn_width).grid(row=7, column=2, pady=2)
|
||||
self.aspect_g = ttk.Button(self.root, text="G: Payment", command=lambda: self.aspect_pressed("G"), width= self.btn_width).grid(row=8, column=2, pady=2)
|
||||
self.aspect_h = ttk.Button(self.root, text="H: General", command=lambda: self.aspect_pressed("H"), width= self.btn_width).grid(row=9, column=2, pady=2)
|
||||
# self.aspect_j = ttk.Button(self.root, text="J: ASPECT HERE", command=lambda: self.aspect_pressed("J"), width= self.btn_width).grid(row=4, column=2, pady=2)
|
||||
# self.aspect_k = ttk.Button(self.root, text="K: ASPECT HERE", command=lambda: self.aspect_pressed("K"), width= self.btn_width).grid(row=4, column=2, pady=2)
|
||||
# self.aspect_l = ttk.Button(self.root, text="L: ASPECT HERE", command=lambda: self.aspect_pressed("L"), width= self.btn_width).grid(row=4, column=2, pady=2)
|
||||
# Aspect sentiment buttons
|
||||
self.aspect_positive = ttk.Button(self.root, text="A: Positive", command=lambda: self.sentiment_pressed("A"), width= self.btn_width).grid(row=4, column=3, pady=2)
|
||||
self.aspect_neutral = ttk.Button(self.root, text="S: Neutral", command=lambda: self.sentiment_pressed("S"), width= self.btn_width).grid(row=5, column=3, pady=2)
|
||||
self.aspect_negative = ttk.Button(self.root, text="D: Negative", command=lambda: self.sentiment_pressed("D"), width= self.btn_width).grid(row=6, column=3, pady=2)
|
||||
|
||||
self.aspect_a = ttk.Button(self.root, text="A: ASPECT HERE",command=lambda: self.aspect_pressed("A"), width= self.btn_width).grid(row = 2, column=2)
|
||||
self.aspect_s = ttk.Button(self.root, text="S: ASPECT HERE", command=lambda: self.aspect_pressed("S"), width= self.btn_width).grid(row = 3, column=2)
|
||||
self.aspect_d = ttk.Button(self.root, text="D: ASPECT HERE", command=lambda: self.aspect_pressed("D"), width= self.btn_width).grid(row = 4, column=2)
|
||||
self.aspect_f = ttk.Button(self.root, text="F: ASPECT HERE", command=lambda: self.aspect_pressed("F"), width= self.btn_width).grid(row = 5, column=2)
|
||||
self.aspect_g = ttk.Button(self.root, text="G: ASPECT HERE", command=lambda: self.aspect_pressed("G"), width= self.btn_width).grid(row = 6, column=2)
|
||||
self.aspect_h = ttk.Button(self.root, text="H: ASPECT HERE", command=lambda: self.aspect_pressed("H"), width= self.btn_width).grid(row = 7, column=2)
|
||||
self.aspect_j = ttk.Button(self.root, text="J: ASPECT HERE", command=lambda: self.aspect_pressed("J"), width= self.btn_width).grid(row = 8, column=2)
|
||||
self.aspect_k = ttk.Button(self.root, text="K: ASPECT HERE", command=lambda: self.aspect_pressed("K"), width= self.btn_width).grid(row = 9, column=2)
|
||||
self.aspect_l = ttk.Button(self.root, text="L: ASPECT HERE", command=lambda: self.aspect_pressed("L"), width= self.btn_width).grid(row = 10, column=2)
|
||||
# Highlight box - positioned below buttons
|
||||
# self.highlight = tk.Frame(self.root, bg=self.color_incomplete, height=20, width=130)
|
||||
self.highlight.grid(row=10, column=0, pady=(5, 5))
|
||||
|
||||
self.aspect_positive = ttk.Button(self.root, text="A: Positive", command=lambda: self.sentiment_pressed("A"), width= self.btn_width).grid(row=2, column=3)
|
||||
self.aspect_neutral = ttk.Button(self.root, text="S: Neutral", command=lambda: self.sentiment_pressed("S"), width= self.btn_width).grid(row=3, column=3)
|
||||
self.aspect_negative = ttk.Button(self.root, text="D: Negative", command=lambda: self.sentiment_pressed("D"), width= self.btn_width).grid(row=4, column=3)
|
||||
|
||||
# keys
|
||||
# Key bindings
|
||||
self.root.bind("q", self.quit_app)
|
||||
self.root.bind("<Return>", self.try_submit)
|
||||
self.root.bind("1", self.handle_key)
|
||||
@@ -75,17 +130,14 @@ class MultiTag:
|
||||
self.root.bind("f", self.handle_key)
|
||||
self.root.bind("g", self.handle_key)
|
||||
self.root.bind("h", self.handle_key)
|
||||
self.root.bind("j", self.handle_key)
|
||||
self.root.bind("k", self.handle_key)
|
||||
self.root.bind("l", self.handle_key)
|
||||
# self.root.bind("j", self.handle_key)
|
||||
# self.root.bind("k", self.handle_key)
|
||||
# self.root.bind("l", self.handle_key)
|
||||
|
||||
|
||||
self.load_review_data("data/uber_reviews_sampled.csv")
|
||||
# self.load_review_data("data/uber_reviews_tagged.csv")
|
||||
|
||||
self.display_next_review()
|
||||
# self.save_tags("data/uber_reviews_tagged.csv")
|
||||
|
||||
self.root.mainloop()
|
||||
|
||||
def handle_key(self, event):
|
||||
@@ -98,36 +150,63 @@ class MultiTag:
|
||||
elif self.active_column == 1:
|
||||
self.bug_pressed(key)
|
||||
# Column 2: aspects (a,s,d,f,g,h,j,k,l)
|
||||
elif key in 'asdfghjkl' and self.active_column == 2:
|
||||
elif key in 'asdfgh' and self.active_column == 2:
|
||||
self.aspect_pressed(key.upper())
|
||||
# Column 3: sentiment (a,s,d)
|
||||
elif key in 'asd' and self.active_column == 3:
|
||||
self.sentiment_pressed(key.upper())
|
||||
|
||||
def move_highlight(self, row, col):
|
||||
def update_status(self):
|
||||
"""Update status label and highlight color based on completion state"""
|
||||
if self.all_labels_complete():
|
||||
self.highlight.configure(bg=self.color_complete)
|
||||
self.status_label.configure(
|
||||
text="Complete Tag [ENTER] | Quit [q]",
|
||||
foreground="green",
|
||||
font=("Arial", 10, "bold")
|
||||
)
|
||||
else:
|
||||
self.highlight.configure(bg=self.color_incomplete)
|
||||
self.status_label.configure(
|
||||
text="Fill in all fields...",
|
||||
foreground="gray",
|
||||
font=("Arial", 10)
|
||||
)
|
||||
|
||||
def update_progress(self):
|
||||
"""Update progress counter"""
|
||||
tagged_count = (self.review_data['tagged'] == 1).sum()
|
||||
total_count = len(self.review_data)
|
||||
remaining = total_count - tagged_count
|
||||
|
||||
progress_text = f"Progress: {tagged_count} / {total_count} tagged ({remaining} remaining)"
|
||||
self.progress_label.configure(text=progress_text)
|
||||
|
||||
def move_highlight(self, col):
|
||||
"""Move the highlight box directly under the button pressed."""
|
||||
self.highlight.grid(row=row, column=col)
|
||||
self.highlight.grid() # make sure it’s visible
|
||||
|
||||
self.highlight.grid(row=10, column=col, pady=(5,5))
|
||||
self.update_status()
|
||||
|
||||
# Setters
|
||||
def feature_pressed(self, value):
|
||||
self.review_data.at[self.current_review_index, "feature_request"] = value
|
||||
self.review_data.at[self.current_review_index, "feature_request"] = self.binary_map[value]
|
||||
self.active_column = 1
|
||||
self.move_highlight(self.number_of_aspects + 2, 1)
|
||||
self.move_highlight(1)
|
||||
|
||||
def bug_pressed(self, value):
|
||||
self.review_data.at[self.current_review_index, "bug_report"] = value
|
||||
self.review_data.at[self.current_review_index, "bug_report"] = self.binary_map[value]
|
||||
self.active_column = 2
|
||||
self.move_highlight(self.number_of_aspects + 2, 2)
|
||||
self.move_highlight(2)
|
||||
|
||||
def aspect_pressed(self, value):
|
||||
self.review_data.at[self.current_review_index, "aspect"] = value
|
||||
self.review_data.at[self.current_review_index, "aspect"] = self.aspect_map[value]
|
||||
self.active_column = 3
|
||||
self.move_highlight(self.number_of_aspects + 2, 3)
|
||||
self.move_highlight(3)
|
||||
|
||||
def sentiment_pressed(self, value):
|
||||
self.review_data.at[self.current_review_index, "aspect_sentiment"] = value
|
||||
self.review_data.at[self.current_review_index, "aspect_sentiment"] = self.sentiment_map[value]
|
||||
self.active_column = 0 # Reset for next review
|
||||
self.update_status()
|
||||
|
||||
|
||||
def load_review_data(self, data_path):
|
||||
@@ -150,31 +229,49 @@ class MultiTag:
|
||||
self.current_review_index = self.get_current_review_index()
|
||||
if self.current_review_index < len(self.review_data):
|
||||
review = self.review_data.iloc[self.current_review_index]
|
||||
|
||||
self.review_data.at[self.current_review_index, "feature_request"] = ""
|
||||
self.review_data.at[self.current_review_index, "bug_report"] = ""
|
||||
self.review_data.at[self.current_review_index, "aspect"] = ""
|
||||
self.review_data.at[self.current_review_index, "aspect_sentiment"] = ""
|
||||
|
||||
self.display_review.delete(1.0, tk.END) # Clear the text box
|
||||
self.display_review.insert(tk.END, review["review_description"]) # Display the review text
|
||||
self.display_review.insert(tk.END, review["review"]) # Display the review text
|
||||
# self.current_review_index += 1
|
||||
# Mark as tagged
|
||||
# self.review_data.at[self.current_review_index - 1, "tagged"] = 1
|
||||
self.active_column = 0 # reset to start at feature request
|
||||
self.move_highlight(self.number_of_aspects + 2, 0)
|
||||
self.highlight.grid(row=10, column=0, pady=(5, 5))
|
||||
self.highlight.configure(bg=self.color_incomplete)
|
||||
self.status_label.configure(
|
||||
text="Fill in all fields...",
|
||||
foreground="gray",
|
||||
font=("Arial", 10)
|
||||
)
|
||||
self.update_progress()
|
||||
self.update_progress()
|
||||
|
||||
else:
|
||||
print("No more reviews to display.")
|
||||
print("No more reviews to display. DONE ☉ ‿ ⚆")
|
||||
|
||||
def submit_tag(self):
|
||||
self.review_data.at[self.current_review_index, "tagged"] = 1
|
||||
self.save_tags("data/uber_reviews_tagged.csv")
|
||||
self.save_tags("multitag/data/uber_reviews_tagged.csv")
|
||||
self.display_next_review()
|
||||
|
||||
def try_submit(self, event):
|
||||
"""Try to submit current review if all labels complete."""
|
||||
if self.all_labels_complete():
|
||||
self.submit_tag()
|
||||
self.move_highlight(self.number_of_aspects + 2, 0)
|
||||
|
||||
print("Labels submitted, loading next review")
|
||||
print(f"Review {self.current_review_index + 1} tagged")
|
||||
else:
|
||||
print("Please complete all labels before submitting")
|
||||
print(" ☠ Complete all fields first! ☠ ")
|
||||
self.status_label.configure(
|
||||
text=" ☠ Complete all fields first! ☠ ",
|
||||
foreground="red",
|
||||
font=("Arial", 10, "bold")
|
||||
)
|
||||
self.root.after(2000, self.update_status)
|
||||
|
||||
def all_labels_complete(self):
|
||||
row = self.review_data.iloc[self.current_review_index]
|
||||
@@ -186,17 +283,24 @@ class MultiTag:
|
||||
def save_tags(self, save_path):
|
||||
"""Save the tagged data to a CSV file."""
|
||||
self.review_data.to_csv(save_path, index=False)
|
||||
print(f"Tagged data saved to {save_path}")
|
||||
# print(f"Tagged data saved to {save_path}")
|
||||
|
||||
def quit_app(self, event):
|
||||
tagged_count = (self.review_data['tagged'] == 1).sum()
|
||||
print(f"\n{'='*50}")
|
||||
print(f"SESSION COMPLETE")
|
||||
print(f"{'='*50}")
|
||||
print(f"Total tagged: {tagged_count} / {len(self.review_data)}")
|
||||
print(f"Saved to: multitag/data/uber_reviews_tagged.csv")
|
||||
print(f"Bye (ʘ‿ʘ)╯")
|
||||
self.save_tags("multitag/data/uber_reviews_tagged.csv")
|
||||
self.root.destroy()
|
||||
self.save_tags("data/uber_reviews_tagged.csv")
|
||||
|
||||
def get_current_review_index(self):
|
||||
for i in range(len(self.review_data)):
|
||||
if self.review_data.iloc[i]["tagged"] == 0:
|
||||
return i
|
||||
return self.review_data.shape[0] # all reviews tagged
|
||||
return len(self.review_data) # all reviews tagged
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user