House Cleaning
This commit is contained in:
0
src/infer.py
Normal file
0
src/infer.py
Normal file
307
src/multitag.py
Normal file
307
src/multitag.py
Normal file
@@ -0,0 +1,307 @@
|
||||
# multitag.py
|
||||
# This app enables manual annotation of reviews in the Uber dataset; the tagged
# reviews are used to train review classification with multi-task deep learning
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
import pandas as pd
|
||||
# import langdetect
|
||||
import os
|
||||
|
||||
class MultiTag:
|
||||
def __init__(self):
    """Build the MultiTag window, load the reviews and run the Tk main loop.

    The constructor blocks inside ``mainloop()`` until the window is closed,
    so creating a ``MultiTag`` instance runs a whole annotation session.
    If the tagged CSV does not exist yet it is created as a copy of the
    sampled CSV before loading.
    """
    # Keyboard key -> label stored in the CSV, one map per tagging column.
    self.binary_map = {
        '1': 'Yes',
        '0': 'No',
    }
    self.aspect_map = {
        'A': 'Driver',
        'S': 'App',
        'D': 'Pricing',
        'F': 'Service',
        'G': 'Payment',
        'H': 'General',
    }
    self.sentiment_map = {
        'A': 'Positive',
        'S': 'Neutral',
        'D': 'Negative',
    }

    self.root = tk.Tk()
    self.active_column = 0  # used for highlighting the current column
    self.btn_width = 15  # button width
    self.number_of_aspects = 6  # number of aspect buttons
    self.root.title("MultiTag")

    # Colors for active label
    self.color_incomplete = "#003366"
    self.color_complete = "#00AA00"

    # Paths
    tagged_path = "multitag/data/uber_reviews_tagged.csv"
    sampled_path = "multitag/data/uber_reviews_sampled.csv"
    if not os.path.exists(tagged_path):
        # The new file is written to tagged_path, so that is the path to report.
        print(f"Tagged file did not exist, making one at: {tagged_path}")
        sampled_df = pd.read_csv(sampled_path, low_memory=False)
        sampled_df.to_csv(tagged_path, index=False)
    self.load_review_data(tagged_path)

    # =============== GUI Elements ====================

    # Highlight bar marking the active column; it lives in row 10, below the buttons.
    self.highlight = tk.Frame(self.root, bg=self.color_incomplete, height=20, width=130)
    self.highlight.grid(row=10, column=0, pady=(5, 5))

    # ROW 0: Progress indication
    self.progress_label = ttk.Label(
        self.root,
        text="Loading...",
        font=("Arial", 12, "bold"),
    )
    self.progress_label.grid(row=0, column=0, columnspan=4, pady=(5, 0))

    # ROW 1: Review display
    self.display_review = tk.Text(self.root, height=18, width=100, wrap='word', font=("Arial", 11))
    self.display_review.grid(row=1, column=0, columnspan=4, padx=10, pady=10)

    # ROW 2: Status label
    self.status_label = ttk.Label(
        self.root,
        text="Fill in all fields...",
        font=("Arial", 10),
        foreground="gray",
    )
    self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))

    # ROW 3: column titles. The key hints are derived from the maps so they
    # only advertise keys that are actually handled.
    column_titles = (
        "Feature Request ? 1 (yes), 0 (no)",
        "Bug Report ? 1 (yes), 0 (no)",
        "Aspect ? " + "/".join(self.aspect_map),
        "Aspect Sentiment ? " + "/".join(self.sentiment_map),
    )
    for column, title in enumerate(column_titles):
        ttk.Label(self.root, text=title).grid(row=3, column=column, pady=(5, 2))

    def make_button(text, command, row, column):
        """Create a button, place it on the grid and return the widget."""
        # grid() returns None, so it must not be chained onto the constructor
        # when the widget itself is kept in an attribute.
        button = ttk.Button(self.root, text=text, command=command, width=self.btn_width)
        button.grid(row=row, column=column, pady=2)
        return button

    # ROWS 4+: buttons
    # Feature Requests
    self.feature_true = make_button("1", lambda: self.feature_pressed("1"), 4, 0)
    self.feature_false = make_button("0", lambda: self.feature_pressed("0"), 5, 0)
    # Bug Reports
    self.bug_true = make_button("1", lambda: self.bug_pressed("1"), 4, 1)
    self.bug_false = make_button("0", lambda: self.bug_pressed("0"), 5, 1)
    # Aspect Buttons
    self.aspect_a = make_button("A: Driver", lambda: self.aspect_pressed("A"), 4, 2)
    self.aspect_s = make_button("S: App", lambda: self.aspect_pressed("S"), 5, 2)
    self.aspect_d = make_button("D: Pricing", lambda: self.aspect_pressed("D"), 6, 2)
    self.aspect_f = make_button("F: Service", lambda: self.aspect_pressed("F"), 7, 2)
    self.aspect_g = make_button("G: Payment", lambda: self.aspect_pressed("G"), 8, 2)
    self.aspect_h = make_button("H: General", lambda: self.aspect_pressed("H"), 9, 2)
    # Aspect sentiment buttons
    self.aspect_positive = make_button("A: Positive", lambda: self.sentiment_pressed("A"), 4, 3)
    self.aspect_neutral = make_button("S: Neutral", lambda: self.sentiment_pressed("S"), 5, 3)
    self.aspect_negative = make_button("D: Negative", lambda: self.sentiment_pressed("D"), 6, 3)

    # Key bindings
    self.root.bind("q", self.quit_app)
    self.root.bind("<Return>", self.try_submit)
    # Tagging keys come from the maps, so adding an aspect there also binds its key.
    tagging_keys = {
        *self.binary_map,
        *(key.lower() for key in self.aspect_map),
        *(key.lower() for key in self.sentiment_map),
    }
    for key in sorted(tagging_keys):
        self.root.bind(key, self.handle_key)

    self.display_next_review()
    self.root.mainloop()
|
||||
|
||||
def handle_key(self, event):
    """Route a key press to the setter of the currently active column.

    Columns 0 and 1 (feature request / bug report) accept the keys of
    ``binary_map``; column 2 accepts the keys of ``aspect_map`` and column 3
    the keys of ``sentiment_map``. Letter keys are matched case-insensitively.
    Any other key, or a valid key pressed while a different column is
    active, is ignored.

    Args:
        event: Tk key event; only ``event.char`` is read.
    """
    key = event.char

    # Column 0 or 1: feature/bug (1 and 0)
    if key in self.binary_map:
        if self.active_column == 0:
            self.feature_pressed(key)
        elif self.active_column == 1:
            self.bug_pressed(key)
        return

    # Look the key up in the maps rather than testing `key in 'asdfgh'`:
    # a substring test is also true for an empty event.char, which would
    # then raise KeyError inside the setter.
    letter = key.upper()
    if self.active_column == 2 and letter in self.aspect_map:
        self.aspect_pressed(letter)
    elif self.active_column == 3 and letter in self.sentiment_map:
        self.sentiment_pressed(letter)
|
||||
|
||||
def update_status(self):
    """Sync the highlight colour and the status line with the completion state.

    When every label of the current review is filled in, the highlight turns
    to the "complete" colour and the status line offers submit/quit;
    otherwise both fall back to the "incomplete" look.
    """
    if self.all_labels_complete():
        highlight_colour = self.color_complete
        status_options = {
            "text": "Complete Tag [ENTER] | Quit [q]",
            "foreground": "green",
            "font": ("Arial", 10, "bold"),
        }
    else:
        highlight_colour = self.color_incomplete
        status_options = {
            "text": "Fill in all fields...",
            "foreground": "gray",
            "font": ("Arial", 10),
        }

    self.highlight.configure(bg=highlight_colour)
    self.status_label.configure(**status_options)
|
||||
|
||||
def update_progress(self):
    """Write the tagged / total / remaining counts into the progress label."""
    total_count = len(self.review_data)
    # Rows whose 'tagged' flag equals 1 count as done.
    tagged_count = (self.review_data['tagged'] == 1).sum()

    self.progress_label.configure(
        text=(
            f"Progress: {tagged_count} / {total_count} tagged "
            f"({total_count - tagged_count} remaining)"
        )
    )
|
||||
|
||||
def move_highlight(self, col):
    """Re-grid the highlight bar into column ``col`` of row 10, then refresh the status."""
    placement = {"row": 10, "column": col, "pady": (5, 5)}
    self.highlight.grid(**placement)
    self.update_status()
|
||||
|
||||
# Setters
|
||||
def feature_pressed(self, value):
    """Store the feature-request answer for the current review and move on to column 1.

    Args:
        value: Key of ``binary_map`` ('1' or '0').
    """
    answer = self.binary_map[value]
    self.review_data.at[self.current_review_index, "feature_request"] = answer

    next_column = 1  # bug report comes next
    self.active_column = next_column
    self.move_highlight(next_column)
|
||||
|
||||
def bug_pressed(self, value):
    """Store the bug-report answer for the current review and move on to column 2.

    Args:
        value: Key of ``binary_map`` ('1' or '0').
    """
    answer = self.binary_map[value]
    self.review_data.at[self.current_review_index, "bug_report"] = answer

    next_column = 2  # aspect comes next
    self.active_column = next_column
    self.move_highlight(next_column)
|
||||
|
||||
def aspect_pressed(self, value):
    """Store the chosen aspect for the current review and move on to column 3.

    Args:
        value: Key of ``aspect_map`` (an upper-case letter).
    """
    aspect_name = self.aspect_map[value]
    self.review_data.at[self.current_review_index, "aspect"] = aspect_name

    next_column = 3  # aspect sentiment comes next
    self.active_column = next_column
    self.move_highlight(next_column)
|
||||
|
||||
def sentiment_pressed(self, value):
    """Store the aspect sentiment for the current review and refresh the status.

    This is the last column, so the active column wraps back to 0 and the
    highlight is left where it is; only the status display is updated.

    Args:
        value: Key of ``sentiment_map`` (an upper-case letter).
    """
    sentiment_name = self.sentiment_map[value]
    self.review_data.at[self.current_review_index, "aspect_sentiment"] = sentiment_name
    self.active_column = 0  # Reset for next review
    self.update_status()
|
||||
|
||||
|
||||
def load_review_data(self, data_path):
    """Load review data from a CSV file into ``self.review_data``.

    Guarantees that the bookkeeping column ``tagged`` and the four label
    columns exist. Untagged label cells are normalised to the empty string:
    an empty string written by ``to_csv`` comes back from ``read_csv`` as
    NaN, and NaN is not equal to ``""``, so without this step untouched rows
    would look "complete" and the columns would be float-typed.

    Args:
        data_path: Path of the CSV file to read.
    """
    self.review_data = pd.read_csv(data_path, low_memory=False)

    if "tagged" not in self.review_data.columns:
        self.review_data["tagged"] = 0  # Initialize tagged column if not present
    else:
        # A blank flag means "not tagged yet".
        self.review_data["tagged"] = self.review_data["tagged"].fillna(0)

    for column in ("feature_request", "bug_report", "aspect", "aspect_sentiment"):
        if column not in self.review_data.columns:
            self.review_data[column] = ""  # Initialize label column if not present
        else:
            # Cast to object first so the column can hold strings, then
            # replace the NaN placeholders with "".
            values = self.review_data[column].astype(object)
            self.review_data[column] = values.where(values.notna(), "")

    print(f"Loaded {len(self.review_data)} reviews from {data_path}")
|
||||
|
||||
def display_next_review(self):
    """Show the first untagged review and reset the tagging state for it.

    The four label fields of that row are cleared, the review text replaces
    the content of the text box, the highlight returns to column 0 and the
    status and progress displays are refreshed. When every review is already
    tagged, a completion message is printed and shown in the text box
    instead, and the progress counter is refreshed.
    """
    self.current_review_index = self.get_current_review_index()

    # "1.0" is the Tk text index of the first character and "end" is the
    # value of tk.END, so these two indices cover the whole text box.
    if self.current_review_index >= len(self.review_data):
        done_message = "No more reviews to display. DONE ☉ ‿ ⚆"
        print(done_message)
        # Replace the last review so the window does not keep showing a
        # review that has already been submitted.
        self.display_review.delete("1.0", "end")
        self.display_review.insert("end", done_message)
        self.update_progress()
        return

    review = self.review_data.iloc[self.current_review_index]

    # Start from a clean slate so completeness is judged only on what is
    # entered for this review now.
    for column in ("feature_request", "bug_report", "aspect", "aspect_sentiment"):
        self.review_data.at[self.current_review_index, column] = ""

    self.display_review.delete("1.0", "end")  # Clear the text box
    self.display_review.insert("end", review["review"])  # Display the review text

    self.active_column = 0  # reset to start at feature request
    self.highlight.grid(row=10, column=0, pady=(5, 5))
    self.highlight.configure(bg=self.color_incomplete)
    self.status_label.configure(
        text="Fill in all fields...",
        foreground="gray",
        font=("Arial", 10),
    )
    self.update_progress()
|
||||
|
||||
def submit_tag(self):
    """Flag the current review as tagged, save all tags to disk and show the next review."""
    row = self.current_review_index
    self.review_data.at[row, "tagged"] = 1
    # Saving on every submit means an interrupted session loses nothing.
    self.save_tags("multitag/data/uber_reviews_tagged.csv")
    self.display_next_review()
|
||||
|
||||
def try_submit(self, event=None):
    """Submit the current review if every label is filled in.

    When labels are missing, a warning is printed and shown in the status
    line, and the normal status is restored after two seconds.

    Args:
        event: Tk event supplied by the key binding; unused.
    """
    if not self.all_labels_complete():
        print(" ☠ Complete all fields first! ☠ ")
        self.status_label.configure(
            text=" ☠ Complete all fields first! ☠ ",
            foreground="red",
            font=("Arial", 10, "bold"),
        )
        self.root.after(2000, self.update_status)
        return

    # submit_tag() moves current_review_index on to the next review, so the
    # number of the review being submitted has to be captured first.
    review_number = self.current_review_index + 1
    self.submit_tag()
    print(f"Review {review_number} tagged")
|
||||
|
||||
def all_labels_complete(self):
    """Return True when all four labels of the current review are filled in.

    A label counts as missing when it is the empty string or NaN/None (an
    empty CSV cell is read back as NaN). Once every review is tagged the
    current index equals ``len(review_data)``; there is then no current row
    and the result is False rather than an IndexError.
    """
    if self.current_review_index >= len(self.review_data):
        return False

    row = self.review_data.iloc[self.current_review_index]
    label_columns = ("feature_request", "bug_report", "aspect", "aspect_sentiment")
    return all(
        not pd.isna(row[column]) and row[column] != ""
        for column in label_columns
    )
|
||||
|
||||
def save_tags(self, save_path):
    """Write ``self.review_data`` to ``save_path`` as CSV, without the index column.

    Args:
        save_path: Destination file path.
    """
    frame = self.review_data
    frame.to_csv(save_path, index=False)
|
||||
|
||||
def quit_app(self, event=None):
    """Save all tags, print a session summary and close the window.

    The file is written before the summary is printed, so the "Saved to"
    line is only shown once the save has succeeded.

    Args:
        event: Tk event supplied by the key binding; unused.
    """
    save_path = "multitag/data/uber_reviews_tagged.csv"
    self.save_tags(save_path)

    tagged_count = (self.review_data['tagged'] == 1).sum()
    rule = '=' * 50
    print(f"\n{rule}")
    print("SESSION COMPLETE")
    print(rule)
    print(f"Total tagged: {tagged_count} / {len(self.review_data)}")
    print(f"Saved to: {save_path}")
    print("Bye (ʘ‿ʘ)╯")

    self.root.destroy()
|
||||
|
||||
def get_current_review_index(self):
    """Return the position of the first review whose ``tagged`` flag is 0.

    Returns:
        int: Zero-based row position of the first untagged review, or
        ``len(self.review_data)`` when every review is tagged.
    """
    # One vectorised comparison instead of building a row Series per review.
    untagged = (self.review_data["tagged"] == 0).to_numpy()
    if untagged.any():
        # argmax on a boolean array is the position of the first True.
        return int(untagged.argmax())
    return len(self.review_data)  # all reviews tagged
|
||||
|
||||
|
||||
|
||||
# Start the GUI only when this file is run as a script, so that importing
# the module (for tests or reuse) does not open a window and block.
if __name__ == "__main__":
    app = MultiTag()
|
||||
176
src/preprocess.py
Normal file
176
src/preprocess.py
Normal file
@@ -0,0 +1,176 @@
|
||||
# preprocess.py
|
||||
|
||||
import pandas as pd
|
||||
import re
|
||||
from langdetect import detect, LangDetectException
|
||||
|
||||
def clean_text(text):
    """Normalise one raw review into lower-case, de-noised text.

    Input:
        text - the review text to clean (may be missing / NaN)

    Outputs:
        str: the cleaned review text; "" when the input is missing
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied in this order to the lower-cased text.
    substitutions = (
        (r'http\S+|www\S+', ''),  # URLs
        (r'\S+@\S+', ''),         # emails
        (r'\.{2,}', '.'),         # runs of dots        -> single dot
        (r'!{2,}', '!'),          # runs of "!"         -> single "!"
        (r'\?{2,}', '?'),         # runs of "?"         -> single "?"
        (r'\s+', ' '),            # runs of whitespace  -> single space
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Removing URLs/emails can leave a space at either end.
    return cleaned.strip()
|
||||
|
||||
def detect_language(text):
    """Return the language reported by ``detect`` for ``text``, or 'unknown'.

    'unknown' is returned for missing values, for text shorter than 10
    characters after stripping, and when ``detect`` raises
    ``LangDetectException``.
    """
    too_short = pd.isna(text) or len(str(text).strip()) < 10
    if too_short:
        return 'unknown'

    try:
        return detect(str(text))
    except LangDetectException:
        return 'unknown'
|
||||
|
||||
def preprocess_uber_reviews(input_path, output_path, min_words=5):
    """Load, clean, filter and save the raw Uber reviews.

    Notes:
        - No language detection is applied (unreliable on short informal
          text); the data is labelled as English but contains non-English text.
        - The input CSV must contain the columns "review_description" and
          "rating".

    Steps:
        1. Load the CSV with pd.read_csv()
        2. Remove rows with a missing review_description
        3. Clean the text with clean_text()
        4. Calculate the word count of each cleaned review
        5. Remove reviews with fewer than ``min_words`` words
        6. Remove duplicate reviews (by cleaned text)
        7. Keep review / rating / word_count and save them to ``output_path``

    Inputs:
        input_path (str): Path to uber_reviews.csv
        output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
        min_words (int): Minimum number of words a review needs to be kept

    Outputs:
        pd.DataFrame: the cleaned reviews with columns review, rating, word_count

    Raises:
        KeyError: if a required column is missing from the input CSV
    """
    rule = "=" * 50
    print(rule)
    print("PREPROCESSING UBER REVIEWS")
    print(rule)

    # 1. Load data
    print("\n1. Loading data...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" Original size: {len(df):,} reviews")

    missing_columns = {'review_description', 'rating'} - set(df.columns)
    if missing_columns:
        raise KeyError(f"{input_path} is missing required column(s): {sorted(missing_columns)}")

    # 2. Remove missing reviews
    print("\n2. Removing missing reviews...")
    df = df.dropna(subset=['review_description'])
    print(f" After removing nulls: {len(df):,} reviews")

    # 3. Clean text
    print("\n3. Cleaning text...")
    df['review_clean'] = df['review_description'].apply(clean_text)

    # 4. Calculate word count
    df['word_count'] = df['review_clean'].str.split().str.len()

    # 5. Remove short reviews
    print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {min_words})...")
    # 1 word reviews provide little to draw conclusions from and bloat the
    # dataset a lot, nearly 50% of reviews!
    before = len(df)
    df = df[df['word_count'] >= min_words]
    removed = before - len(df)
    # Guard the percentage against an input with no usable rows.
    removed_pct = removed / before * 100 if before else 0.0
    print(f" Removed: {removed:,} reviews ({removed_pct:.1f}%)")
    print(f" Remaining: {len(df):,} reviews")

    # 6. Remove duplicates
    print("\n5. Removing duplicates...")
    before = len(df)
    df = df.drop_duplicates(subset=['review_clean'])
    removed = before - len(df)
    print(f" Removed: {removed:,} duplicates")
    print(f" Remaining: {len(df):,} reviews")

    # 7. Final dataset
    df_clean = df[['review_clean', 'rating', 'word_count']].copy()
    df_clean.rename(columns={'review_clean': 'review'}, inplace=True)
    df_clean = df_clean.reset_index(drop=True)

    # 8. Save
    print(f"\n6. Saving to {output_path}...")
    df_clean.to_csv(output_path, index=False)

    # Summary
    print("\n" + rule)
    print("PREPROCESSING COMPLETE")
    print(rule)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Quality filters: word_count >= {min_words}, duplicates removed")
    # while this does remove some legitimate reviews which would be useful in classification
    # it also lets us find a higher total amount of useful reviews; after seeing the results of
    # limits 1, 2, 3, 4, 5 this one kept the most informative reviews without excessive removal

    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = count / len(df_clean) * 100
        # The stars are built outside the f-string: reusing the same quote
        # type inside an f-string needs Python 3.12+, and int() keeps the
        # repetition working when the rating column was read as float.
        stars = "✭" * int(rating)
        print(f" {rating}{stars}: {count:,} ({percentage:.1f}%)")

    print("\nWord count statistics:")
    print(f" Mean: {df_clean['word_count'].mean():.1f} words")
    print(f" Median: {df_clean['word_count'].median():.1f} words")
    print(f" Min: {df_clean['word_count'].min()} words")
    print(f" Max: {df_clean['word_count'].max()} words")

    print("\nVerify New Data:")
    # Report a count (expected 0), not the filtered DataFrame itself.
    print(f" Short reviews: {(df_clean['word_count'] < min_words).sum()}")
    print(f" Null values: {df_clean.isnull().sum().to_dict()}")
    print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
    # lang detection takes 5+ mins so it is not run here

    # Sample reviews from each rating
    print("\n" + rule)
    print("SAMPLE CLEANED REVIEWS")
    print(rule)
    for rating in [1, 2, 3, 4, 5]:
        rated = df_clean[df_clean['rating'] == rating]
        if rated.empty:
            continue
        sample = rated.sample(min(2, len(rated)))
        stars = "✭" * rating
        print(f"\n{rating} {stars} REVIEWS:")
        for _, row in sample.iterrows():
            print(f" • ({row['word_count']} words) {row['review'][:100]}")

    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
    print(" ...Manual annotation phase will identify any non-English reviews")

    return df_clean
|
||||
|
||||
if __name__ == "__main__":
    # Raw export in, cleaned CSV out; both paths are relative to the
    # directory the script is started from.
    input_file = "multitag/data/uber_reviews.csv"
    output_file = "multitag/data/uber_reviews_cleaned.csv"

    df_clean = preprocess_uber_reviews(input_file, output_file)

    print("\nPreprocessing complete!")
    print(f"Clean dataset: {len(df_clean):,} reviews ready for sampling")
|
||||
238
src/sampler.py
Normal file
238
src/sampler.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# TODO: Add verification comparison between ratings
|
||||
# TODO: Clean up the logging print statements
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Print the library versions at start-up.
print(pd.__version__)
print(np.__version__)

# Dataset locations, relative to the directory the script is started from.
path = "multitag/data/uber_reviews_cleaned.csv"
sampled_path = "multitag/data/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv"  # only for distribution comparison
|
||||
class Sampler:
|
||||
def __init__(self, data_path, target_samples):
    """Load the working and original review CSVs and print their rating distributions.

    Args:
        data_path: Path of the cleaned reviews CSV that samples are drawn from.
        target_samples: Number of reviews the sampling methods aim for.
    """
    self.data_path = data_path
    # Use the caller's value; it used to be overwritten with a constant 5000.
    self.target_samples = target_samples
    # column to stratify by (another sample set uses keyword boosting to aid
    # feature request / bug report numbers)
    self.stratify_column = "rating"

    # The original file is only loaded to compare rating distributions.
    self.original_data = pd.read_csv(original_path, low_memory=False)
    self.data = pd.read_csv(self.data_path, low_memory=False)
    self.total = len(self.data)  # total number of records in the dataset

    print("="*50)
    print("SAMPLER INITIALIZED")
    print("="*50,"\n")

    print(f"Total records in dataset: {self.total}")
    print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
    print("Percentage distribution (working data):")
    print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
    _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
    print(f"Original Distribution from {original_path}:")
    print((_origdist*100).round(1),"\n")

    self.data.info()
|
||||
|
||||
# add sampling method here
|
||||
# randomly sample 5000 entries, stratified by rating
|
||||
"""
|
||||
rating
|
||||
5 57.1% (611133)
|
||||
1 26.5% (283895)
|
||||
4 7.8% (82953)
|
||||
3 4.7% (49928)
|
||||
2 3.9% (41707)
|
||||
Name: proportion, dtype: object
|
||||
"""
|
||||
"""
|
||||
|
||||
Sample size by rating
|
||||
Redundant calculation, kept for clarity
|
||||
Doesn't factor that the distribution changed greatly after preprocessing
|
||||
|
||||
"""
|
||||
def get_stratified_sample(self) -> pd.DataFrame:
    """Return a sample that keeps the rating proportions of the working data.

    Each rating group is sampled separately by ``self.x`` (its share of
    ``target_samples``), and the pieces are combined and shuffled. Because
    the per-group sizes are rounded down, the result can hold slightly
    fewer than ``target_samples`` rows.

    Returns:
        pd.DataFrame: the shuffled stratified sample, with all columns of
        ``self.data``.
    """
    # Group explicitly by rating: DataFrame.apply(self.x) would call the
    # helper once per *column*, which is not stratification. Iterating the
    # groups also keeps the rating column in every piece.
    grouped = self.data.reset_index(drop=True).groupby(self.stratify_column)
    pieces = [self.x(group) for _, group in grouped]
    if not pieces:
        return self.data.iloc[0:0].copy()

    stratified_sample = pd.concat(pieces)
    # Shuffle so the rows are not ordered by rating; frac=1 keeps every row
    # (asking for exactly target_samples rows could exceed what was drawn).
    return stratified_sample.sample(frac=1, random_state=42)
|
||||
|
||||
|
||||
|
||||
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
||||
def x(self, x):
    """Sample one group in proportion to its share of the whole dataset.

    Args:
        x: The rows of one group (DataFrame or Series).

    Returns:
        A reproducible random sample of ``x`` holding
        ``len(x) / self.total * self.target_samples`` rows (rounded down),
        but at least one row and never more rows than the group contains.
    """
    if not self.total:
        # Empty dataset: nothing to sample and no share to compute.
        return x.iloc[0:0]

    n = int(len(x) / self.total * self.target_samples)
    # At least one row per group, capped at the group size so sample()
    # cannot be asked for more rows than exist.
    n = min(max(n, 1), len(x))
    return x.sample(n=n, random_state=42)
|
||||
"""
|
||||
get_proportional_sample()
|
||||
|
||||
"""
|
||||
|
||||
"""
|
||||
original_distribution_sample()
|
||||
The main sampling method for our labelling as it
|
||||
keeps composition of the original uber dataset
|
||||
which is a fairer comparison, may also work better in general
|
||||
|
||||
inputs:
|
||||
|
||||
outputs:
|
||||
|
||||
"""
|
||||
def original_distribution_sample(self):
    """Sample reviews so that ratings follow the fixed original distribution.

    The per-rating share of ``target_samples`` is hard-coded below. When a
    rating has fewer reviews than requested, all of them are taken and a
    notice is printed.

    Returns:
        pd.DataFrame: the per-rating samples concatenated in the order
        5, 1, 4, 3, 2 with a fresh index.
    """
    rating_shares = ((5, 0.571), (1, 0.265), (4, 0.078), (3, 0.047), (2, 0.039))
    original_dist = {
        rating: int(share * self.target_samples)
        for rating, share in rating_shares
    }
    print("Target Distribution =", original_dist)

    ratings = self.data[self.stratify_column]
    picked = []
    for rating, wanted in original_dist.items():
        pool = self.data[ratings == rating]
        available = len(pool)
        if available < wanted:
            print("Missing samples available for rating")
            wanted = available
        picked.append(pool.sample(n=wanted, random_state=42))

    return pd.concat(picked, ignore_index=True)
|
||||
|
||||
"""
|
||||
sample_with_keywords()
|
||||
|
||||
In order to train on more bugs and features data in
|
||||
future this method was created
|
||||
- 2000 balanced by rating (400 per)
|
||||
- 1500 likely bugs using bug_keywords list
|
||||
- 1500 likely features using feature_keywords list
|
||||
|
||||
inputs:
|
||||
outputs:
|
||||
|
||||
"""
|
||||
|
||||
def sample_with_keywords(self):
    """Build a sample boosted towards likely bug reports and feature requests.

    The sample is made of three disjoint parts:
        - up to 400 reviews per rating (2000 for five ratings),
        - up to 1500 further reviews containing a bug keyword,
        - up to 1500 further reviews containing a feature keyword.

    Keywords are matched as plain substrings of the lower-cased review.
    ``self.data`` is not modified. Its index is assumed to be unique, which
    holds for the default index produced by ``pd.read_csv``.

    Returns:
        pd.DataFrame: the combined sample with the same columns as
        ``self.data`` and a fresh index.
    """
    # TODO add keywords for feature classification
    rule = "=" * 50
    print(f"\n{rule}")
    print("Keyword influenced / rating stratified set")
    print(f"\n{rule}")

    bug_keywords = ["crash","freeze", "error",
                    "stop", "doesnt work", "doesn't work","loading",
                    "blank", "stuck", "load", "broken", "break",
                    "glitch", "issue", "fix", "needs","please repair",
                    "failed", "responding"
                    ]
    feature_keywords = ["need","should","add","wish","would","benefit",
                        "please add","should have", "want", "missing",
                        "require", "suggestion", "request", "could you",
                        "include", "hope", "why not", "greatly", "option",
                        "new","system"
                        ]

    def mentions_any(keywords):
        """Return a boolean Series: does the review contain any keyword?"""
        return self.data['review'].apply(
            lambda text: any(keyword in str(text).lower() for keyword in keywords)
        ).astype(bool)

    # The flags are kept in local Series so self.data gains no helper columns.
    likely_bug = mentions_any(bug_keywords)
    likely_feature = mentions_any(feature_keywords)
    print(f"Reviews with bug_keywords = {likely_bug.sum():,}")
    print(f"Reviews with feature_keywords = {likely_feature.sum():,}")

    print("Sampling 2000 reviews balanced (400 per rating)...")
    # Sample group by group and keep the original index: it is needed below
    # to exclude rows that were already drawn, and iterating the groups
    # keeps the rating column in the result.
    per_rating = [
        group.sample(n=min(400, len(group)), random_state=42)
        for _, group in self.data.groupby(self.stratify_column)
    ]
    base_sample = pd.concat(per_rating) if per_rating else self.data.iloc[0:0]
    in_base = self.data.index.isin(base_sample.index)

    print("Sampling 1500 possible bug reports...")
    bugs = self.data[likely_bug & ~in_base]
    bug_sample = bugs.sample(n=min(1500, len(bugs)), random_state=42)
    in_bugs = self.data.index.isin(bug_sample.index)

    print("Sampling 1500 possible feature requests...")
    features = self.data[likely_feature & ~in_base & ~in_bugs]
    feature_sample = features.sample(n=min(1500, len(features)), random_state=42)

    # Combine all samples
    keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)

    print(f"\n Total samples: {len(keyword_sample):,}")
    return keyword_sample
|
||||
|
||||
def sample_tiny_size(self):
    """Return up to 200 random reviews, for reading some samples manually.

    No random seed is set, so each call draws a different sample. Datasets
    with fewer than 200 rows are returned in full (in random order) instead
    of raising.
    """
    sample_size = min(200, len(self.data))
    mini_sample = self.data.sample(sample_size)
    return mini_sample
|
||||
|
||||
|
||||
|
||||
def save_sample(self, sample_df, output_path):
    """Write the sample to CSV and print its size and rating distribution.

    Args:
        sample_df: The sampled reviews; must contain the stratify column.
        output_path: Destination CSV path (written without the index).
    """
    sample_df.to_csv(output_path, index=False)

    banner = '=' * 50
    total = len(sample_df)
    print(f"\n{banner}")
    print("SAMPLE SAVED")
    print(f"{banner}")
    print(f"Location: {output_path}")
    print(f"Total samples: {total:,}")
    print(f"\nDistribution:")

    ratings = sample_df[self.stratify_column]
    for rating in sorted(ratings.unique()):
        count = (ratings == rating).sum()
        pct = count / total * 100
        print(f" {rating}★: {count:,} ({pct:.1f}%)")
|
||||
|
||||
def main():
    """Ask which sampling strategy to run, then build and save that sample.

    Strategies 1-3 write to the sampled-reviews CSV; strategy 4 writes a
    small scratch sample to a separate file. An unrecognised choice prints a
    message and saves nothing.
    """
    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)

    # choice -> (sampling method, output path)
    strategies = {
        '1': (sampler.get_stratified_sample, "multitag/data/uber_reviews_sampled.csv"),
        '2': (sampler.original_distribution_sample, "multitag/data/uber_reviews_sampled.csv"),
        '3': (sampler.sample_with_keywords, "multitag/data/uber_reviews_sampled.csv"),
        '4': (sampler.sample_tiny_size, "multitag/data/uber_review_temp.csv"),
    }

    # Choose sampling strategy
    print(f"\n{'='*50}")
    print("SAMPLING STRATEGY OPTIONS")
    print(f"{'='*50}")
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    print("3. sample_with_keywords() balanced by rating, boosted with bug/feature keyword matches")
    print("4. sample_tiny_size() 200 random reviews for manual reading")

    choice = input("\nEnter choice (1-4): ").strip()

    if choice not in strategies:
        # Say so instead of exiting silently without writing a file.
        print(f"Unknown choice {choice!r}; nothing was sampled.")
        return

    build_sample, output_path = strategies[choice]
    sampler.save_sample(build_sample(), output_path)


if __name__ == "__main__":
    main()
|
||||
0
src/train.py
Normal file
0
src/train.py
Normal file
Reference in New Issue
Block a user