House Cleaning

This commit is contained in:
charlie-rasberry
2026-01-28 16:41:27 +00:00
parent 6cf36faf64
commit 8d3dee6d30
10 changed files with 150 additions and 483 deletions

0
src/infer.py Normal file
View File

307
src/multitag.py Normal file
View File

@@ -0,0 +1,307 @@
# multitag.py
# This app enables manual annotation of reviews in the Uber dataset. The labels are
# used to train a multi-task deep learning model that classifies reviews.
import tkinter as tk
from tkinter import ttk
import pandas as pd
# import langdetect
import os
class MultiTag:
    """Tkinter tool for hand-labelling Uber reviews.

    Every review receives four labels, entered left to right: feature request
    (yes/no), bug report (yes/no), aspect and aspect sentiment. Pressing Enter
    on a fully labelled review marks it as tagged, writes the CSV and shows the
    next untagged review. Pressing ``q`` saves and closes the window.

    Constructing the class builds the window and blocks inside the Tk main loop
    until the window is destroyed.
    """

    # Working copy that receives the labels, and the sample it is seeded from.
    TAGGED_PATH = "multitag/data/uber_reviews_tagged.csv"
    SAMPLED_PATH = "multitag/data/uber_reviews_sampled.csv"
    # Columns that must all be filled before a review can be submitted.
    LABEL_COLUMNS = ("feature_request", "bug_report", "aspect", "aspect_sentiment")
    # Grid row of the coloured marker shown under the active column.
    HIGHLIGHT_ROW = 10

    def __init__(self):
        """Load the review data, build the widgets and enter the main loop."""
        self.binary_map = {
            '1': 'Yes',
            '0': 'No'
        }
        # J/K/L are unused; a new aspect also needs HIGHLIGHT_ROW moved down,
        # because aspect buttons occupy one grid row each starting at row 4.
        self.aspect_map = {
            'A': 'Driver',
            'S': 'App',
            'D': 'Pricing',
            'F': 'Service',
            'G': 'Payment',
            'H': 'General'
        }
        self.sentiment_map = {
            'A': 'Positive',
            'S': 'Neutral',
            'D': 'Negative'
        }
        self.root = tk.Tk()
        self.active_column = 0  # column currently waiting for input (0-3)
        self.btn_width = 15  # button width
        self.number_of_aspects = 6  # number of aspect buttons
        self.root.title("MultiTag")
        # Colors for the highlight marker
        self.color_incomplete = "#003366"
        self.color_complete = "#00AA00"

        # Seed the working copy from the sampled file on first run.
        if not os.path.exists(self.TAGGED_PATH):
            print(f"Tagged file did not exist, making one at: {self.TAGGED_PATH}")
            sampled_df = pd.read_csv(self.SAMPLED_PATH, low_memory=False)
            sampled_df.to_csv(self.TAGGED_PATH, index=False)
        self.load_review_data(self.TAGGED_PATH)

        # =============== GUI Elements ====================
        # ROW 0: Progress indication
        self.progress_label = ttk.Label(
            self.root,
            text="Loading...",
            font=("Arial", 12, "bold")
        )
        self.progress_label.grid(row=0, column=0, columnspan=4, pady=(5, 0))
        # ROW 1: Review display
        self.display_review = tk.Text(self.root, height=18, width=100, wrap='word', font=("Arial", 11))
        self.display_review.grid(row=1, column=0, columnspan=4, padx=10, pady=10)
        # ROW 2: Status label
        self.status_label = ttk.Label(
            self.root,
            text="Fill in all fields...",
            font=("Arial", 10),
            foreground="gray"
        )
        self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
        # ROW 3: Column headers. The key hints are built from the maps so they
        # only ever list keys that are actually enabled.
        headers = (
            "Feature Request ? 1 (yes), 0 (no)",
            "Bug Report ? 1 (yes), 0 (no)",
            f"Aspect ? {'/'.join(self.aspect_map)}",
            f"Aspect Sentiment ? {'/'.join(self.sentiment_map)}",
        )
        for column, text in enumerate(headers):
            ttk.Label(self.root, text=text).grid(row=3, column=column, pady=(5, 2))

        # ROW 4+: Buttons. grid() returns None, so each widget is created first
        # and placed afterwards; the attributes then hold the real buttons.
        self.feature_true = self._add_button("1", 4, 0, lambda: self.feature_pressed("1"))
        self.feature_false = self._add_button("0", 5, 0, lambda: self.feature_pressed("0"))
        self.bug_true = self._add_button("1", 4, 1, lambda: self.bug_pressed("1"))
        self.bug_false = self._add_button("0", 5, 1, lambda: self.bug_pressed("0"))
        # Aspect buttons end up as self.aspect_a ... self.aspect_h.
        for offset, (key, label) in enumerate(self.aspect_map.items()):
            button = self._add_button(
                f"{key}: {label}", 4 + offset, 2,
                lambda key=key: self.aspect_pressed(key),  # bind key now, not at click time
            )
            setattr(self, f"aspect_{key.lower()}", button)
        # Sentiment buttons end up as self.aspect_positive / _neutral / _negative.
        for offset, (key, label) in enumerate(self.sentiment_map.items()):
            button = self._add_button(
                f"{key}: {label}", 4 + offset, 3,
                lambda key=key: self.sentiment_pressed(key),
            )
            setattr(self, f"aspect_{label.lower()}", button)

        # Highlight box - positioned below the buttons
        self.highlight = tk.Frame(self.root, bg=self.color_incomplete, height=20, width=130)
        self.highlight.grid(row=self.HIGHLIGHT_ROW, column=0, pady=(5, 5))

        # Key bindings
        self.root.bind("q", self.quit_app)
        self.root.bind("<Return>", self.try_submit)
        shortcut_keys = {
            *self.binary_map,
            *(key.lower() for key in self.aspect_map),
            *(key.lower() for key in self.sentiment_map),
        }
        for key in sorted(shortcut_keys):
            self.root.bind(key, self.handle_key)

        self.display_next_review()
        self.root.mainloop()

    def _add_button(self, text, row, column, command):
        """Create a button, place it on the grid and return the widget."""
        button = ttk.Button(self.root, text=text, command=command, width=self.btn_width)
        button.grid(row=row, column=column, pady=2)
        return button

    def _has_current_review(self):
        """Return True while current_review_index points at an existing row."""
        return 0 <= self.current_review_index < len(self.review_data)

    def _set_label(self, column, label):
        """Store ``label`` for the current review; return False if there is none.

        The guard matters because assigning through ``.at`` with an index past
        the end would append a new row instead of failing.
        """
        if not self._has_current_review():
            return False
        self.review_data.at[self.current_review_index, column] = label
        return True

    def handle_key(self, event):
        """Route a key press to the setter of the currently active column."""
        key = event.char
        if not key:
            # Some key events carry no character; '' would otherwise pass a
            # substring test and fail later as a dictionary lookup.
            return
        # Column 0 or 1: feature/bug (1 and 0)
        if key in self.binary_map:
            if self.active_column == 0:
                self.feature_pressed(key)
            elif self.active_column == 1:
                self.bug_pressed(key)
            return
        label_key = key.upper()
        # Column 2: aspects
        if self.active_column == 2 and label_key in self.aspect_map:
            self.aspect_pressed(label_key)
        # Column 3: sentiment
        elif self.active_column == 3 and label_key in self.sentiment_map:
            self.sentiment_pressed(label_key)

    def update_status(self):
        """Update status label and highlight color based on completion state."""
        if self.all_labels_complete():
            self.highlight.configure(bg=self.color_complete)
            self.status_label.configure(
                text="Complete Tag [ENTER] | Quit [q]",
                foreground="green",
                font=("Arial", 10, "bold")
            )
        else:
            self.highlight.configure(bg=self.color_incomplete)
            self.status_label.configure(
                text="Fill in all fields...",
                foreground="gray",
                font=("Arial", 10)
            )

    def update_progress(self):
        """Update the progress counter from the ``tagged`` column."""
        tagged_count = (self.review_data['tagged'] == 1).sum()
        total_count = len(self.review_data)
        remaining = total_count - tagged_count
        progress_text = f"Progress: {tagged_count} / {total_count} tagged ({remaining} remaining)"
        self.progress_label.configure(text=progress_text)

    def move_highlight(self, col):
        """Move the highlight box under column ``col`` and refresh the status."""
        self.highlight.grid(row=self.HIGHLIGHT_ROW, column=col, pady=(5, 5))
        self.update_status()

    # Setters
    def feature_pressed(self, value):
        """Record the feature-request answer ('1' or '0') and advance to column 1."""
        if self._set_label("feature_request", self.binary_map[value]):
            self.active_column = 1
            self.move_highlight(1)

    def bug_pressed(self, value):
        """Record the bug-report answer ('1' or '0') and advance to column 2."""
        if self._set_label("bug_report", self.binary_map[value]):
            self.active_column = 2
            self.move_highlight(2)

    def aspect_pressed(self, value):
        """Record the aspect for an aspect_map key and advance to column 3."""
        if self._set_label("aspect", self.aspect_map[value]):
            self.active_column = 3
            self.move_highlight(3)

    def sentiment_pressed(self, value):
        """Record the sentiment for a sentiment_map key and wrap back to column 0."""
        if self._set_label("aspect_sentiment", self.sentiment_map[value]):
            self.active_column = 0  # Reset for next review
            self.update_status()

    def load_review_data(self, data_path):
        """Load review data from a CSV file into ``self.review_data``.

        Missing bookkeeping columns are created. Empty label cells are read
        back from CSV as NaN, so they are normalised to "" (and ``tagged`` to
        integer 0/1) to keep the emptiness checks and later string
        assignments working after a restart.
        """
        self.review_data = pd.read_csv(data_path, low_memory=False)
        if "tagged" not in self.review_data.columns:
            self.review_data["tagged"] = 0
        self.review_data["tagged"] = self.review_data["tagged"].fillna(0).astype(int)
        for column in self.LABEL_COLUMNS:
            if column not in self.review_data.columns:
                self.review_data[column] = ""
            values = self.review_data[column].astype(object)
            self.review_data[column] = values.where(values.notna(), "")
        print(f"Loaded {len(self.review_data)} reviews from {data_path}")

    def display_next_review(self):
        """Show the first untagged review, or a finished state if none is left."""
        self.current_review_index = self.get_current_review_index()
        self.active_column = 0  # reset to start at feature request
        self.display_review.delete(1.0, tk.END)  # Clear the text box
        self.highlight.grid(row=self.HIGHLIGHT_ROW, column=0, pady=(5, 5))

        if not self._has_current_review():
            print("No more reviews to display. DONE ☉ ‿ ⚆")
            self.highlight.configure(bg=self.color_complete)
            self.status_label.configure(
                text="All reviews tagged | Quit [q]",
                foreground="green",
                font=("Arial", 10, "bold")
            )
            self.update_progress()
            return

        # Drop labels left half-entered by an earlier, interrupted session.
        for column in self.LABEL_COLUMNS:
            self.review_data.at[self.current_review_index, column] = ""
        review_text = self.review_data.at[self.current_review_index, "review"]
        self.display_review.insert(tk.END, review_text)
        self.update_status()  # labels were just cleared, so this shows "incomplete"
        self.update_progress()

    def submit_tag(self):
        """Mark the current review as tagged, save, and move to the next one."""
        self.review_data.at[self.current_review_index, "tagged"] = 1
        self.save_tags(self.TAGGED_PATH)
        self.display_next_review()

    def try_submit(self, event):
        """Try to submit current review if all labels complete."""
        if not self._has_current_review():
            return  # everything is tagged already; Enter has nothing to submit
        if self.all_labels_complete():
            # submit_tag() advances current_review_index, so remember which
            # review is being submitted before calling it.
            submitted_index = self.current_review_index
            self.submit_tag()
            print(f"Review {submitted_index + 1} tagged")
        else:
            print(" ☠ Complete all fields first! ☠ ")
            self.status_label.configure(
                text=" ☠ Complete all fields first! ☠ ",
                foreground="red",
                font=("Arial", 10, "bold")
            )
            # Restore the normal status text after two seconds.
            self.root.after(2000, self.update_status)

    def all_labels_complete(self):
        """Return True when every label of the current review is filled in.

        Returns False when there is no current review. NaN counts as empty,
        because that is how blank cells come back from CSV.
        """
        if not self._has_current_review():
            return False
        row = self.review_data.iloc[self.current_review_index]
        return all(
            not pd.isna(row[column]) and row[column] != ""
            for column in self.LABEL_COLUMNS
        )

    def save_tags(self, save_path):
        """Save the tagged data to a CSV file."""
        self.review_data.to_csv(save_path, index=False)

    def quit_app(self, event):
        """Save the data, print a session summary and close the window."""
        # Save first so the "Saved to" line below is only printed once it is true.
        self.save_tags(self.TAGGED_PATH)
        tagged_count = (self.review_data['tagged'] == 1).sum()
        banner = '=' * 50
        print(f"\n{banner}")
        print("SESSION COMPLETE")
        print(banner)
        print(f"Total tagged: {tagged_count} / {len(self.review_data)}")
        print(f"Saved to: {self.TAGGED_PATH}")
        print("Bye (ʘ‿ʘ)╯")
        self.root.destroy()

    def get_current_review_index(self):
        """Return the position of the first untagged review.

        Returns ``len(self.review_data)`` when every review is tagged.
        """
        untagged = (self.review_data["tagged"] == 0).to_numpy()
        if not untagged.any():
            return len(self.review_data)  # all reviews tagged
        return int(untagged.argmax())  # argmax gives the first True
# Start the annotation GUI only when this file is run as a script, so that
# importing the module (for reuse or testing) does not open a window.
if __name__ == "__main__":
    app = MultiTag()

176
src/preprocess.py Normal file
View File

@@ -0,0 +1,176 @@
# preprocess.py
import pandas as pd
import re
from langdetect import detect, LangDetectException
def clean_text(text):
    """Normalise a raw review string.

    The text is lower-cased, then URLs and e-mail addresses are removed, runs
    of '.', '!' and '?' are collapsed to a single character, and whitespace is
    squeezed to single spaces with the ends trimmed.

    Input:
        text - the review text to clean (missing values are accepted)
    Outputs:
        str: the cleaned review text, "" for a missing value
    """
    if pd.isna(text):
        return ""

    # Applied strictly in this order: removals first, whitespace squeeze last
    # so the gaps left by removed URLs/e-mails are collapsed too.
    substitutions = (
        (r'http\S+|www\S+', ''),  # URLs
        (r'\S+@\S+', ''),         # e-mail addresses
        (r'\.{2,}', '.'),         # repeated full stops
        (r'!{2,}', '!'),          # repeated exclamation marks
        (r'\?{2,}', '?'),         # repeated question marks
        (r'\s+', ' '),            # any whitespace run
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def detect_language(text):
    """Detect the language of ``text``.

    Returns 'unknown' for missing values, for text shorter than 10 characters
    after trimming, and when the detector raises LangDetectException.
    Otherwise returns whatever ``detect`` returns for the text.
    """
    # Checked before calling the detector; these checks cannot raise
    # LangDetectException themselves, so they can sit outside the try block.
    too_short = pd.isna(text) or len(str(text).strip()) < 10
    if too_short:
        return 'unknown'
    try:
        return detect(str(text))
    except LangDetectException:
        return 'unknown'
def preprocess_uber_reviews(input_path, output_path, min_words=5):
    """Load, clean and filter the raw Uber reviews, then save the result.

    - No language detection due to unreliability on short informal text
    - Data is labelled as English, but contains non-English text
    - Does not handle a missing input file
    - Assumes there are columns named "review_description" and "rating"

    Steps:
    1. Load from csv with pd.read_csv()
    2. Remove rows with missing descriptions
    3. Clean text with clean_text() and compute a word count per review
    4. Remove reviews shorter than ``min_words`` words
    5. Remove duplicate reviews (compared on the cleaned text)
    6. Save the cleaned dataset to ``output_path``

    Inputs:
        input_path (str): Path to uber_reviews.csv
        output_path (str): Path of the cleaned CSV to write
        min_words (int): Minimum number of words a review must have to be kept
    Outputs:
        pd.DataFrame: cleaned reviews with columns review, rating, word_count
    """
    banner = "=" * 50
    print(banner)
    print("PREPROCESSING UBER REVIEWS")
    print(banner)

    # 1. Load data
    print("\n1. Loading data...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" Original size: {len(df):,} reviews")

    # 2. Remove missing reviews
    print("\n2. Removing missing reviews...")
    df = df.dropna(subset=['review_description'])
    print(f" After removing nulls: {len(df):,} reviews")

    # 3. Clean text and count words
    print("\n3. Cleaning text...")
    df['review_clean'] = df['review_description'].apply(clean_text)
    df['word_count'] = df['review_clean'].str.split().str.len()

    # 4. Remove short reviews
    print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {min_words})...")
    # 1 word reviews provide little to draw conclusions from and bloat the
    # dataset a lot, nearly 50% of reviews!
    before = len(df)
    df = df[df['word_count'] >= min_words]
    removed = before - len(df)
    removed_pct = removed / before * 100 if before else 0.0  # nothing to divide by on an empty input
    print(f" Removed: {removed:,} reviews ({removed_pct:.1f}%)")
    print(f" Remaining: {len(df):,} reviews")

    # 5. Remove duplicates
    print("\n5. Removing duplicates...")
    before = len(df)
    df = df.drop_duplicates(subset=['review_clean'])
    removed = before - len(df)
    print(f" Removed: {removed:,} duplicates")
    print(f" Remaining: {len(df):,} reviews")

    # Final dataset: keep only the columns used downstream
    df_clean = df[['review_clean', 'rating', 'word_count']].copy()
    df_clean.rename(columns={'review_clean': 'review'}, inplace=True)
    df_clean = df_clean.reset_index(drop=True)

    # 6. Save
    print(f"\n6. Saving to {output_path}...")
    df_clean.to_csv(output_path, index=False)

    # Summary
    print("\n" + banner)
    print("PREPROCESSING COMPLETE")
    print(banner)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Quality filters: word_count >= {min_words}, duplicates removed")
    # While this does remove some legitimate reviews that would be useful for
    # classification, it leaves a higher share of useful reviews. After
    # comparing limits of 1, 2, 3, 4 and 5, this one kept the most informative
    # reviews without seeming excessive in data removal.
    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = count / len(df_clean) * 100
        # Plain marker instead of repeating a string `rating` times, which
        # fails when the rating column was read as float.
        print(f" {rating}★: {count:,} ({percentage:.1f}%)")

    print("\nWord count statistics:")
    print(f" Mean: {df_clean['word_count'].mean():.1f} words")
    print(f" Median: {df_clean['word_count'].median():.1f} words")
    print(f" Min: {df_clean['word_count'].min()} words")
    print(f" Max: {df_clean['word_count'].max()} words")

    print("\nVerify New Data:")
    short_reviews = int((df_clean['word_count'] < min_words).sum())
    print(f" Short reviews: {short_reviews}")
    print(f" Null values: {df_clean.isnull().sum().to_dict()}")
    print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
    # Language detection takes 5+ minutes, so it is not run here; apply
    # detect_language to df_clean['review'] to inspect detected languages.

    # Sample reviews from each rating
    print("\n" + banner)
    print("SAMPLE CLEANED REVIEWS")
    print(banner)
    for rating in [1, 2, 3, 4, 5]:
        rated = df_clean[df_clean['rating'] == rating]
        if rated.empty:
            continue
        sample = rated.sample(min(2, len(rated)))
        print(f"\n{rating}★ REVIEWS:")
        for _, row in sample.iterrows():
            print(f" • ({row['word_count']} words) {row['review'][:100]}")

    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
    print(" ...Manual annotation phase will identify any non-English reviews")
    return df_clean
# Script entry point: clean the raw dataset at its default location.
if __name__ == "__main__":
    input_file = "multitag/data/uber_reviews.csv"
    output_file = "multitag/data/uber_reviews_cleaned.csv"
    df_clean = preprocess_uber_reviews(
        input_file,
        output_file,
    )
    print("\nPreprocessing complete!")
    print(f"Clean dataset: {len(df_clean):,} reviews ready for sampling")

238
src/sampler.py Normal file
View File

@@ -0,0 +1,238 @@
# TODO: Add verification comparison between ratings
# TODO: Clean up the logging print statements
import pandas as pd
import numpy as np
# Print the library versions in use; this runs every time the module is imported.
print(pd.__version__)
print(np.__version__)
# Default dataset locations. `path` and `sampled_path` are not referenced by the
# code visible in this file (main() spells the same paths out as literals);
# `original_path` is read by Sampler.__init__.
path = "multitag/data/uber_reviews_cleaned.csv"
sampled_path = "multitag/data/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
class Sampler:
    """Draws labelling samples from the cleaned Uber review dataset.

    Strategies:
    - get_stratified_sample(): proportional to the rating distribution of the
      cleaned data
    - original_distribution_sample(): proportional to the rating distribution
      of the raw dataset
    - sample_with_keywords(): rating-balanced base set plus reviews that
      mention bug / feature keywords
    - sample_tiny_size(): a small random set for manual reading
    """

    # Rating shares of the raw dataset:
    #   5  57.1% (611133)
    #   1  26.5% (283895)
    #   4   7.8% (82953)
    #   3   4.7% (49928)
    #   2   3.9% (41707)
    # Kept as constants even though they could be recomputed from
    # original_data; they do not reflect the distribution after preprocessing.
    ORIGINAL_RATING_SHARES = {5: 0.571, 1: 0.265, 4: 0.078, 3: 0.047, 2: 0.039}

    def __init__(self, data_path, target_samples):
        """Load the cleaned and the raw dataset and print their distributions.

        Inputs:
            data_path: CSV with the cleaned reviews to sample from
            target_samples: number of reviews the proportional strategies aim for
        """
        self.data_path = data_path
        self.target_samples = target_samples  # target number of samples
        # Column to stratify by (sample_with_keywords adds keyword boosting to
        # raise the number of feature requests / bug reports).
        self.stratify_column = "rating"
        self.original_data = pd.read_csv(original_path, low_memory=False)
        self.data = pd.read_csv(self.data_path, low_memory=False)
        self.total = len(self.data)  # total number of records in the dataset
        print("="*50)
        print("SAMPLER INITIALIZED")
        print("="*50,"\n")
        print(f"Total records in dataset: {self.total}")
        print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
        print("Percentage distribution (working data):")
        print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
        _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
        print(f"Original Distribution from {original_path}:")
        print((_origdist*100).round(1),"\n")
        self.data.info()

    def get_stratified_sample(self) -> pd.DataFrame:
        """Sample each rating in proportion to its share of the cleaned data.

        Group sizes are rounded down (with a minimum of one row per rating), so
        the result can hold slightly fewer or more rows than target_samples.
        Rows keep their original index labels and are returned shuffled.
        """
        # Sample per rating group; applying the helper to the whole frame would
        # hand it one column at a time and ignore the rating entirely.
        parts = [self.x(group) for _, group in self.data.groupby(self.stratify_column)]
        if not parts:
            return self.data.iloc[0:0].copy()
        return pd.concat(parts).sample(frac=1, random_state=42)

    def x(self, x):
        """Helper for get_stratified_sample: sample one group proportionally.

        Takes ``len(x) / total * target_samples`` rows (rounded down, at least
        one) and never more rows than the group contains.
        """
        if not self.total:
            return x.sample(n=0)
        n = int(len(x) / self.total * self.target_samples)
        n = min(max(n, 1), len(x))
        return x.sample(n=n, random_state=42)

    def original_distribution_sample(self):
        """Sample following the rating composition of the raw Uber dataset.

        The main sampling method for labelling, as it keeps the composition of
        the original dataset, which is a fairer comparison.

        outputs:
            pd.DataFrame with a fresh 0..n-1 index. A rating with too few
            reviews contributes everything it has.
        """
        original_dist = {
            rating: int(share * self.target_samples)
            for rating, share in self.ORIGINAL_RATING_SHARES.items()
        }
        print("Target Distribution =", original_dist)
        samples = []
        for rating, num_samples in original_dist.items():
            rating_data = self.data[self.data[self.stratify_column] == rating]
            if len(rating_data) < num_samples:
                print(
                    f"Missing samples available for rating {rating}: "
                    f"wanted {num_samples}, found {len(rating_data)}"
                )
                num_samples = len(rating_data)
            samples.append(rating_data.sample(n=num_samples, random_state=42))
        return pd.concat(samples, ignore_index=True)

    def sample_with_keywords(self, *, per_rating=400, n_bugs=1500, n_features=1500):
        """Build a sample enriched with likely bug reports and feature requests.

        With the defaults and five ratings:
        - 2000 balanced by rating (400 per rating)
        - 1500 likely bugs using the bug_keywords list
        - 1500 likely features using the feature_keywords list

        A review is used at most once. Keywords are matched as substrings of
        the lower-cased review. self.data is left unchanged.

        inputs:
            per_rating: reviews to draw per rating for the balanced base set
            n_bugs: maximum number of keyword-matched bug candidates
            n_features: maximum number of keyword-matched feature candidates
        outputs:
            pd.DataFrame with the columns of self.data and a fresh index
        """
        # TODO add keywords for feature classification
        banner = "=" * 50
        print(f"\n{banner}")
        print("Keyword influenced / rating stratified set")
        print(f"\n{banner}")
        bug_keywords = ["crash","freeze", "error",
                        "stop", "doesnt work", "doesn't work","loading",
                        "blank", "stuck", "load", "broken", "break",
                        "glitch", "issue", "fix", "needs","please repair",
                        "failed", "responding"
                        ]
        feature_keywords = ["need","should","add","wish","would","benefit",
                            "please add","should have", "want", "missing",
                            "require", "suggestion", "request", "could you",
                            "include", "hope", "why not", "greatly", "option",
                            "new","system"
                            ]
        # Keep the keyword flags in local Series instead of adding helper
        # columns to self.data.
        reviews = self.data['review'].astype(str).str.lower()
        likely_bug = reviews.apply(
            lambda text: any(keyword in text for keyword in bug_keywords)
        ).astype(bool)
        likely_feature = reviews.apply(
            lambda text: any(keyword in text for keyword in feature_keywords)
        ).astype(bool)
        print(f"Reviews with bug_keywords = {likely_bug.sum():,}")
        print(f"Reviews with feature_keywords = {likely_feature.sum():,}")

        groups = list(self.data.groupby(self.stratify_column))
        print(f"Sampling {per_rating * len(groups)} reviews balanced ({per_rating} per rating)...")
        # Sampling group by group keeps the rating column and the original
        # index labels, which the exclusion steps below rely on.
        base_parts = [
            group.sample(n=min(per_rating, len(group)), random_state=42)
            for _, group in groups
        ]
        base_sample = pd.concat(base_parts) if base_parts else self.data.iloc[0:0]
        taken = self.data.index.isin(base_sample.index)

        print(f"Sampling {n_bugs} possible bug reports...")
        bugs = self.data[likely_bug & ~taken]
        bug_sample = bugs.sample(n=min(n_bugs, len(bugs)), random_state=42)
        taken = taken | self.data.index.isin(bug_sample.index)

        print(f"Sampling {n_features} possible feature requests...")
        features = self.data[likely_feature & ~taken]
        feature_sample = features.sample(n=min(n_features, len(features)), random_state=42)

        # Combine all samples
        keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)
        print(f"\n Total samples: {len(keyword_sample):,}")
        return keyword_sample

    def sample_tiny_size(self):
        """Return up to 200 random reviews for reading some samples manually."""
        return self.data.sample(min(200, len(self.data)))

    def save_sample(self, sample_df, output_path):
        """Save sample and display statistics."""
        sample_df.to_csv(output_path, index=False)
        print(f"\n{'='*50}")
        print("SAMPLE SAVED")
        print(f"{'='*50}")
        print(f"Location: {output_path}")
        print(f"Total samples: {len(sample_df):,}")
        print("\nDistribution:")
        for rating in sorted(sample_df[self.stratify_column].unique()):
            count = (sample_df[self.stratify_column] == rating).sum()
            pct = count / len(sample_df) * 100
            print(f" {rating}★: {count:,} ({pct:.1f}%)")
def main():
    """Ask which sampling strategy to run, then build and save that sample.

    Options 1-3 write to the sampled-reviews CSV; option 4 writes to a separate
    temporary file. Any other input saves nothing and says so.
    """
    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
    sampled_output = "multitag/data/uber_reviews_sampled.csv"
    # choice -> (method that builds the sample, file it is saved to)
    strategies = {
        '1': (sampler.get_stratified_sample, sampled_output),
        '2': (sampler.original_distribution_sample, sampled_output),
        '3': (sampler.sample_with_keywords, sampled_output),
        '4': (sampler.sample_tiny_size, "multitag/data/uber_review_temp.csv"),
    }

    # Choose sampling strategy
    print(f"\n{'='*50}")
    print("SAMPLING STRATEGY OPTIONS")
    print(f"{'='*50}")
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    print("3. sample_with_keywords() balanced by rating, plus reviews matching bug / feature keywords")
    print("4. sample_tiny_size() small random sample for manual reading")
    choice = input("\nEnter choice (1-4): ").strip()

    if choice not in strategies:
        print(f"Unknown choice {choice!r}; nothing was sampled.")
        return
    build_sample, output_path = strategies[choice]
    sampler.save_sample(build_sample(), output_path)


if __name__ == "__main__":
    main()

0
src/train.py Normal file
View File