Added multitag.py (65% complete), preprocess.py (complete), sampler.py (80% complete)

This commit is contained in:
2025-11-09 01:45:09 +00:00
parent 4d6e2511e6
commit a178284ffc
5 changed files with 179 additions and 1517 deletions

2
.gitignore vendored
View File

@@ -1,2 +1,4 @@
multitag/data/*.csv multitag/data/*.csv
multitag/raw_data/ multitag/raw_data/
multitag/.ipynb_checkpoints
multitag/.vscode

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,14 @@
# TODO: Refactor, especially change the expected names, as I jumped the gun when first making this without sampling properly
# TODO: Add button labels and finalise the categories of aspects
# TODO: Ensure there is a persistent progress tracking implementation before labelling
# TODO: Finalise keybinds
# TODO: Display progress e.g. review 1020 of 5000
# TODO: Validate saving progress
# TODO: Loop instead of pressing enter
# TODO: Autosave ? / confirm quit at least
# TODO: More visual q's
import tkinter as tk import tkinter as tk
from tkinter import ttk from tkinter import ttk
import pandas as pd import pandas as pd

View File

@@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path):
df['word_count'] = df['review_clean'].str.split().str.len() df['word_count'] = df['review_clean'].str.split().str.len()
# 5. Remove short reviews # 5. Remove short reviews
review_length_limit = 5 review_length_limit = 5 ### limit review length ###
print(f"\n4. Removing short reviews (< {review_length_limit})...") print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...")
print(" Rationale: Insufficient context for classification") # 1 word reviews provide little to draw conclusions from and bloat the
# dataset a lot, nearly 50% of reviews!
# display changes
before = len(df) before = len(df)
df = df[df['word_count'] >= review_length_limit] df = df[df['word_count'] >= review_length_limit]
removed = before - len(df) removed = before - len(df)
@@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path):
print("PREPROCESSING COMPLETE") print("PREPROCESSING COMPLETE")
print("="*50) print("="*50)
print(f"\nFinal dataset: {len(df_clean):,} reviews") print(f"\nFinal dataset: {len(df_clean):,} reviews")
print(f"Data source: Indian Uber market (predominantly English)") print(f"Quality filters: word_count >= 5, duplicates removed")
print(f"Quality filters: word_count >= 5, duplicates removed") # while this does remove a some legitimate reviews which would provide use in classification
# it also allows us to find a higher total amount of useful reviews. After seeing the results of limits 1, 2, 3, 4 and 5,
# a limit of 5 showed the largest amount of informative reviews without seeming excessive in data removal
print("\nRating distribution:") print("\nRating distribution:")
rating_dist = df_clean['rating'].value_counts().sort_index() rating_dist = df_clean['rating'].value_counts().sort_index()
@@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path):
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}") print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
print(f" Null values: {df_clean.isnull().sum().to_dict()}") print(f" Null values: {df_clean.isnull().sum().to_dict()}")
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}") print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
# lang detection takes 5+ mins # lang detection takes 5+ mins so leaving it commented for now
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language) #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}") #print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
@@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path):
if len(df_clean[df_clean['rating'] == rating]) > 0: if len(df_clean[df_clean['rating'] == rating]) > 0:
sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating]))) sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
print(f"\n{rating} {"" * rating} REVIEWS:") print(f"\n{rating} {"" * rating} REVIEWS:")
for idx, row in sample.iterrows(): for index, row in sample.iterrows():
print(f" • ({row['word_count']} words) {row['review'][:100]}") print(f" • ({row['word_count']} words) {row['review'][:100]}")
# Note about language # Note about language
print("Language detection not applied due to unreliability on short") print("Language detection not applied due to unreliability on short")
print("informal text. Dataset is from the Indian market, labeled as English.") print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
print("Manual annotation phase will identify any non-English reviews. And put aside.") print(" ...Manual annotation phase will identify any non-English reviews")
return df_clean return df_clean

View File

@@ -1,22 +1,45 @@
# TODO: Fix get_stratified_sample() replace broken x() with actual working logic
# TODO: Add verification comparison between ratings
# TODO: implement sample_with_keywords() add to lists, and implement logic
# TODO: Clean up the logging print statements
import pandas as pd import pandas as pd
import numpy as np import numpy as np
print(pd.__version__) print(pd.__version__)
print(np.__version__) print(np.__version__)
path = "data/uber_reviews.csv" path = "multitag/data/uber_reviews_cleaned.csv"
sampled_path = "data/uber_reviews_sampled.csv" sampled_path = "multitag/data/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
class Sampler: class Sampler:
def __init__(self, data_path): def __init__(self, data_path, target_samples):
self.data_path = data_path self.data_path = data_path
self.target_samples = 5000 # target number of samples
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
self.original_data = pd.read_csv(original_path, low_memory=False)
self.data = pd.read_csv(self.data_path, low_memory=False) self.data = pd.read_csv(self.data_path, low_memory=False)
self.total = len(self.data) # total number of records in the dataset self.total = len(self.data) # total number of records in the dataset
self.target_samples = 5000 # target number of samples
self.stratify_column = "rating" # column to stratify by
print("="*50)
print("SAMPLER INITIALIZED")
print("="*50,"\n")
print(f"Total records in dataset: {self.total}")
print(f"Data loaded from {self.data_path}, total records: {len(self.data)}") print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
print(self.data.head()) #print(self.data.head())
#print(f"\nCurrent distribution:")
#print(self.data[self.stratify_column].value_counts().sort_index())
#print(f"\nColumns: {self.data.columns.tolist()}")
print(f"Percentage distribution (working data):")
print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
_origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
print(f"Original Distribution from {original_path}:")
print((_origdist*100).round(1),"\n")
self.data.info() self.data.info()
@@ -31,36 +54,128 @@ class Sampler:
2 3.9% (41707) 2 3.9% (41707)
Name: proportion, dtype: object Name: proportion, dtype: object
""" """
def get_stratified_sample(self):
stratified_sample = self.data.groupby(self.stratify_column).apply(
lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)),
# include_groups=False
)
return stratified_sample
sampler = Sampler("data/uber_reviews.csv")
to_sample = input("Do you want to create a stratified sample of the data? (y/n): ")
if to_sample == 'y':
sampled = sampler.get_stratified_sample()
sampled.to_csv("data/uber_reviews_sampled.csv", index=False)
print("Original columns:", sampler.data.columns.tolist())
print("Sampled columns:", sampled.columns.tolist())
print("Stratified sample saved to data/uber_reviews_sampled.csv")
elif to_sample == 'n':
sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False)
"""
debug to check sampled data matches original columns
print("Original columns:", sampler.data.columns.tolist())
print("Sampled columns:", sampled_data.columns.tolist())
""" """
print("Original data distribution:") Sample size by rating
print(sampler.data["rating"].value_counts()) Redundant calculation, kept for clarity
print("Sampled data distribution:") Doesn't factor that the distribution changed greatly after preprocessing
print(sampled_data["rating"].value_counts())
else: """
print("Invalid input, please enter 'y' or 'n'") def get_stratified_sample(self) -> pd.Series:
stratified_sample = self.data.groupby(self.stratify_column).apply(self.x)
return stratified_sample
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
def x(self, ):
return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples))
"""
get_proportional_sample()
"""
"""
original_distribution_sample()
The main sampling method for our labelling as it
keeps composition of the original uber dataset
which is a fairer comparison, may also work better in general
inputs:
outputs:
"""
def original_distribution_sample(self):
original_dist = {
5: int(0.571 * self.target_samples),
1: int(0.265 * self.target_samples),
4: int(0.078 * self.target_samples),
3: int(0.047 * self.target_samples),
2: int(0.039 * self.target_samples)
}
print("Target Distribution =", original_dist)
samples = []
for rating, num_samples in original_dist.items():
rating_data = self.data[self.data[self.stratify_column] == rating]
if len(rating_data) < num_samples:
print("Missing samples available for rating")
num_samples = len(rating_data)
sample = rating_data.sample(n = num_samples,random_state=33)
samples.append(sample)
original_sample = pd.concat(samples, ignore_index=True)
return original_sample
"""
sample_with_keywords()
In order to train on more bugs and features data in
future this method was created
- 2000 balanced by rating (400 per)
- 1500 likely bugs using bug_keywords list
- 1500 likely features using feature_keywords list
inputs:
outputs:
"""
def sample_with_keywords():
#TODO add keywords for feature classification
print(f"\n{"="*50}")
print("Keyword influenced / rating stratified set")
print(f"\n{"="*50}")
bug_keywords = ["crash","crashes", "freeze", "freezes", "error",
"stops", "doesnt work", "doesn't work","loading",
"blank", "stuck", "load", "loads", "broken", "breaks",
"glitch", "glitches", "issue", "could you", "fix",
"failed"]
return
def save_sample(self, sample_df,output_path):
"""Save sample and display statistics"""
sample_df.to_csv(output_path, index=False)
print(f"\n{'='*50}")
print("SAMPLE SAVED")
print(f"{'='*50}")
print(f"Location: {output_path}")
print(f"Total samples: {len(sample_df):,}")
print(f"\nDistribution:")
for rating in sorted(sample_df[self.stratify_column].unique()):
count = (sample_df[self.stratify_column] == rating).sum()
pct = count / len(sample_df) * 100
print(f" {rating}★: {count:,} ({pct:.1f}%)")
def main():
sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
# Choose sampling strategy
print(f"\n{'='*50}")
print("SAMPLING STRATEGY OPTIONS")
print(f"{'='*50}")
print("1. get_stratified_sample() stratified by current distribution")
print("2. original_distribution_sample() stratified by the original data distribution")
print("3. get_keyword_boosted_sample() stratified using original distribution but also using a keyword dictionary")
choice = input("\nEnter choice (1-3): ").strip()
if choice == '1':
sample = sampler.get_stratified_sample()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
elif choice == '2':
sample = sampler.original_distribution_sample()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
elif choice == '3':
sample = sampler.get_keyword_boosted_sample()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
if __name__ == "__main__":
main()