Added multitag.py (65% complete), preprocess.py (complete), sampler.py (80% complete)

2025-11-09 01:45:09 +00:00
parent 4d6e2511e6
commit a178284ffc
5 changed files with 179 additions and 1517 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 multitag/data/*.csv
 multitag/raw_data/
 multitag/.ipynb_checkpoints
 multitag/.vscode
--- a/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb
+++ b/.ipynb_checkpoints/datasets_reviews-checkpoint.ipynb
--- a/multitag/multitag.py
+++ b/multitag/multitag.py
@@ -1,3 +1,14 @@
 #   TODO:   Refactor,especially change expected names as I jumped the gun when first making this without sampling properly
 #   TODO:   Add button labels and finalise the categories of aspects
 #   TODO:   Ensure there is a persistent progress tracking implementation before labelling
 #   TODO:   Finalise keybinds
 #   TODO:   Display progress    e.g. review 1020 of 5000
 #   TODO:   Validate saving progress
 #   TODO:   Loop instead of pressing enter
 #   TODO:   Autosave ? / confirm quit at least
 #   TODO:   More visual q's
 import tkinter as tk
 from tkinter import ttk
 import pandas as pd
--- a/multitag/preprocess.py
+++ b/multitag/preprocess.py
@@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path):
    df['word_count'] = df['review_clean'].str.split().str.len()
    # 5. Remove short reviews
-    review_length_limit = 5
+    review_length_limit = 5     ### limit review length ###
-    print(f"\n4. Removing short reviews (< {review_length_limit})...")
+    print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...") 
-    print("   Rationale: Insufficient context for classification")
+    # 1 word reviews provide little to draw conclusions from and bloat the 
    # dataset a lot, nearly 50% of reviews!
    # display changes
    before = len(df)
    df = df[df['word_count'] >= review_length_limit]
    removed = before - len(df)
@@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path):
    print("PREPROCESSING COMPLETE")
    print("="*50)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Data source: Indian Uber market (predominantly English)")
    print(f"Quality filters: word_count >= 5, duplicates removed") 
    # while this does remove some legitimate reviews that would be useful in classification,
    # it also lets us find a higher total amount of useful reviews; after seeing the results of limits 1, 2, 3, 4 and 5,
    # a limit of 5 gave the most informative reviews without seeming excessive in data removal
    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
@@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path):
    print(f"  Short reviews: {df_clean[df_clean['word_count'] < 5]}")
    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
-    # lang detection takes 5+ mins
+    # lang detection takes 5+ mins so leaving it commented for now 
    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
    #print(f"  Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
@@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path):
        if len(df_clean[df_clean['rating'] == rating]) > 0:
            sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
            print(f"\n{rating} {"✭" * rating} REVIEWS:")
-            for idx, row in sample.iterrows():
+            for index, row in sample.iterrows():
                print(f"  • ({row['word_count']} words) {row['review'][:100]}")
    # Note about language
    print("Language detection not applied due to unreliability on short")
-    print("informal text. Dataset is from the Indian market, labeled as English.")
+    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
-    print("Manual annotation phase will identify any non-English reviews. And put aside.")
+    print(" ...Manual annotation phase will identify any non-English reviews")
    return df_clean
--- a/multitag/sampler.py
+++ b/multitag/sampler.py
@@ -1,22 +1,45 @@
 #   TODO:   Fix get_stratified_sample() replace broken x() with actual working logic
 #   TODO:   Add verification comparison between ratings
 #   TODO:   implement sample_with_keywords() add to lists, and implement logic
 #   TODO:   Clean up the logging print statements
 import pandas as pd
 import numpy as np
 print(pd.__version__)
 print(np.__version__)
-path = "data/uber_reviews.csv"
+path = "multitag/data/uber_reviews_cleaned.csv"
-sampled_path = "data/uber_reviews_sampled.csv"
+sampled_path = "multitag/data/uber_reviews_sampled.csv"
 original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
 class Sampler:
-    def __init__(self, data_path):
+    def __init__(self, data_path, target_samples):
        self.data_path = data_path
        self.target_samples = 5000  # target number of samples
        self.stratify_column = "rating"  # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
        self.original_data = pd.read_csv(original_path, low_memory=False)
        self.data = pd.read_csv(self.data_path, low_memory=False)
        self.total = len(self.data)  # total number of records in the dataset
        self.target_samples = 5000  # target number of samples
        self.stratify_column = "rating"  # column to stratify by
        print("="*50)
        print("SAMPLER INITIALIZED")
        print("="*50,"\n")
        print(f"Total records in dataset: {self.total}")
        print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
-        print(self.data.head())
+        #print(self.data.head())
        #print(f"\nCurrent distribution:")
        #print(self.data[self.stratify_column].value_counts().sort_index())
        #print(f"\nColumns: {self.data.columns.tolist()}")
        print(f"Percentage distribution (working data):")
        print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
        _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
        print(f"Original Distribution from {original_path}:")
        print((_origdist*100).round(1),"\n")
        self.data.info()
@@ -31,36 +54,128 @@ class Sampler:
    2     3.9% (41707)
    Name: proportion, dtype: object
    """
    """
-    def get_stratified_sample(self):
+    Sample size by rating
-        stratified_sample = self.data.groupby(self.stratify_column).apply(
+    Redundant calculation, kept for clarity
-            lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)),
+    Doesn't factor that the distribution changed greatly after preprocessing
-            # include_groups=False
+
-    )
+    """
    def get_stratified_sample(self) -> pd.Series:
        stratified_sample = self.data.groupby(self.stratify_column).apply(self.x)
        return stratified_sample
 sampler = Sampler("data/uber_reviews.csv")
-
+    # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
-to_sample = input("Do you want to create a stratified sample of the data? (y/n): ")             
+    def x(self, ):    
-
+        return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples))
 if to_sample == 'y':
    sampled = sampler.get_stratified_sample()
    sampled.to_csv("data/uber_reviews_sampled.csv", index=False)
    print("Original columns:", sampler.data.columns.tolist())
    print("Sampled columns:", sampled.columns.tolist())
    print("Stratified sample saved to data/uber_reviews_sampled.csv")
 elif to_sample == 'n':
    sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False)
    """
-    debug to check sampled data matches original columns
+    get_proportional_sample()
-    print("Original columns:", sampler.data.columns.tolist())
+
    print("Sampled columns:", sampled_data.columns.tolist())
    """
-    print("Original data distribution:")
+    """
-    print(sampler.data["rating"].value_counts())
+    original_distribution_sample()
-    print("Sampled data distribution:")
+    The main sampling method for our labelling as it 
-    print(sampled_data["rating"].value_counts())
+    keeps composition of the original uber dataset
-else:
+    which is a fairer comparison, may also work better in general
-    print("Invalid input, please enter 'y' or 'n'")
+
    inputs:
    outputs:
    """
    def original_distribution_sample(self):
        original_dist = {
            5: int(0.571 * self.target_samples), 
            1: int(0.265 * self.target_samples),  
            4: int(0.078 * self.target_samples),  
            3: int(0.047 * self.target_samples),  
            2: int(0.039 * self.target_samples)   
        }        
        print("Target Distribution =", original_dist)
        samples = []
        for rating, num_samples in original_dist.items():
            rating_data = self.data[self.data[self.stratify_column] == rating]
            if len(rating_data) < num_samples:
                print("Missing samples available for rating")
                num_samples = len(rating_data)
            sample = rating_data.sample(n = num_samples,random_state=33)
            samples.append(sample)
        original_sample = pd.concat(samples, ignore_index=True)
        return original_sample
    """
    sample_with_keywords()
    In order to train on more bugs and features data in 
    future this method was created
    - 2000 balanced by rating (400 per)
    - 1500 likely bugs using bug_keywords list
    - 1500 likely features using feature_keywords list
    inputs:
    outputs:
    """
    def sample_with_keywords():
        #TODO add keywords for feature classification
        print(f"\n{"="*50}")
        print("Keyword influenced / rating stratified set")
        print(f"\n{"="*50}")
        bug_keywords = ["crash","crashes", "freeze", "freezes", "error",
                        "stops", "doesnt work", "doesn't work","loading",
                        "blank", "stuck", "load", "loads", "broken", "breaks",
                        "glitch", "glitches", "issue", "could you", "fix",
                        "failed"]
        return 
    def save_sample(self, sample_df,output_path):
        """Save sample and display statistics"""
        sample_df.to_csv(output_path, index=False)
        print(f"\n{'='*50}")
        print("SAMPLE SAVED")
        print(f"{'='*50}")
        print(f"Location: {output_path}")
        print(f"Total samples: {len(sample_df):,}")
        print(f"\nDistribution:")
        for rating in sorted(sample_df[self.stratify_column].unique()):
            count = (sample_df[self.stratify_column] == rating).sum()
            pct = count / len(sample_df) * 100
            print(f"  {rating}★: {count:,} ({pct:.1f}%)")
 def main():
    """Interactive entry point: pick a sampling strategy and save its sample.

    Builds a ``Sampler`` on the cleaned reviews file, prints the strategy
    menu, reads a choice from stdin and writes the resulting sample with
    ``Sampler.save_sample``. Nothing is saved when the choice is not 1-3 or
    when the chosen strategy returns no sample.
    """
    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
    output_path = "multitag/data/uber_reviews_sampled.csv"

    # Choose sampling strategy
    print(f"\n{'='*50}")
    print("SAMPLING STRATEGY OPTIONS")
    print(f"{'='*50}")
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    # The menu now names the method that actually exists on Sampler.
    print("3. sample_with_keywords() stratified using original distribution but also using a keyword dictionary")

    choice = input("\nEnter choice (1-3): ").strip()
    if choice == '1':
        sample = sampler.get_stratified_sample()
    elif choice == '2':
        sample = sampler.original_distribution_sample()
    elif choice == '3':
        # Previously this called get_keyword_boosted_sample(), which Sampler
        # does not define. The keyword strategy is sample_with_keywords();
        # it is called through the class because it takes no instance
        # argument.
        sample = Sampler.sample_with_keywords()
    else:
        print(f"Invalid choice {choice!r}, expected 1, 2 or 3")
        return

    if sample is None:
        # A strategy that is not implemented yet returns None; do not try to
        # save it.
        print("Selected strategy returned no sample; nothing saved")
        return
    sampler.save_sample(sample, output_path)
 if __name__ == "__main__":
    main()