House Cleaning

2026-01-28 16:41:27 +00:00
parent 6cf36faf64
commit 8d3dee6d30
10 changed files with 150 additions and 483 deletions
--- a/src/infer.py
+++ b/src/infer.py
--- a/src/multitag.py
+++ b/src/multitag.py
@@ -0,0 +1,307 @@
+# multitag.py
+# This app enables manual annotation of reviews in the Uber dataset. The resulting labels
+# are used to train review classifiers with multi-task deep learning.
+
+import tkinter as tk
+from tkinter import ttk
+import pandas as pd
+# import langdetect
+import os
+
+class MultiTag:
+    """Tkinter app for manually tagging reviews with four labels.
+
+    Every review receives a feature-request flag, a bug-report flag, an
+    aspect and an aspect sentiment. Labels are entered column by column with
+    the buttons or the keyboard and committed with <Return>. The tagged CSV
+    is rewritten after every submission and again on quit.
+
+    Constructing an instance builds the window and blocks in ``mainloop()``
+    until the window is destroyed.
+    """
+
+    # Label columns the app writes, in the order they are filled in.
+    LABEL_COLUMNS = ("feature_request", "bug_report", "aspect", "aspect_sentiment")
+
+    def __init__(self):
+        # Key -> stored value for the two yes/no columns.
+        self.binary_map = {
+            '1': 'Yes',
+            '0': 'No'
+        }
+
+        # Key -> stored value for the aspect column.
+        self.aspect_map = {
+            'A': 'Driver',
+            'S': 'App',
+            'D': 'Pricing',
+            'F': 'Service',
+            'G': 'Payment',
+            'H': 'General'
+        }
+
+        # Key -> stored value for the aspect sentiment column.
+        self.sentiment_map = {
+            'A': 'Positive',
+            'S': 'Neutral',
+            'D': 'Negative'
+        }
+
+        self.root = tk.Tk()
+        self.active_column = 0  # column currently awaiting input (0-3)
+        self.btn_width = 15  # button width
+        self.number_of_aspects = 6  # number of aspect buttons
+        self.root.title("MultiTag")
+
+        # Colors for the highlight box
+        self.color_incomplete = "#003366"
+        self.color_complete = "#00AA00"
+
+        # Paths
+        self.tagged_path = "multitag/data/uber_reviews_tagged.csv"
+        sampled_path = "multitag/data/uber_reviews_sampled.csv"
+        if not os.path.exists(self.tagged_path):
+            # First run: seed the working file from the sampled reviews.
+            print(f"Tagged file did not exist, making one at: {self.tagged_path}")
+            sampled_df = pd.read_csv(sampled_path, low_memory=False)
+            sampled_df.to_csv(self.tagged_path, index=False)
+        self.load_review_data(self.tagged_path)
+
+        # =============== GUI Elements ====================
+
+        # ROW 0: Progress indication
+        self.progress_label = ttk.Label(
+            self.root,
+            text="Loading...",
+            font=("Arial", 12, "bold")
+        )
+        self.progress_label.grid(row=0, column=0, columnspan=4, pady=(5, 0))
+
+        # ROW 1: Review display
+        self.display_review = tk.Text(self.root, height=18, width=100, wrap='word', font=("Arial", 11))
+        self.display_review.grid(row=1, column=0, columnspan=4, padx=10, pady=10)
+
+        # ROW 2: Status label
+        self.status_label = ttk.Label(
+            self.root,
+            text="Fill in all fields...",
+            font=("Arial", 10),
+            foreground="gray"
+        )
+        self.status_label.grid(row=2, column=0, columnspan=4, pady=(0, 5))
+
+        # ROW 3: Column headings (only the keys that are actually bound)
+        ttk.Label(self.root, text="Feature Request ? 1 (yes), 0 (no)").grid(row=3, column=0, pady=(5, 2))
+        ttk.Label(self.root, text="Bug Report ? 1 (yes), 0 (no)").grid(row=3, column=1, pady=(5, 2))
+        ttk.Label(self.root, text="Aspect ? A/S/D/F/G/H ").grid(row=3, column=2, pady=(5, 2))
+        ttk.Label(self.root, text="Aspect Sentiment ? A/S/D").grid(row=3, column=3, pady=(5, 2))
+
+        # ROWS 4-9: Buttons. _add_button returns the widget itself; chaining
+        # .grid() onto the constructor would store None in these attributes.
+        # Feature Requests
+        self.feature_true = self._add_button("1", lambda: self.feature_pressed("1"), 4, 0)
+        self.feature_false = self._add_button("0", lambda: self.feature_pressed("0"), 5, 0)
+        # Bug Reports
+        self.bug_true = self._add_button("1", lambda: self.bug_pressed("1"), 4, 1)
+        self.bug_false = self._add_button("0", lambda: self.bug_pressed("0"), 5, 1)
+        # Aspect Buttons
+        self.aspect_a = self._add_button("A: Driver", lambda: self.aspect_pressed("A"), 4, 2)
+        self.aspect_s = self._add_button("S: App", lambda: self.aspect_pressed("S"), 5, 2)
+        self.aspect_d = self._add_button("D: Pricing", lambda: self.aspect_pressed("D"), 6, 2)
+        self.aspect_f = self._add_button("F: Service", lambda: self.aspect_pressed("F"), 7, 2)
+        self.aspect_g = self._add_button("G: Payment", lambda: self.aspect_pressed("G"), 8, 2)
+        self.aspect_h = self._add_button("H: General", lambda: self.aspect_pressed("H"), 9, 2)
+        # Aspect sentiment buttons
+        self.aspect_positive = self._add_button("A: Positive", lambda: self.sentiment_pressed("A"), 4, 3)
+        self.aspect_neutral = self._add_button("S: Neutral", lambda: self.sentiment_pressed("S"), 5, 3)
+        self.aspect_negative = self._add_button("D: Negative", lambda: self.sentiment_pressed("D"), 6, 3)
+
+        # ROW 10: Highlight box marking the active column, below the buttons
+        self.highlight = tk.Frame(self.root, bg=self.color_incomplete, height=20, width=130)
+        self.highlight.grid(row=10, column=0, pady=(5, 5))
+
+        # Key bindings
+        self.root.bind("q", self.quit_app)
+        self.root.bind("<Return>", self.try_submit)
+        for key in "10asdfgh":
+            self.root.bind(key, self.handle_key)
+
+        self.display_next_review()
+        self.root.mainloop()
+
+    def _add_button(self, text, command, row, column):
+        """Create a label button, place it on the grid and return the widget."""
+        button = ttk.Button(self.root, text=text, command=command, width=self.btn_width)
+        button.grid(row=row, column=column, pady=2)
+        return button
+
+    def _has_current_review(self):
+        """Return True while current_review_index points at an existing row."""
+        return self.current_review_index < len(self.review_data)
+
+    def handle_key(self, event):
+        """Route a key press to the setter of the column that is currently active."""
+        key = event.char
+
+        # Column 0 or 1: feature/bug (1 and 0)
+        if key in self.binary_map:
+            if self.active_column == 0:
+                self.feature_pressed(key)
+            elif self.active_column == 1:
+                self.bug_pressed(key)
+        # Column 2: aspects (a,s,d,f,g,h)
+        elif self.active_column == 2 and key.upper() in self.aspect_map:
+            self.aspect_pressed(key.upper())
+        # Column 3: sentiment (a,s,d)
+        elif self.active_column == 3 and key.upper() in self.sentiment_map:
+            self.sentiment_pressed(key.upper())
+
+    def update_status(self):
+        """Update status label and highlight color based on completion state"""
+        if self.all_labels_complete():
+            self.highlight.configure(bg=self.color_complete)
+            self.status_label.configure(
+                text="Complete Tag [ENTER] | Quit [q]",
+                foreground="green",
+                font=("Arial", 10, "bold")
+            )
+        else:
+            self.highlight.configure(bg=self.color_incomplete)
+            self.status_label.configure(
+                text="Fill in all fields...",
+                foreground="gray",
+                font=("Arial", 10)
+            )
+
+    def update_progress(self):
+        """Update progress counter"""
+        tagged_count = (self.review_data['tagged'] == 1).sum()
+        total_count = len(self.review_data)
+        remaining = total_count - tagged_count
+
+        progress_text = f"Progress: {tagged_count} / {total_count} tagged ({remaining} remaining)"
+        self.progress_label.configure(text=progress_text)
+
+    def move_highlight(self, col):
+        """Move the highlight box directly under the button pressed."""
+        self.highlight.grid(row=10, column=col, pady=(5, 5))
+        self.update_status()
+
+    # Setters
+    # Each setter returns early when no review is left: writing with .at to a
+    # label past the last row would silently append a new row.
+    def feature_pressed(self, value):
+        """Store the feature-request answer and advance to the bug column."""
+        if not self._has_current_review():
+            return
+        self.review_data.at[self.current_review_index, "feature_request"] = self.binary_map[value]
+        self.active_column = 1
+        self.move_highlight(1)
+
+    def bug_pressed(self, value):
+        """Store the bug-report answer and advance to the aspect column."""
+        if not self._has_current_review():
+            return
+        self.review_data.at[self.current_review_index, "bug_report"] = self.binary_map[value]
+        self.active_column = 2
+        self.move_highlight(2)
+
+    def aspect_pressed(self, value):
+        """Store the aspect and advance to the sentiment column."""
+        if not self._has_current_review():
+            return
+        self.review_data.at[self.current_review_index, "aspect"] = self.aspect_map[value]
+        self.active_column = 3
+        self.move_highlight(3)
+
+    def sentiment_pressed(self, value):
+        """Store the aspect sentiment and return input focus to the first column."""
+        if not self._has_current_review():
+            return
+        self.review_data.at[self.current_review_index, "aspect_sentiment"] = self.sentiment_map[value]
+        self.active_column = 0  # Reset for next review
+        self.update_status()
+
+    def load_review_data(self, data_path):
+        """Load review data from a CSV file and ensure the bookkeeping columns exist.
+
+        Blank labels come back from a CSV round trip as NaN. They are
+        normalised to "" in object-dtype columns so that string labels can be
+        stored and all_labels_complete() can compare against "".
+        """
+        self.review_data = pd.read_csv(data_path, low_memory=False)
+        if "tagged" not in self.review_data.columns:
+            self.review_data["tagged"] = 0  # Initialize tagged column if not present
+        for column in self.LABEL_COLUMNS:
+            if column in self.review_data.columns:
+                values = ["" if pd.isna(value) else value for value in self.review_data[column]]
+            else:
+                values = [""] * len(self.review_data)
+            self.review_data[column] = pd.Series(values, index=self.review_data.index, dtype=object)
+        print(f"Loaded {len(self.review_data)} reviews from {data_path}")
+
+    def display_next_review(self):
+        """Display the next untagged review and reset the input state."""
+        self.current_review_index = self.get_current_review_index()
+        if not self._has_current_review():
+            self._show_finished()
+            return
+
+        # Start from blank labels so a half-filled earlier attempt is discarded.
+        for column in self.LABEL_COLUMNS:
+            self.review_data.at[self.current_review_index, column] = ""
+
+        review = self.review_data.iloc[self.current_review_index]
+        self.display_review.delete("1.0", tk.END)  # Clear the text box
+        self.display_review.insert(tk.END, review["review"])  # Display the review text
+
+        self.active_column = 0  # reset to start at feature request
+        self.highlight.grid(row=10, column=0, pady=(5, 5))
+        self.highlight.configure(bg=self.color_incomplete)
+        self.status_label.configure(
+            text="Fill in all fields...",
+            foreground="gray",
+            font=("Arial", 10)
+        )
+        self.update_progress()
+
+    def _show_finished(self):
+        """Report that every review is tagged, on the console and in the window."""
+        print("No more reviews to display. DONE 	☉ ‿ ⚆")
+        self.display_review.delete("1.0", tk.END)
+        self.status_label.configure(
+            text="All reviews tagged | Quit [q]",
+            foreground="green",
+            font=("Arial", 10, "bold")
+        )
+        self.update_progress()
+
+    def submit_tag(self):
+        """Mark the current review as tagged, save, and move to the next one."""
+        self.review_data.at[self.current_review_index, "tagged"] = 1
+        self.save_tags(self.tagged_path)
+        self.display_next_review()
+
+    def try_submit(self, event):
+        """Try to submit current review if all labels complete."""
+        if not self._has_current_review():
+            self._show_finished()
+            return
+        if self.all_labels_complete():
+            # Remember the index first: submit_tag() advances to the next review.
+            submitted_index = self.current_review_index
+            self.submit_tag()
+            print(f"Review {submitted_index + 1} tagged")
+        else:
+            warning = "      ☠      Complete all fields first!     ☠      "
+            print(warning)
+            self.status_label.configure(
+                text=warning,
+                foreground="red",
+                font=("Arial", 10, "bold")
+            )
+            # Restore the regular status text after two seconds.
+            self.root.after(2000, self.update_status)
+
+    def all_labels_complete(self):
+        """Return True when all four labels of the current review are filled in.
+
+        Returns False when there is no current review. NaN counts as missing,
+        the same as "".
+        """
+        if not self._has_current_review():
+            return False
+        row = self.review_data.iloc[self.current_review_index]
+        return all(
+            not pd.isna(row[column]) and row[column] != ""
+            for column in self.LABEL_COLUMNS
+        )
+
+    def save_tags(self, save_path):
+        """Save the tagged data to a CSV file."""
+        self.review_data.to_csv(save_path, index=False)
+
+    def quit_app(self, event):
+        """Save the data, print a session summary and close the window."""
+        self.save_tags(self.tagged_path)
+        tagged_count = (self.review_data['tagged'] == 1).sum()
+        print(f"\n{'='*50}")
+        print("SESSION COMPLETE")
+        print(f"{'='*50}")
+        print(f"Total tagged: {tagged_count} / {len(self.review_data)}")
+        print(f"Saved to: {self.tagged_path}")
+        print("Bye    (ʘ‿ʘ)╯")
+        self.root.destroy()
+
+    def get_current_review_index(self):
+        """Return the position of the first review with tagged == 0.
+
+        Returns len(review_data) when every review is tagged.
+        """
+        untagged = (self.review_data["tagged"] == 0).to_numpy()
+        if untagged.any():
+            return int(untagged.argmax())  # argmax gives the first True
+        return len(self.review_data)  # all reviews tagged
+    
+    
+   
+# Launch the annotation app as soon as this module runs: MultiTag.__init__
+# builds the window and blocks in root.mainloop() until it is closed.
+# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
+# this module also starts the GUI - confirm that is intended.
+app = MultiTag()
--- a/src/preprocess.py
+++ b/src/preprocess.py
@@ -0,0 +1,176 @@
+# preprocess.py
+
+import pandas as pd
+import re
+from langdetect import detect, LangDetectException
+
+def clean_text(text):
+    """Normalise a single review string.
+
+    The text is lowercased, URLs and email addresses are removed, runs of
+    '.', '!' or '?' are collapsed to one character, and whitespace is
+    squeezed to single spaces with the ends trimmed.
+
+    Input:
+    text - the review text to clean (missing values are accepted)
+
+    Outputs:
+    str: the cleaned review text, "" when text is missing
+    """
+    if pd.isna(text):
+        return ""
+
+    # (pattern, replacement) pairs, applied in this order
+    substitutions = (
+        (r'http\S+|www\S+', ''),  # URLs
+        (r'\S+@\S+', ''),         # emails
+        (r'\.{2,}', '.'),         # repeated punctuation
+        (r'!{2,}', '!'),
+        (r'\?{2,}', '?'),
+        (r'\s+', ' '),            # any whitespace run -> one space
+    )
+
+    cleaned = str(text).lower()
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+    return cleaned.strip()
+
+def detect_language(text):
+    """Return the language code reported by langdetect, or 'unknown'.
+
+    'unknown' is returned for missing text, for text shorter than 10
+    characters after stripping, and when langdetect raises
+    LangDetectException.
+    """
+    too_short = pd.isna(text) or len(str(text).strip()) < 10
+    try:
+        return 'unknown' if too_short else detect(str(text))
+    except LangDetectException:
+        return 'unknown'
+
+def preprocess_uber_reviews(input_path, output_path, review_length_limit=5):
+    """
+    Load, clean, filter and save the Uber reviews dataset.
+
+    - No language detection due to unreliability on short informal text
+    - Data is labelled as English, but contains non-english text
+    - Does not handle a missing input file
+    - Assumes there are columns named "review_description" and "rating"
+
+    1. Load from csv with pd.read_csv()
+    2. Remove rows with missing descriptions
+    3. Clean text with clean_text()
+    4. Calculate word count for each review
+    5. Remove reviews with fewer than review_length_limit words
+    6. Remove duplicate reviews
+    7. Save the columns review, rating and word_count to output_path
+
+    Inputs:
+    input_path (str): Path to uber_reviews.csv
+    output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
+    review_length_limit (int): Minimum number of words a review must have
+        to be kept (default 5)
+
+    Outputs:
+    pd.DataFrame: the dataframe of cleaned processed reviews
+    """
+    print("="*50)
+    print("PREPROCESSING UBER REVIEWS")
+    print("="*50)
+
+    # 1. Load data
+    print("\n1. Loading data...")
+    df = pd.read_csv(input_path, low_memory=False)
+    print(f"   Original size: {len(df):,} reviews")
+
+    # 2. Remove missing reviews
+    print("\n2. Removing missing reviews...")
+    df = df.dropna(subset=['review_description']).copy()
+    print(f"   After removing nulls: {len(df):,} reviews")
+
+    # 3. Clean text
+    print("\n3. Cleaning text...")
+    df['review_clean'] = df['review_description'].apply(clean_text)
+
+    # 4. Calculate word count
+    df['word_count'] = df['review_clean'].str.split().str.len()
+
+    # 5. Remove short reviews
+    print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...")
+    # 1 word reviews provide little to draw conclusions from and bloat the
+    # dataset a lot, nearly 50% of reviews!
+    before = len(df)
+    df = df[df['word_count'] >= review_length_limit]
+    removed = before - len(df)
+    # Guard the percentage: nothing may be left after the null filter.
+    removed_pct = removed / before * 100 if before else 0.0
+    print(f"   Removed: {removed:,} reviews ({removed_pct:.1f}%)")
+    print(f"   Remaining: {len(df):,} reviews")
+
+    # 6. Remove duplicates
+    print("\n5. Removing duplicates...")
+    before = len(df)
+    df = df.drop_duplicates(subset=['review_clean'])
+    removed = before - len(df)
+    print(f"   Removed: {removed:,} duplicates")
+    print(f"   Remaining: {len(df):,} reviews")
+
+    # 7. Final dataset
+    df_clean = df[['review_clean', 'rating', 'word_count']].copy()
+    df_clean.rename(columns={'review_clean': 'review'}, inplace=True)
+    df_clean = df_clean.reset_index(drop=True)
+
+    # 8. Save
+    print(f"\n6. Saving to {output_path}...")
+    df_clean.to_csv(output_path, index=False)
+
+    # Summary
+    print("\n" + "="*50)
+    print("PREPROCESSING COMPLETE")
+    print("="*50)
+    print(f"\nFinal dataset: {len(df_clean):,} reviews")
+    print(f"Quality filters: word_count >= {review_length_limit}, duplicates removed")
+    # while this does remove some legitimate reviews which would be useful in classification
+    # it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5
+    # it showed the most amount of formative reviews without seeming excessive in data removal
+
+    print("\nRating distribution:")
+    rating_dist = df_clean['rating'].value_counts().sort_index()
+    for rating, count in rating_dist.items():
+        percentage = count / len(df_clean) * 100
+        # Build the stars outside the f-string: reusing the enclosing quote
+        # inside a replacement field only parses on Python 3.12+. int() also
+        # covers a rating column that was read as float.
+        stars = "✭" * int(rating)
+        print(f"  {rating}{stars}: {count:,} ({percentage:.1f}%)")
+
+    print("\nWord count statistics:")
+    print(f"  Mean: {df_clean['word_count'].mean():.1f} words")
+    print(f"  Median: {df_clean['word_count'].median():.1f} words")
+    print(f"  Min: {df_clean['word_count'].min()} words")
+    print(f"  Max: {df_clean['word_count'].max()} words")
+
+    print("\nVerify New Data:")
+    # Report a count (expected 0) instead of dumping the matching rows.
+    short_reviews = int((df_clean['word_count'] < review_length_limit).sum())
+    print(f"  Short reviews: {short_reviews}")
+    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
+    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
+    # lang detection takes 5+ mins so it is not run here
+
+    # Sample reviews from each rating
+    print("\n" + "="*50)
+    print("SAMPLE CLEANED REVIEWS")
+    print("="*50)
+    for rating in [1, 2, 3, 4, 5]:
+        rating_rows = df_clean[df_clean['rating'] == rating]
+        if rating_rows.empty:
+            continue
+        sample = rating_rows.sample(min(2, len(rating_rows)))
+        stars = "✭" * rating
+        print(f"\n{rating} {stars} REVIEWS:")
+        for _, row in sample.iterrows():
+            print(f"  • ({row['word_count']} words) {row['review'][:100]}")
+
+    # Note about language
+    print("Language detection not applied due to unreliability on short")
+    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
+    print(" ...Manual annotation phase will identify any non-English reviews")
+
+    return df_clean
+
+if __name__ == "__main__":
+    # Script entry point: clean the raw export, then report how many reviews remain.
+    raw_csv, cleaned_csv = "multitag/data/uber_reviews.csv", "multitag/data/uber_reviews_cleaned.csv"
+    cleaned = preprocess_uber_reviews(raw_csv, cleaned_csv)
+
+    print("\nPreprocessing complete!")
+    print(f"Clean dataset: {len(cleaned):,} reviews ready for sampling")
--- a/src/sampler.py
+++ b/src/sampler.py
@@ -0,0 +1,238 @@
+#   TODO:   Add verification comparison between ratings
+#   TODO:   Clean up the logging print statements
+
+
+import pandas as pd
+import numpy as np
+
+# Print the pandas / numpy versions in use (runs whenever the module is loaded).
+print(pd.__version__)
+print(np.__version__)
+
+# Dataset locations, relative to the working directory.
+# Cleaned reviews; the same file name is what main() loads into Sampler.
+path = "multitag/data/uber_reviews_cleaned.csv"
+# Where main() saves the sample for strategies 1-3.
+sampled_path = "multitag/data/uber_reviews_sampled.csv"
+# Raw dataset; Sampler.__init__ reads it through this module-level name.
+original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
+class Sampler:
+    """Draw labelling samples from the cleaned review dataset.
+
+    Sampling strategies:
+    - get_stratified_sample(): proportional to the current rating distribution
+    - original_distribution_sample(): proportional to the raw dataset's ratings
+    - sample_with_keywords(): rating-balanced base plus keyword-matched reviews
+    - sample_tiny_size(): small random sample for reading by hand
+    """
+
+    # Rating shares of the raw dataset before preprocessing:
+    #   5    57.1% (611133)
+    #   1    26.5% (283895)
+    #   4     7.8% (82953)
+    #   3     4.7% (49928)
+    #   2     3.9% (41707)
+    ORIGINAL_RATING_SHARES = {5: 0.571, 1: 0.265, 4: 0.078, 3: 0.047, 2: 0.039}
+
+    def __init__(self, data_path, target_samples):
+        """Load the working and raw datasets and print their rating distributions.
+
+        inputs:
+        data_path - CSV of cleaned reviews to sample from
+        target_samples - number of reviews the proportional strategies aim for
+        """
+        self.data_path = data_path
+        self.target_samples = target_samples  # target number of samples
+        self.stratify_column = "rating"  # column to stratify by
+
+        # The raw dataset is only loaded for the distribution comparison below.
+        self.original_data = pd.read_csv(original_path, low_memory=False)
+        self.data = pd.read_csv(self.data_path, low_memory=False)
+        self.total = len(self.data)  # total number of records in the dataset
+
+        print("="*50)
+        print("SAMPLER INITIALIZED")
+        print("="*50,"\n")
+
+        print(f"Total records in dataset: {self.total}")
+        print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
+        print("Percentage distribution (working data):")
+        print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
+        _origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
+        print(f"Original Distribution from {original_path}:")
+        print((_origdist*100).round(1),"\n")
+
+        self.data.info()
+
+    def get_stratified_sample(self) -> pd.DataFrame:
+        """Sample each rating in proportion to its share of the working data.
+
+        Group sizes are truncated to whole rows (at least one per rating), so
+        the total can differ slightly from target_samples. The result is
+        shuffled with a fixed seed.
+        """
+        per_rating = [
+            self.x(group)
+            for _, group in self.data.groupby(self.stratify_column)
+        ]
+        if not per_rating:
+            return self.data.iloc[0:0].copy()
+        combined = pd.concat(per_rating, ignore_index=True)
+        # Shuffle so the ratings are interleaved instead of stored in blocks.
+        return combined.sample(frac=1, random_state=42).reset_index(drop=True)
+
+    # x(self): helper function for get_stratified_sample
+    def x(self, x):
+        """Sample from one rating group x its proportional share of target_samples."""
+        n = int(len(x) / self.total * self.target_samples)
+        # At least one row per rating, never more rows than the group holds.
+        n = min(max(n, 1), len(x))
+        return x.sample(n=n, random_state=42)
+
+    def original_distribution_sample(self):
+        """Sample so the ratings follow the raw dataset's distribution.
+
+        The main sampling method for our labelling as it keeps the
+        composition of the original uber dataset, which is a fairer
+        comparison.
+
+        outputs:
+        DataFrame with int(share * target_samples) rows per rating, or every
+        available row when a rating has fewer than that.
+        """
+        original_dist = {
+            rating: int(share * self.target_samples)
+            for rating, share in self.ORIGINAL_RATING_SHARES.items()
+        }
+        print("Target Distribution =", original_dist)
+        samples = []
+        for rating, num_samples in original_dist.items():
+            rating_data = self.data[self.data[self.stratify_column] == rating]
+            if len(rating_data) < num_samples:
+                print(f"Missing samples available for rating {rating}: "
+                      f"wanted {num_samples}, have {len(rating_data)}")
+                num_samples = len(rating_data)
+            samples.append(rating_data.sample(n=num_samples, random_state=42))
+        return pd.concat(samples, ignore_index=True)
+
+    def sample_with_keywords(self, per_rating=400, n_bugs=1500, n_features=1500):
+        """Build a sample enriched with likely bug reports and feature requests.
+
+        With the defaults:
+        - 2000 balanced by rating (400 per)
+        - 1500 likely bugs using the bug_keywords list
+        - 1500 likely features using the feature_keywords list
+
+        The three parts never share a row. self.data is not modified.
+        Assumes self.data has a unique index (the read_csv default).
+
+        inputs:
+        per_rating - rows drawn per rating for the balanced base
+        n_bugs - rows drawn from reviews matching a bug keyword
+        n_features - rows drawn from reviews matching a feature keyword
+
+        outputs:
+        DataFrame with the same columns as self.data
+        """
+        #TODO add keywords for feature classification
+        print(f"\n{'='*50}")
+        print("Keyword influenced / rating stratified set")
+        print(f"\n{'='*50}")
+
+        bug_keywords = ["crash","freeze", "error",
+                        "stop", "doesnt work", "doesn't work","loading",
+                        "blank", "stuck", "load", "broken", "break",
+                        "glitch", "issue", "fix", "needs","please repair",
+                        "failed", "responding"
+                        ]
+        feature_keywords = ["need","should","add","wish","would","benefit",
+                            "please add","should have", "want", "missing",
+                            "require", "suggestion", "request", "could you",
+                            "include", "hope", "why not", "greatly", "option",
+                            "new","system"
+                            ]
+
+        # Keep the keyword flags in local Series so self.data gains no columns.
+        lowered = self.data['review'].astype(str).str.lower()
+        likely_bug = lowered.apply(
+            lambda text: any(keyword in text for keyword in bug_keywords)
+        )
+        likely_feature = lowered.apply(
+            lambda text: any(keyword in text for keyword in feature_keywords)
+        )
+        print(f"Reviews with bug_keywords = {likely_bug.sum():,}")
+        print(f"Reviews with feature_keywords = {likely_feature.sum():,}")
+
+        groups = [group for _, group in self.data.groupby(self.stratify_column)]
+        print(f"Sampling {per_rating * len(groups)} reviews balanced ({per_rating} per rating)...")
+        # Concatenate without resetting the index: the original row labels are
+        # needed below to exclude rows that were already picked.
+        base_parts = [
+            group.sample(n=min(per_rating, len(group)), random_state=42)
+            for group in groups
+        ]
+        base_sample = pd.concat(base_parts) if base_parts else self.data.iloc[0:0]
+
+        print(f"Sampling {n_bugs} possible bug reports...")
+        unused = ~self.data.index.isin(base_sample.index)
+        bugs = self.data[likely_bug & unused]
+        bug_sample = bugs.sample(n=min(n_bugs, len(bugs)), random_state=42)
+
+        print(f"Sampling {n_features} possible feature requests...")
+        unused &= ~self.data.index.isin(bug_sample.index)
+        features = self.data[likely_feature & unused]
+        feature_sample = features.sample(n=min(n_features, len(features)), random_state=42)
+
+        # Combine all samples
+        keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)
+
+        print(f"\n Total samples: {len(keyword_sample):,}")
+        return keyword_sample
+
+    def sample_tiny_size(self):
+        """Return up to 200 random reviews for reading some samples manually."""
+        return self.data.sample(min(200, len(self.data)))
+
+    def save_sample(self, sample_df,output_path):
+        """Save sample and display statistics"""
+        sample_df.to_csv(output_path, index=False)
+
+        print(f"\n{'='*50}")
+        print("SAMPLE SAVED")
+        print(f"{'='*50}")
+        print(f"Location: {output_path}")
+        print(f"Total samples: {len(sample_df):,}")
+        print("\nDistribution:")
+        for rating in sorted(sample_df[self.stratify_column].unique()):
+            count = (sample_df[self.stratify_column] == rating).sum()
+            pct = count / len(sample_df) * 100
+            print(f"  {rating}★: {count:,} ({pct:.1f}%)")
+
+def main():
+    """Interactive entry point: build a Sampler, ask for a strategy and save the sample."""
+    sampler = Sampler(path, target_samples=5000)
+
+    # Choose sampling strategy
+    print(f"\n{'='*50}")
+    print("SAMPLING STRATEGY OPTIONS")
+    print(f"{'='*50}")
+    print("1. get_stratified_sample() stratified by current distribution")
+    print("2. original_distribution_sample() stratified by the original data distribution")
+    print("3. sample_with_keywords() balanced by rating and boosted with bug / feature keywords")
+    print("4. sample_tiny_size() 200 random reviews for manual reading")
+
+    choice = input("\nEnter choice (1-4): ").strip()
+
+    # choice -> (sampling method, output file)
+    strategies = {
+        '1': (sampler.get_stratified_sample, sampled_path),
+        '2': (sampler.original_distribution_sample, sampled_path),
+        '3': (sampler.sample_with_keywords, sampled_path),
+        '4': (sampler.sample_tiny_size, "multitag/data/uber_review_temp.csv"),
+    }
+    if choice not in strategies:
+        # Say so explicitly instead of exiting without a word.
+        print(f"Unknown choice {choice!r}; nothing was sampled.")
+        return
+
+    build_sample, output_path = strategies[choice]
+    sampler.save_sample(build_sample(), output_path)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/train.py
+++ b/src/train.py