Added multitag: includes preprocess.py, sampler.py and multitag.py (the main GUI for labelling/annotation)

2025-11-06 17:40:29 +00:00
parent c0d4c13824
commit 4d6e2511e6
6 changed files with 1147 additions and 0 deletions
--- a/multitag/multitag.py
+++ b/multitag/multitag.py
@@ -0,0 +1,192 @@
+import tkinter as tk
+from tkinter import ttk
+import pandas as pd
+
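+"""
+App to classify / manually annotate reviews for ML training.
+Hotkeys: 1 / 0 for the yes/no columns, A S D F G H J K L for the aspect, A S D for the sentiment.
+To resume a session, load the tagged CSV rather than the sampled one: progress is only stored in the tagged file.
+"""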
+
+
+class MultiTag:
+    """Tkinter window for hand-labelling reviews one at a time.
+
+    Every review receives four labels, entered left to right: feature
+    request (1/0), bug report (1/0), aspect (A S D F G H J K L) and aspect
+    sentiment (A S D).  <Return> submits the review once all four labels are
+    set; q saves and quits.  Constructing the class builds the window, loads
+    the data and then blocks in the Tk main loop.
+    """
+
+    # Columns that must be filled in before a review can be submitted.
+    LABEL_COLUMNS = ("feature_request", "bug_report", "aspect", "aspect_sentiment")
+    # One aspect button per key, stacked in column 2 in this order.
+    ASPECT_KEYS = "ASDFGHJKL"
+    # (hotkey, caption) of the sentiment buttons in column 3.
+    SENTIMENT_OPTIONS = (("A", "Positive"), ("S", "Neutral"), ("D", "Negative"))
+
+    def __init__(self, data_path="data/uber_reviews_sampled.csv",
+                 save_path="data/uber_reviews_tagged.csv"):
+        """Build the window, load the reviews and start the Tk main loop.
+
+        Args:
+            data_path: CSV to label; needs a "review_description" column.
+            save_path: CSV that receives the labels on submit and on quit.
+        """
+        self.save_path = save_path
+        self.root = tk.Tk()
+        self.active_column = 0  # column that currently receives hotkeys
+        self.btn_width = 15  # button width
+        self.number_of_aspects = len(self.ASPECT_KEYS)  # number of aspect buttons
+        self.root.title("MultiTag")
+
+        self.display_review = tk.Text(self.root, height=20, width=100, wrap='word')
+        self.display_review.grid(row=0, column=0, columnspan=4, padx=10, pady=10)
+
+        # Highlight bar shown below the column that is currently active.
+        self.highlight = tk.Frame(self.root, bg="#003366", height=20, width=130)
+        self.highlight.grid(row=self.number_of_aspects + 2, column=0)
+
+        headings = (
+            "Feature Request ? 1 (yes), 0 (no)",
+            "Bug Report ? 1 (yes), 0 (no)",
+            "Aspect ? A/S/D/F/G/H/J/K/L ",
+            "Aspect Sentiment ? A/S/D",
+        )
+        for column, text in enumerate(headings):
+            ttk.Label(self.root, text=text).grid(row=1, column=column)
+
+        self._add_button("feature_true", "1", lambda: self.feature_pressed("1"), 2, 0)
+        self._add_button("feature_false", "0", lambda: self.feature_pressed("0"), 3, 0)
+        self._add_button("bug_true", "1", lambda: self.bug_pressed("1"), 2, 1)
+        self._add_button("bug_false", "0", lambda: self.bug_pressed("0"), 3, 1)
+
+        # key=key binds the loop value now; a plain closure would see only
+        # the last key once the loop has finished.
+        for offset, key in enumerate(self.ASPECT_KEYS):
+            self._add_button(
+                f"aspect_{key.lower()}", f"{key}: ASPECT HERE",
+                lambda key=key: self.aspect_pressed(key), 2 + offset, 2,
+            )
+        for offset, (key, caption) in enumerate(self.SENTIMENT_OPTIONS):
+            self._add_button(
+                f"aspect_{caption.lower()}", f"{key}: {caption}",
+                lambda key=key: self.sentiment_pressed(key), 2 + offset, 3,
+            )
+
+        #   keys
+        self.root.bind("q", self.quit_app)
+        self.root.bind("<Return>", self.try_submit)
+        # Upper-case letters are bound as well so Caps Lock does not disable
+        # the hotkeys.
+        for key in "10" + self.ASPECT_KEYS.lower() + self.ASPECT_KEYS:
+            self.root.bind(key, self.handle_key)
+
+        self.load_review_data(data_path)
+        self.display_next_review()
+
+        self.root.mainloop()
+
+    def _add_button(self, attr_name, text, command, row, column):
+        """Create a button, place it on the grid and keep it as an attribute."""
+        # .grid() returns None, so the widget has to be stored before it is
+        # placed; chaining the two calls would store None.
+        button = ttk.Button(self.root, text=text, command=command, width=self.btn_width)
+        button.grid(row=row, column=column)
+        setattr(self, attr_name, button)
+        return button
+
+    def handle_key(self, event):
+        """Route a hotkey to the handler of the currently active column."""
+        key = (event.char or "").lower()
+        if not key:
+            # An empty char is a substring of every string and would
+            # otherwise be recorded as a label.
+            return
+
+        # Column 0 or 1: feature/bug (1 and 0)
+        if key in ("1", "0"):
+            if self.active_column == 0:
+                self.feature_pressed(key)
+            elif self.active_column == 1:
+                self.bug_pressed(key)
+        # Column 2: aspects (a,s,d,f,g,h,j,k,l)
+        elif self.active_column == 2 and key.upper() in self.ASPECT_KEYS:
+            self.aspect_pressed(key.upper())
+        # Column 3: sentiment (a,s,d)
+        elif self.active_column == 3 and key.upper() in {k for k, _ in self.SENTIMENT_OPTIONS}:
+            self.sentiment_pressed(key.upper())
+
+    def move_highlight(self, row, col):
+        """Move the highlight bar to the given grid cell."""
+        self.highlight.grid(row=row, column=col)
+
+    def _has_current_review(self):
+        """Return True while the current index points at an existing row."""
+        return 0 <= self.current_review_index < len(self.review_data)
+
+    def _record_label(self, column, value, next_column):
+        """Store one label for the current review and activate next_column."""
+        if not self._has_current_review():
+            # Without this guard .at would silently append a new row once
+            # every review has been tagged.
+            return
+        row_label = self.review_data.index[self.current_review_index]
+        self.review_data.at[row_label, column] = value
+        self.active_column = next_column
+        self.move_highlight(self.number_of_aspects + 2, next_column)
+
+    def feature_pressed(self, value):
+        """Record the feature-request label and move on to the bug column."""
+        self._record_label("feature_request", value, 1)
+
+    def bug_pressed(self, value):
+        """Record the bug-report label and move on to the aspect column."""
+        self._record_label("bug_report", value, 2)
+
+    def aspect_pressed(self, value):
+        """Record the aspect label and move on to the sentiment column."""
+        self._record_label("aspect", value, 3)
+
+    def sentiment_pressed(self, value):
+        """Record the sentiment label and wrap around to the first column."""
+        # The highlight follows the active column so the bar always shows
+        # where the next hotkey will land.
+        self._record_label("aspect_sentiment", value, 0)
+
+    def load_review_data(self, data_path):
+        """Load review data from a CSV file and add missing label columns."""
+        # Label columns are read as text: a previously saved file would
+        # otherwise come back as numbers/NaN, and NaN never equals "".
+        header = pd.read_csv(data_path, nrows=0).columns
+        label_dtypes = {col: str for col in self.LABEL_COLUMNS if col in header}
+        self.review_data = pd.read_csv(data_path, low_memory=False, dtype=label_dtypes)
+
+        if "tagged" not in self.review_data.columns:
+            self.review_data["tagged"] = 0  # nothing has been tagged yet
+        else:
+            self.review_data["tagged"] = self.review_data["tagged"].fillna(0).astype(int)
+
+        for col in self.LABEL_COLUMNS:
+            if col not in self.review_data.columns:
+                self.review_data[col] = ""
+            else:
+                self.review_data[col] = self.review_data[col].fillna("")
+        print(f"Loaded {len(self.review_data)} reviews from {data_path}")
+
+    def display_next_review(self):
+        """Show the first untagged review and reset the active column."""
+        self.current_review_index = self.get_current_review_index()
+        if not self._has_current_review():
+            # Clear the box so the last review does not look untagged.
+            self.display_review.delete("1.0", tk.END)
+            print("No more reviews to display.")
+            return
+
+        text = self.review_data.iloc[self.current_review_index]["review_description"]
+        self.display_review.delete("1.0", tk.END)  # Clear the text box
+        self.display_review.insert(tk.END, "" if pd.isna(text) else str(text))
+        self.active_column = 0  # reset to start at feature request
+        self.move_highlight(self.number_of_aspects + 2, 0)
+
+    def submit_tag(self):
+        """Mark the current review as tagged, save and show the next one."""
+        row_label = self.review_data.index[self.current_review_index]
+        self.review_data.at[row_label, "tagged"] = 1
+        self.save_tags()
+        self.display_next_review()
+
+    def try_submit(self, event):
+        """Try to submit current review if all labels complete."""
+        if self.all_labels_complete():
+            self.submit_tag()
+            print("Labels submitted, loading next review")
+        else:
+            print("Please complete all labels before submitting")
+
+    @staticmethod
+    def _is_labelled(value):
+        """Return True for a value that is neither missing nor blank."""
+        return not pd.isna(value) and str(value).strip() != ""
+
+    def all_labels_complete(self):
+        """Return True when the current review has all four labels set."""
+        if not self._has_current_review():
+            return False
+        row = self.review_data.iloc[self.current_review_index]
+        return all(self._is_labelled(row[col]) for col in self.LABEL_COLUMNS)
+
+    def save_tags(self, save_path=None):
+        """Save the tagged data to a CSV file (default: the configured path)."""
+        if save_path is None:
+            save_path = self.save_path
+        self.review_data.to_csv(save_path, index=False)
+        print(f"Tagged data saved to {save_path}")
+
+    def quit_app(self, event):
+        """Save the current state, then close the window."""
+        try:
+            self.save_tags()
+        finally:
+            # Close the window even if saving fails; the error still surfaces.
+            self.root.destroy()
+
+    def get_current_review_index(self):
+        """Return the position of the first untagged review.
+
+        Returns len(review_data) when every review has been tagged.
+        """
+        untagged = (self.review_data["tagged"] == 0).to_numpy()
+        if untagged.any():
+            return int(untagged.argmax())  # argmax gives the first True
+        return len(self.review_data)  # all reviews tagged
+    
+    
+# Only launch the GUI when the file is run as a script, so that importing
+# the module (e.g. to reuse MultiTag) does not open a window.
+if __name__ == "__main__":
+    app = MultiTag()
--- a/multitag/preprocess.py
+++ b/multitag/preprocess.py
@@ -0,0 +1,169 @@
+import pandas as pd
+import re
+from langdetect import detect, LangDetectException
+
+def clean_text(text):
+    """Normalise one review: lower-case it and strip URLs, emails and noise.
+
+    Input:
+    text - the review text to clean (missing values are accepted)
+
+    Outputs:
+    str: the cleaned review text, "" for a missing value
+    """
+    if pd.isna(text):
+        return ""
+
+    # Convert to lower for uniformity
+    text = str(text).lower()
+
+    # Remove URLs. The pattern is anchored at a word boundary and requires
+    # "http(s)://" or "www." so ordinary words that merely contain those
+    # letters (e.g. "awwww") are left intact.
+    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
+
+    # Remove emails
+    text = re.sub(r'\S+@\S+', '', text)
+
+    # Normalize punctuation: collapse runs of ".", "!" and "?" to one mark
+    text = re.sub(r'\.{2,}', '.', text)
+    text = re.sub(r'!{2,}', '!', text)
+    text = re.sub(r'\?{2,}', '?', text)
+
+    # Collapse every whitespace run to a single space and trim both ends
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return text
+
+def detect_language(text):
+    """Return the language code langdetect reports for text.
+
+    Missing values, texts shorter than 10 characters (after trimming) and
+    texts langdetect cannot classify all yield 'unknown'.
+    """
+    too_short = pd.isna(text) or len(str(text).strip()) < 10
+    if too_short:
+        return 'unknown'
+    try:
+        language = detect(str(text))
+    except LangDetectException:
+        language = 'unknown'
+    return language
+
+def preprocess_uber_reviews(input_path, output_path, min_words=5):
+    """
+    Load, clean and filter the raw review export, then save the result.
+
+    - No language detection due to unreliability on short informal text
+    - Data is labelled as English, but contains non-English text
+    - Does not handle a missing input file
+    - Assumes the CSV has "review_description" and "rating" columns
+
+    1. Load from csv with pd.read_csv()
+    2. Remove rows with missing descriptions
+    3. Clean text by removing URLs, emails, and excessive whitespace
+    4. Calculate word count for each cleaned review
+    5. Remove reviews with fewer than min_words words
+    6. Remove duplicate reviews (compared on the cleaned text)
+    7. Save the cleaned dataset to output_path
+
+    Inputs:
+    input_path (str): Path to uber_reviews.csv
+    output_path (str): Path of the cleaned CSV, e.g. uber_reviews_cleaned.csv
+    min_words (int): Minimum number of words a review must have to be kept
+
+    Outputs:
+    pd.DataFrame: cleaned reviews with columns review, rating, word_count
+    """
+    print("="*50)
+    print("PREPROCESSING UBER REVIEWS")
+    print("="*50)
+
+    # 1. Load data
+    print("\n1. Loading data...")
+    df = pd.read_csv(input_path, low_memory=False)
+    print(f"   Original size: {len(df):,} reviews")
+
+    # 2. Remove missing reviews (copy so the new columns are added to an
+    #    independent frame rather than a view of the raw data)
+    print("\n2. Removing missing reviews...")
+    df = df.dropna(subset=['review_description']).copy()
+    print(f"   After removing nulls: {len(df):,} reviews")
+
+    # 3. Clean text
+    print("\n3. Cleaning text...")
+    df['review_clean'] = df['review_description'].apply(clean_text)
+
+    # 4. Calculate word count
+    df['word_count'] = df['review_clean'].str.split().str.len()
+
+    # 5. Remove short reviews
+    print(f"\n4. Removing short reviews (< {min_words})...")
+    print("   Rationale: Insufficient context for classification")
+    before = len(df)
+    df = df[df['word_count'] >= min_words]
+    removed = before - len(df)
+    # Guard the percentage: the input may contain no usable reviews at all.
+    removed_pct = removed / before * 100 if before else 0.0
+    print(f"   Removed: {removed:,} reviews ({removed_pct:.1f}%)")
+    print(f"   Remaining: {len(df):,} reviews")
+
+    # 6. Remove duplicates
+    print("\n5. Removing duplicates...")
+    before = len(df)
+    df = df.drop_duplicates(subset=['review_clean'])
+    removed = before - len(df)
+    print(f"   Removed: {removed:,} duplicates")
+    print(f"   Remaining: {len(df):,} reviews")
+
+    # 7. Final dataset
+    df_clean = (
+        df[['review_clean', 'rating', 'word_count']]
+        .rename(columns={'review_clean': 'review'})
+        .reset_index(drop=True)
+    )
+
+    # 8. Save
+    print(f"\n6. Saving to {output_path}...")
+    df_clean.to_csv(output_path, index=False)
+
+    # Summary
+    print("\n" + "="*50)
+    print("PREPROCESSING COMPLETE")
+    print("="*50)
+    print(f"\nFinal dataset: {len(df_clean):,} reviews")
+    print("Data source: Indian Uber market (predominantly English)")
+    print(f"Quality filters: word_count >= {min_words}, duplicates removed")
+
+    print("\nRating distribution:")
+    rating_dist = df_clean['rating'].value_counts().sort_index()
+    for rating, count in rating_dist.items():
+        percentage = count / len(df_clean) * 100
+        # Single quotes inside the f-string: reusing the outer quote type is
+        # a SyntaxError before Python 3.12.
+        print(f"  {rating}{'✭' * int(rating)}: {count:,} ({percentage:.1f}%)")
+
+    print("\nWord count statistics:")
+    print(f"  Mean: {df_clean['word_count'].mean():.1f} words")
+    print(f"  Median: {df_clean['word_count'].median():.1f} words")
+    print(f"  Min: {df_clean['word_count'].min()} words")
+    print(f"  Max: {df_clean['word_count'].max()} words")
+
+    print("\nVerify New Data:")
+    # Counts, not frames: each line should read 0 on a correctly cleaned set.
+    print(f"  Short reviews: {(df_clean['word_count'] < min_words).sum()}")
+    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
+    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
+    # lang detection takes 5+ mins
+    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
+    #print(f"  Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
+
+    # Sample reviews from each rating
+    print("\n" + "="*50)
+    print("SAMPLE CLEANED REVIEWS")
+    print("="*50)
+    for rating in [1, 2, 3, 4, 5]:
+        rated = df_clean[df_clean['rating'] == rating]
+        if rated.empty:
+            continue
+        sample = rated.sample(min(2, len(rated)))
+        print(f"\n{rating} {'✭' * rating} REVIEWS:")
+        for _, row in sample.iterrows():
+            print(f"  • ({row['word_count']} words) {row['review'][:100]}")
+
+    # Note about language
+    print("Language detection not applied due to unreliability on short")
+    print("informal text. Dataset is from the Indian market, labeled as English.")
+    print("Manual annotation phase will identify any non-English reviews. And put aside.")
+
+    return df_clean
+
+# Script entry point: clean the raw review export and write the filtered CSV.
+if __name__ == "__main__":
+    # Both paths are relative to the directory the script is started from.
+    input_file = "data/uber_reviews.csv"
+    output_file = "data/uber_reviews_cleaned.csv"
+    
+    df_clean = preprocess_uber_reviews(input_file, output_file)
+    print("\nPreprocessing complete!")
+    print(f"Clean dataset: {len(df_clean):,} reviews ready for sampling")
--- a/multitag/preprocessing_uber.ipynb
+++ b/multitag/preprocessing_uber.ipynb
@@ -0,0 +1,433 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "470fe7c6-1614-4daf-879f-e6c399117c7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "b855045e-2dd1-4fa1-ab5a-8ce8b50b02ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('data/uber_reviews.csv', low_memory=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "e7da1fb6-ede6-46c6-8fbd-fa491d3351c5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>source</th>\n",
+       "      <th>review_id</th>\n",
+       "      <th>user_name</th>\n",
+       "      <th>review_title</th>\n",
+       "      <th>review_description</th>\n",
+       "      <th>rating</th>\n",
+       "      <th>thumbs_up</th>\n",
+       "      <th>review_date</th>\n",
+       "      <th>developer_response</th>\n",
+       "      <th>developer_response_date</th>\n",
+       "      <th>appVersion</th>\n",
+       "      <th>laguage_code</th>\n",
+       "      <th>country_code</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>18d6584c-d0e9-4833-a744-f607058aee97</td>\n",
+       "      <td>Milky Way</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Suddenly, the driver can't have my location an...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:48:51</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>50a08f18-cece-4ddf-b617-028844c8aa28</td>\n",
+       "      <td>Bradlee Severa</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Very cordial.. And helped with a quick turnaro...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:38:35</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.485.10000</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7</td>\n",
+       "      <td>Amit Aggarwal</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Very good experience</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:38:17</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.486.10002</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>502702a9-25ed-4373-a96c-7fa1f06caacd</td>\n",
+       "      <td>Bryant Inman</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>All I use</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:37:45</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.467.10008</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Google Play</td>\n",
+       "      <td>f47a3fb6-23db-49bd-9e63-f33c8d724d07</td>\n",
+       "      <td>Addie Whittaker</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>I have enjoyed traveling by Uber my drivers ha...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2023-08-10 17:36:56</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4.486.10002</td>\n",
+       "      <td>en</td>\n",
+       "      <td>in</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        source                             review_id        user_name  \\\n",
+       "0  Google Play  18d6584c-d0e9-4833-a744-f607058aee97        Milky Way   \n",
+       "1  Google Play  50a08f18-cece-4ddf-b617-028844c8aa28   Bradlee Severa   \n",
+       "2  Google Play  b0d8e75a-80a7-4dcd-abaf-72b046dbeeb7    Amit Aggarwal   \n",
+       "3  Google Play  502702a9-25ed-4373-a96c-7fa1f06caacd     Bryant Inman   \n",
+       "4  Google Play  f47a3fb6-23db-49bd-9e63-f33c8d724d07  Addie Whittaker   \n",
+       "\n",
+       "  review_title                                 review_description  rating  \\\n",
+       "0          NaN  Suddenly, the driver can't have my location an...       1   \n",
+       "1          NaN  Very cordial.. And helped with a quick turnaro...       5   \n",
+       "2          NaN                               Very good experience       5   \n",
+       "3          NaN                                          All I use       5   \n",
+       "4          NaN  I have enjoyed traveling by Uber my drivers ha...       5   \n",
+       "\n",
+       "   thumbs_up          review_date developer_response developer_response_date  \\\n",
+       "0        0.0  2023-08-10 17:48:51                NaN                     NaN   \n",
+       "1        0.0  2023-08-10 17:38:35                NaN                     NaN   \n",
+       "2        0.0  2023-08-10 17:38:17                NaN                     NaN   \n",
+       "3        0.0  2023-08-10 17:37:45                NaN                     NaN   \n",
+       "4        0.0  2023-08-10 17:36:56                NaN                     NaN   \n",
+       "\n",
+       "    appVersion laguage_code country_code  \n",
+       "0          NaN           en           in  \n",
+       "1  4.485.10000           en           in  \n",
+       "2  4.486.10002           en           in  \n",
+       "3  4.467.10008           en           in  \n",
+       "4  4.486.10002           en           in  "
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "5c02ec54-4583-4720-88c6-1110b52c3f88",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "rating\n",
+      "1    283895\n",
+      "2     41707\n",
+      "3     49928\n",
+      "4     82953\n",
+      "5    611133\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df['rating'].value_counts().sort_index())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "1da5d625-a4ba-49f8-8314-cc9e0f4ef96a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Review length stats:\n",
+      "  Mean: 13.1 words\n",
+      "  Median: 4.0 words\n",
+      "  Min: 1.0 words\n",
+      "  Max: 755.0 words\n"
+     ]
+    }
+   ],
+   "source": [
+    "df['word_count'] = df['review_description'].str.split().str.len()\n",
+    "print('Review length stats:')\n",
+    "print(f\"  Mean: {df['word_count'].mean():.1f} words\")\n",
+    "print(f\"  Median: {df['word_count'].median():.1f} words\")\n",
+    "print(f\"  Min: {df['word_count'].min()} words\")\n",
+    "print(f\"  Max: {df['word_count'].max()} words\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "1c97e396-8f05-4df7-bd0a-1bbecf6911b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "short_reviews = df[df['word_count'] < 5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "55324c94-4944-4844-b00e-dc08c8989f7b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Reviews < 5 words: 569632 (53.3%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\nReviews < 5 words: {len(short_reviews)} ({len(short_reviews)/len(df)*100:.1f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "c45959fe-3e23-4831-a41a-94c89892247f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Missing values:\n",
+      "source                           0\n",
+      "review_id                        0\n",
+      "user_name                        1\n",
+      "review_title               1067436\n",
+      "review_description             169\n",
+      "rating                           0\n",
+      "thumbs_up                     2180\n",
+      "review_date                      0\n",
+      "developer_response          871352\n",
+      "developer_response_date     872338\n",
+      "appVersion                  241548\n",
+      "laguage_code                     0\n",
+      "country_code                     0\n",
+      "word_count                     169\n",
+      "dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\nMissing values:\")\n",
+    "print(df.isnull().sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "bf14e3db-a1b4-4fad-8102-b7ac25feeefa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Duplicate reviews: 422458\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Duplicate reviews: {df.duplicated(subset=['review_description']).sum()}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "8ccc07fa-9913-4047-ae17-35d2454eb059",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "==========================================\n",
+      "1 STAR REVIEWS:\n",
+      "==========================================\n",
+      "\n",
+      "cant put gift card on dont like app\n",
+      "(Length: 8.0 words)\n",
+      "\n",
+      "Zapłaciłem za przejazd, uber pobral środki z mojego konta. Potem byla aktualizacja ceny na niższą i znowu kazał płacić. Teraz aplikacja zablokowała się na ekranie potwierdzenia płatności.\n",
+      "(Length: 27.0 words)\n",
+      "\n",
+      "The app hasn't been able to process any payment. Takes forever to find a ride. I don't even know why this app still exists. Absolutely useless!\n",
+      "(Length: 26.0 words)\n",
+      "\n",
+      "==========================================\n",
+      "2 STAR REVIEWS:\n",
+      "==========================================\n",
+      "\n",
+      "In spite of receiving payment and acknowledging by email the app shows \n",
+      "payment due and disallowed booking and service not available to me. 4 days \n",
+      "have lapsed no solution to my problem. Problem solvi...\n",
+      "(Length: 37.0 words)\n",
+      "\n",
+      "Poor\n",
+      "(Length: 1.0 words)\n",
+      "\n",
+      "I had to reset my password and now I cant get in. Its telling me that my phone number is already in use. I need this fixed\n",
+      "(Length: 27.0 words)\n",
+      "\n",
+      "==========================================\n",
+      "3 STAR REVIEWS:\n",
+      "==========================================\n",
+      "\n",
+      "Nice\n",
+      "(Length: 1.0 words)\n",
+      "\n",
+      "Good rides\n",
+      "(Length: 2.0 words)\n",
+      "\n",
+      "Nice\n",
+      "(Length: 1.0 words)\n",
+      "\n",
+      "==========================================\n",
+      "4 STAR REVIEWS:\n",
+      "==========================================\n",
+      "\n",
+      "Good service\n",
+      "(Length: 2.0 words)\n",
+      "\n",
+      "A mobile number of the car driver should be an icon if Uber book for any other person, then it can be given the number.\n",
+      "(Length: 25.0 words)\n",
+      "\n",
+      "many times pick up locations is shifted automatically . overall good much better\n",
+      "(Length: 13.0 words)\n",
+      "\n",
+      "==========================================\n",
+      "5 STAR REVIEWS:\n",
+      "==========================================\n",
+      "\n",
+      "So friendly. Thank you\n",
+      "(Length: 4.0 words)\n",
+      "\n",
+      "comfortable journey with effodable price\n",
+      "(Length: 5.0 words)\n",
+      "\n",
+      "Good\n",
+      "(Length: 1.0 words)\n"
+     ]
+    }
+   ],
+   "source": [
+    "for rating in [1, 2, 3, 4, 5]:\n",
+    "    samples = df[df['rating'] == rating].sample(min(3, len(df[df['rating'] == rating])))\n",
+    "    print(f\"\\n{'='*42}\")\n",
+    "    print(f\"{rating} STAR REVIEWS:\")\n",
+    "    print(f\"{'='*42}\")\n",
+    "    for idx, row in samples.iterrows():\n",
+    "        review_text = row['review_description']\n",
+    "        print(f\"\\n{review_text[:200]}{'...' if len(review_text) > 200 else ''}\")\n",
+    "        print(f\"(Length: {row['word_count']} words)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b12dcb89-d291-447a-98f3-02817dc0eb8e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/multitag/sampler.py
+++ b/multitag/sampler.py
@@ -0,0 +1,66 @@
+import pandas as pd
+import numpy as np
+
+# Report the installed pandas / numpy versions at start-up.
+print(pd.__version__)
+print(np.__version__)
+
+# Locations of the full review dataset and of the stratified sample.
+path = "data/uber_reviews.csv"
+sampled_path = "data/uber_reviews_sampled.csv"
+class Sampler:
+    """Draw a sample of a CSV dataset that keeps each group's share."""
+
+    def __init__(self, data_path, target_samples=5000, stratify_column="rating"):
+        """Load the dataset and print a short overview of it.
+
+        Args:
+            data_path: CSV file to sample from.
+            target_samples: approximate size of the sample to draw.
+            stratify_column: column whose value distribution is preserved.
+        """
+        self.data_path = data_path
+        self.data = pd.read_csv(self.data_path, low_memory=False)
+        self.total = len(self.data)  # total number of records in the dataset
+        self.target_samples = target_samples  # target number of samples
+        self.stratify_column = stratify_column  # column to stratify by
+
+        print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
+        print(self.data.head())
+
+        self.data.info()
+
+    # Rating distribution of the full Uber dataset, for reference:
+    #   5    57.1% (611133)
+    #   1    26.5% (283895)
+    #   4     7.8% (82953)
+    #   3     4.7% (49928)
+    #   2     3.9% (41707)
+
+    def get_stratified_sample(self, random_state=None):
+        """Return a random sample with the same group proportions as the data.
+
+        Each group of stratify_column contributes
+        int(len(group) / total * target_samples) rows, so rounding down can
+        leave the result a few rows short of target_samples.
+
+        Args:
+            random_state: optional seed forwarded to DataFrame.sample so a
+                sample can be reproduced.
+
+        Returns:
+            DataFrame with every original column (including the stratify
+            column) and the original row index.
+        """
+        if self.total == 0:
+            return self.data.copy()
+
+        # Iterating over the groups keeps the grouping column in each piece
+        # on every pandas version; groupby.apply treats it differently
+        # between releases.
+        parts = [
+            group.sample(
+                # Never ask for more rows than the group holds.
+                n=min(int(len(group) / self.total * self.target_samples), len(group)),
+                random_state=random_state,
+            )
+            for _, group in self.data.groupby(self.stratify_column)
+        ]
+        if not parts:
+            return self.data.iloc[0:0].copy()
+        return pd.concat(parts)
+# Interactive entry point. Guarded so that importing Sampler from another
+# module neither loads the dataset nor prompts for input.
+if __name__ == "__main__":
+    # The data is loaded before the prompt because both branches need it:
+    # 'y' samples from it and 'n' compares the saved sample against it.
+    sampler = Sampler(path)
+
+    to_sample = input("Do you want to create a stratified sample of the data? (y/n): ").strip().lower()
+
+    if to_sample == 'y':
+        sampled = sampler.get_stratified_sample()
+        sampled.to_csv(sampled_path, index=False)
+        print("Original columns:", sampler.data.columns.tolist())
+        print("Sampled columns:", sampled.columns.tolist())
+        print(f"Stratified sample saved to {sampled_path}")
+    elif to_sample == 'n':
+        sampled_data = pd.read_csv(sampled_path, low_memory=False)
+        # Debug aid to check that the sample kept the original columns:
+        # print("Original columns:", sampler.data.columns.tolist())
+        # print("Sampled columns:", sampled_data.columns.tolist())
+
+        print("Original data distribution:")
+        print(sampler.data[sampler.stratify_column].value_counts())
+        print("Sampled data distribution:")
+        print(sampled_data[sampler.stratify_column].value_counts())
+    else:
+        print("Invalid input, please enter 'y' or 'n'")
--- a/multitag/shell.nix
+++ b/multitag/shell.nix
@@ -0,0 +1,16 @@
+# Development shell for the multitag tools: Python 3.13 with tkinter, pandas and numpy.
+{ pkgs ? import <nixpkgs> {} }:
+
+let
+  # A single interpreter environment that bundles all required libraries,
+  # instead of listing the interpreter and each library as separate inputs.
+  pythonEnv = pkgs.python313.withPackages (ps: with ps; [
+    tkinter
+    pandas
+    numpy
+  ]);
+in
+pkgs.mkShell {
+  buildInputs = [ pythonEnv ];
+
+  # Printed when the shell starts, to confirm which interpreter is active.
+  shellHook = ''
+    echo "Development environment loaded"
+    echo "Python: $(python --version)"
+  '';
+}
--- a/multitag/uber_cleaned.ipynb
+++ b/multitag/uber_cleaned.ipynb
@@ -0,0 +1,271 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "739e61bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import re\n",
+    "from langdetect import detect, LangDetectException\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "d9da1b98",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>review</th>\n",
+       "      <th>rating</th>\n",
+       "      <th>word_count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>suddenly, the driver can't have my location an...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>very cordial.. and helped with a quick turnaro...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>i have enjoyed traveling by uber my drivers ha...</td>\n",
+       "      <td>5</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>app is good but main problem is the drivers ca...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>23</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>very bad experience no customer service</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              review  rating  word_count\n",
+       "0  suddenly, the driver can't have my location an...       1          23\n",
+       "1  very cordial.. and helped with a quick turnaro...       5          11\n",
+       "2  i have enjoyed traveling by uber my drivers ha...       5          23\n",
+       "3  app is good but main problem is the drivers ca...       1          23\n",
+       "4            very bad experience no customer service       1           6"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv('data/uber_reviews_cleaned.csv')\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "91dc1d9a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "np.int64(6740)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(df['word_count'] > 100).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "827b6435",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Max length review:\n",
+      "i've been using uber for a few years now and for the most part haven't had any problems the only problem i have ever had that was never resolved or explained to me was when i was refunded cancellation fee because the driver pulled into me and then literally looked in my face and drove away and canceled the ride.buy still charges me.a cancellation fee the problem wasn't being charged the fee because i was reimbursed a few minutes later the problem was that they gave me a $5 uber credit and i don't have a debit card so i use uber gift cards so i had a balance on the gift card of roughly $4.85 and is an uber credit of $5 but for some reason you can't combine them to use on a single gel ride and there is a minimum of i think $6 or $7 for a ride so i was never able to use the money i was reimbursed and the remaing balance on my gift card was not enough for a full ride and that was 4 months ago and still have not been able to get a response as to how i can use the funds remaining mn my uber account or the uber credits i was reimbursed.... 
another time the driver took between 7-10 minutes to set the destination and begin driving and then when i was trying to help her with the directions because the gps was bringing us rather far out of the way to reach out destination she was very rude with me and then when we were approaching the turn that we needed to make and i warned her that it was coming up shortly and then said a few more times as we got closer to it she drove passed it the reason i was making sure she was aware of this was because we were in a rather busy highway and she would not be able to turn again for quite some time and i was already cutting it close to being late for work due to the delay in starting the trip and her failure to follow the direction i was giving her to get to.our destination so i said pull over into.the gas station and i will walk it'll be faster then driving at this point but she failed to pull into parking lot she just stopped in the middle.lf the highway granted we were in the right lane but she still put us both in a very dangerous situation and we were in a bad section of newark nj at around 10pm which for most people would put them in an uncomfortable position seeing how as the crime rate is extremely high in this area luckily i'm familiar with the residence of the neighborhood because i lived there for a few years in the past and then she notified uber that i was a disgruntled passenger and made her nervous from my reaction and in was warned to not let anything like this happen again... and the final problem.i had with uber was when my mother had ordered me.an uber through her account and during the ride we some.how started to discuss religion and i'm far from a religious person i do not even consider myself a member of any type of religion at all. 
but my driver was very dedicated to the religion he practiced so he got very upset and decided to call mother and tell her that i didn't not go the full distance of the original route he was hired to drive and that he had dropped me off a few blocks from where j was going to judge in some illegal activities (he knew i had some personal issues i was overcoming recently because i had mention it to him briefly in the beginning of the trip) and that he was not going to charge her the full ride amount because i did. ot cimplete the full distance and this caused my mother to become very upset with me and called me extremely untilni sent her a picture showing that i was in fact where i said i was going and the app showed that he did drop me off at the predetermined destination.. sorry for the long post but out of the probably 100 or so ride i have used with uber those were the only 3 problems i have ever had and they all 3 were actually in the same week\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"Max length review:\\n{df.loc[df['word_count'].idxmax(), 'review']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "7a811e3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "np.int64(11226)"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def has_spam_repetition(text):\n",
+    "    return bool(re.search(r'(.)\\1{4,}', str(text)))\n",
+    "df['review'].apply(has_spam_repetition).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "0a550434",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_check = df[df['word_count'] >= 10].sample(1000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "ec7b2ec5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def detect_language(text):\n",
+    "    \"\"\"Detect language of text\"\"\"\n",
+    "    try:\n",
+    "        if pd.isna(text) or len(str(text).strip()) < 10:\n",
+    "            return 'unknown'\n",
+    "        return detect(str(text))\n",
+    "    except LangDetectException:\n",
+    "        return 'unknown'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "d68dac67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_check['lang'] = sample_check['review'].apply(detect_language)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "9a8a49b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Language distribution in 10+ word reviews:\n",
+      "lang\n",
+      "en    960\n",
+      "es     11\n",
+      "id      7\n",
+      "pt      6\n",
+      "sv      2\n",
+      "sw      2\n",
+      "ar      2\n",
+      "ro      2\n",
+      "ta      1\n",
+      "bn      1\n",
+      "nl      1\n",
+      "da      1\n",
+      "so      1\n",
+      "ru      1\n",
+      "et      1\n",
+      "af      1\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\nLanguage distribution in 10+ word reviews:\")\n",
+    "print(sample_check['lang'].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6cf91607",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}