# ReClass/multitag/preprocess.py

import pandas as pd
import re
from langdetect import detect, LangDetectException

def clean_text(text):
    """Clean review text by removing URLs, emails, and excessive whitespace.

    The text is lower-cased, URLs and email-like tokens are dropped, runs of
    repeated '.', '!' or '?' are collapsed to a single character, and all
    whitespace runs are reduced to one space.

    Input:
    text - the review text to clean (missing values such as None/NaN allowed)

    Outputs:
    str: the cleaned review text ("" for missing values)
    """
    if pd.isna(text):
        return ""

    # Convert to lower for uniformity
    text = str(text).lower()

    # Remove URLs. Anchor on the actual URL markers ("http://", "https://",
    # "www.") so ordinary words that merely contain "http" or "www"
    # (e.g. "awwww") are left intact.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)

    # Remove emails (any whitespace-delimited token containing '@' with
    # characters on both sides)
    text = re.sub(r'\S+@\S+', '', text)

    # Normalize punctuation: collapse a run of the same mark to one
    # ("!!!" -> "!", "..." -> "."); mixed runs such as "?!" are untouched.
    text = re.sub(r'([.!?])\1+', r'\1', text)

    # Collapse every whitespace run to a single space and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

def detect_language(text):
    """Return langdetect's guess for the language of *text*.

    Missing values, and inputs with fewer than 10 characters once the
    surrounding whitespace is stripped, are reported as 'unknown'. The same
    value is returned when the detector raises LangDetectException.
    """
    # Guard clauses: nothing useful can be detected from missing or tiny input.
    if pd.isna(text):
        return 'unknown'

    candidate = str(text)
    if len(candidate.strip()) < 10:
        return 'unknown'

    try:
        return detect(candidate)
    except LangDetectException:
        return 'unknown'

def _print_banner(title, *, leading_blank=True):
    """Print *title* between two 50-character '=' rules."""
    rule = "=" * 50
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


def _rating_stars(rating):
    """Return one star per whole rating point, or '' if rating is not numeric."""
    try:
        # int() so float-typed rating columns (e.g. 5.0) do not break str * n.
        return "✭" * int(rating)
    except (TypeError, ValueError):
        return ""


def preprocess_uber_reviews(input_path, output_path, *, min_words=5):
    """
    preprocess_uber_reviews by loading, cleaning, and filtering the data.

    - No language detection due to unreliability on short informal text
    - Data is labelled as English, but contains non-english text
    - Does not create or check the input/output locations; errors from
      pd.read_csv / DataFrame.to_csv propagate to the caller
    - Requires the columns "review_description" and "rating"

    1. Load from csv pd.read_csv()
    2. Remove rows with missing descriptions
    3. Clean text with clean_text() and calculate word count for each review
    4. Remove reviews with fewer than min_words words
    5. Remove duplicate reviews (compared on the cleaned text)
    6. Save the cleaned dataset to output_path
    Afterwards a summary, sanity checks and a few random samples are printed.

    Inputs:
    input_path (str): Path to uber_reviews.csv
    output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
    min_words (int, keyword-only): minimum word count a cleaned review must
        have to be kept (default 5)

    Outputs:
    pd.DataFrame: the cleaned reviews with columns "review", "rating" and
        "word_count"

    Raises:
    KeyError: if a required column is missing from the input CSV
    """
    _print_banner("PREPROCESSING UBER REVIEWS", leading_blank=False)

    # 1. Load data
    print("\n1. Loading data...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f"   Original size: {len(df):,} reviews")

    # Fail fast instead of discovering a missing column after all the work.
    missing_columns = [
        column for column in ('review_description', 'rating')
        if column not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"{input_path} is missing required column(s): {missing_columns}"
        )

    # 2. Remove missing reviews
    print("\n2. Removing missing reviews...")
    df = df.dropna(subset=['review_description'])
    print(f"   After removing nulls: {len(df):,} reviews")

    # 3. Clean text and count whitespace-separated words of the cleaned text
    print("\n3. Cleaning text...")
    df['review_clean'] = df['review_description'].apply(clean_text)
    df['word_count'] = df['review_clean'].str.split().str.len()

    # 4. Remove short reviews
    print(f"\n4. Removing short reviews (< {min_words})...")
    print("   Rationale: Insufficient context for classification")
    before = len(df)
    df = df[df['word_count'] >= min_words]
    removed = before - len(df)
    # Guard the percentage: `before` is 0 when every row was null.
    removed_pct = removed / before * 100 if before else 0.0
    print(f"   Removed: {removed:,} reviews ({removed_pct:.1f}%)")
    print(f"   Remaining: {len(df):,} reviews")

    # 5. Remove duplicates (first occurrence of each cleaned text is kept)
    print("\n5. Removing duplicates...")
    before = len(df)
    df = df.drop_duplicates(subset=['review_clean'])
    removed = before - len(df)
    print(f"   Removed: {removed:,} duplicates")
    print(f"   Remaining: {len(df):,} reviews")

    # Final dataset: keep only the columns downstream steps need
    df_clean = (
        df[['review_clean', 'rating', 'word_count']]
        .rename(columns={'review_clean': 'review'})
        .reset_index(drop=True)
    )

    # 6. Save
    print(f"\n6. Saving to {output_path}...")
    df_clean.to_csv(output_path, index=False)

    # Summary
    _print_banner("PREPROCESSING COMPLETE")
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print("Data source: Indian Uber market (predominantly English)")
    print(f"Quality filters: word_count >= {min_words}, duplicates removed")

    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = count / len(df_clean) * 100
        print(f"  {rating}{_rating_stars(rating)}: {count:,} ({percentage:.1f}%)")

    print("\nWord count statistics:")
    print(f"  Mean: {df_clean['word_count'].mean():.1f} words")
    print(f"  Median: {df_clean['word_count'].median():.1f} words")
    print(f"  Min: {df_clean['word_count'].min()} words")
    print(f"  Max: {df_clean['word_count'].max()} words")

    # Sanity checks on the saved data; each should report zero.
    print("\nVerify New Data:")
    short_count = int((df_clean['word_count'] < min_words).sum())
    print(f"  Short reviews: {short_count}")
    print(f"  Null values: {df_clean.isnull().sum().to_dict()}")
    print(f"  Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
    # Language detection (detect_language) is deliberately not run here:
    # it takes 5+ minutes on the full dataset.

    # Sample reviews from each rating (random, so output varies per run)
    _print_banner("SAMPLE CLEANED REVIEWS")
    for rating in [1, 2, 3, 4, 5]:
        rated = df_clean[df_clean['rating'] == rating]
        if rated.empty:
            continue
        sample = rated.sample(min(2, len(rated)))
        print(f"\n{rating} {_rating_stars(rating)} REVIEWS:")
        for _, row in sample.iterrows():
            print(f"  • ({row['word_count']} words) {row['review'][:100]}")

    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. Dataset is from the Indian market, labeled as English.")
    print("Manual annotation phase will identify any non-English reviews. And put aside.")

    return df_clean

if __name__ == "__main__":
    # Script entry point: both paths are relative to the working directory.
    source_csv, cleaned_csv = "data/uber_reviews.csv", "data/uber_reviews_cleaned.csv"
    cleaned_reviews = preprocess_uber_reviews(source_csv, cleaned_csv)

    print("\nPreprocessing complete!")
    review_total = len(cleaned_reviews)
    print(f"Clean dataset: {review_total:,} reviews ready for sampling")