import re

import pandas as pd

try:
    from langdetect import LangDetectException, detect
except ImportError:
    # langdetect is only needed by detect_language(), which the pipeline
    # below does not call. Keep the module importable without it.
    detect = None

    class LangDetectException(Exception):
        """Stand-in so ``except LangDetectException`` stays valid."""


# Patterns are compiled once here because clean_text() runs once per review.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'\S+@\S+')
_REPEATED_PUNCT = (
    (re.compile(r'\.{2,}'), '.'),
    (re.compile(r'!{2,}'), '!'),
    (re.compile(r'\?{2,}'), '?'),
)
_WHITESPACE_RE = re.compile(r'\s+')

# Texts shorter than this (after stripping) are reported as 'unknown'
# instead of being passed to langdetect.
_MIN_LANG_DETECT_CHARS = 10

# Columns the pipeline reads from the input CSV.
_REQUIRED_COLUMNS = ('review_description', 'rating')

_STAR = "✭"


def clean_text(text) -> str:
    """Normalize a single review string.

    Steps, in order: lowercase, remove URLs (``http...`` / ``www...``),
    remove e-mail-like tokens, collapse runs of ``.``, ``!`` and ``?`` to a
    single character, collapse whitespace runs to one space and strip the
    ends.

    Args:
        text: The raw review text. Non-string values are converted with
            ``str()``.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing
        (``None`` / NaN).
    """
    if pd.isna(text):
        return ""

    # Lowercase first so later duplicate detection is case-insensitive.
    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    for pattern, replacement in _REPEATED_PUNCT:
        text = pattern.sub(replacement, text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def detect_language(text) -> str:
    """Detect the language of ``text`` using langdetect.

    Args:
        text: The text to classify.

    Returns:
        The value returned by ``langdetect.detect``, or ``'unknown'`` when
        the text is missing, shorter than 10 characters after stripping, or
        langdetect raises ``LangDetectException``.

    Raises:
        ImportError: If detection is actually required but langdetect is
            not installed.
    """
    if pd.isna(text):
        return 'unknown'

    candidate = str(text)
    if len(candidate.strip()) < _MIN_LANG_DETECT_CHARS:
        return 'unknown'

    if detect is None:
        raise ImportError(
            "detect_language() requires the 'langdetect' package"
        )

    try:
        return detect(candidate)
    except LangDetectException:
        return 'unknown'


def _percent(part, whole) -> float:
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _stars(rating) -> str:
    """Return one star per rating point.

    The rating column is read as float when any rating is missing, and
    ``str * float`` raises TypeError, hence the ``int()``.
    """
    return _STAR * int(rating)


def _print_summary(df_clean, min_words) -> None:
    """Print rating distribution, word-count stats and sanity checks."""
    print("\n" + "="*50)
    print("PREPROCESSING COMPLETE")
    print("="*50)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print(f"Quality filters: word_count >= {min_words}, duplicates removed")
    # While the length filter removes some legitimate reviews that would be
    # useful for classification, it yields a higher share of useful reviews.
    # After comparing thresholds of 1, 2, 3, 4 and 5, this kept the most
    # informative reviews without seeming excessive in data removal.

    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    total = len(df_clean)
    for rating, count in rating_dist.items():
        percentage = _percent(count, total)
        print(f" {rating}{_stars(rating)}: {count:,} ({percentage:.1f}%)")

    word_counts = df_clean['word_count']
    print("\nWord count statistics:")
    print(f" Mean: {word_counts.mean():.1f} words")
    print(f" Median: {word_counts.median():.1f} words")
    print(f" Min: {word_counts.min()} words")
    print(f" Max: {word_counts.max()} words")

    print("\nVerify New Data:")
    # Report a count (expected 0), consistent with the two checks below.
    print(f" Short reviews: {(word_counts < min_words).sum()}")
    print(f" Null values: {df_clean.isnull().sum().to_dict()}")
    print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")

    # lang detection takes 5+ mins so leaving it commented for now
    # df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
    # print(f" Detected languages:\n {df_clean['detected_lang'].value_counts()}")


def _print_samples(df_clean) -> None:
    """Print up to two randomly sampled reviews for each rating 1-5."""
    print("\n" + "="*50)
    print("SAMPLE CLEANED REVIEWS")
    print("="*50)

    for rating in [1, 2, 3, 4, 5]:
        subset = df_clean[df_clean['rating'] == rating]
        if subset.empty:
            continue
        sample = subset.sample(min(2, len(subset)))
        print(f"\n{rating} {_stars(rating)} REVIEWS:")
        for row in sample.itertuples(index=False):
            print(f" • ({row.word_count} words) {row.review[:100]}")


def preprocess_uber_reviews(input_path, output_path, min_words=5):
    """Load, clean, filter and save the Uber reviews dataset.

    Notes:
        - Language detection is not applied (the author found it unreliable
          on short informal text); the data is labelled as English but
          contains non-English text.
        - The input file must exist; a missing file is not handled here.
        - The CSV must contain ``review_description`` and ``rating`` columns.

    Steps:
        1. Load the CSV with ``pd.read_csv``.
        2. Drop rows whose ``review_description`` is missing.
        3. Clean each review with ``clean_text``.
        4. Compute the word count of each cleaned review.
        5. Drop reviews with fewer than ``min_words`` words.
        6. Drop duplicate cleaned reviews (first occurrence is kept).
        7. Save ``review``, ``rating`` and ``word_count`` to ``output_path``.

    Progress and summary statistics are printed along the way.

    Args:
        input_path (str): Path to the raw reviews CSV.
        output_path (str): Path the cleaned CSV is written to.
        min_words (int): Minimum number of words a cleaned review must have
            to be kept. Defaults to 5.

    Returns:
        pd.DataFrame: The cleaned reviews with columns ``review``,
        ``rating`` and ``word_count``.

    Raises:
        KeyError: If a required column is missing from the input CSV.
    """
    print("="*50)
    print("PREPROCESSING UBER REVIEWS")
    print("="*50)

    # 1. Load data
    print("\n1. Loading data...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" Original size: {len(df):,} reviews")

    # Fail before doing any work rather than at the final column selection.
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    # 2. Remove missing reviews
    print("\n2. Removing missing reviews...")
    df = df.dropna(subset=['review_description'])
    print(f" After removing nulls: {len(df):,} reviews")

    # 3. Clean text
    print("\n3. Cleaning text...")
    df['review_clean'] = df['review_description'].apply(clean_text)

    # 4. Calculate word count. astype(str) changes nothing for real rows
    # (clean_text always returns str) but keeps the .str accessor usable
    # when no rows are left and the column has a non-string dtype.
    df['word_count'] = df['review_clean'].astype(str).str.split().str.len()

    # 5. Remove short reviews
    print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {min_words})...")
    # 1 word reviews provide little to draw conclusions from and bloat the
    # dataset a lot, nearly 50% of reviews!
    before = len(df)
    df = df[df['word_count'] >= min_words]
    removed = before - len(df)
    print(f" Removed: {removed:,} reviews ({_percent(removed, before):.1f}%)")
    print(f" Remaining: {len(df):,} reviews")

    # 6. Remove duplicates
    print("\n5. Removing duplicates...")
    before = len(df)
    df = df.drop_duplicates(subset=['review_clean'])
    removed = before - len(df)
    print(f" Removed: {removed:,} duplicates")
    print(f" Remaining: {len(df):,} reviews")

    # 7. Final dataset
    df_clean = df[['review_clean', 'rating', 'word_count']].copy()
    df_clean.rename(columns={'review_clean': 'review'}, inplace=True)
    df_clean = df_clean.reset_index(drop=True)

    # 8. Save
    print(f"\n6. Saving to {output_path}...")
    df_clean.to_csv(output_path, index=False)

    _print_summary(df_clean, min_words)
    _print_samples(df_clean)

    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
    print(" ...Manual annotation phase will identify any non-English reviews")

    return df_clean


if __name__ == "__main__":
    input_file = "data/uber_reviews.csv"
    output_file = "data/uber_reviews_cleaned.csv"

    df_clean = preprocess_uber_reviews(input_file, output_file)

    print("\nPreprocessing complete!")
    print(f"Clean dataset: {len(df_clean):,} reviews ready for sampling")