# Source file: ReClass/multitag/preprocess.py
#
# Repository viewer metadata: 169 lines, 6.0 KiB, Python

import pandas as pd
import re
from langdetect import detect, LangDetectException
def clean_text(text):
    """Normalize a single review string.

    The text is lower-cased, URLs and e-mail addresses are removed, runs of
    repeated ``.``, ``!`` or ``?`` are collapsed to a single character, and
    every run of whitespace is collapsed to one space (leading and trailing
    whitespace is stripped).

    Input:
        text - the review text to clean; missing values (None/NaN) are accepted
    Outputs:
        str: the cleaned review text, or "" when ``text`` is missing
    """
    if pd.isna(text):
        return ""
    # Convert to lower for uniformity
    text = str(text).lower()
    # Remove URLs. The pattern is anchored on a word boundary and requires the
    # dot after "www" so that ordinary words which merely contain those
    # letters (e.g. "awwww") are not partially deleted.
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    # Normalize punctuation: collapse repeated ., ! and ? to one character
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'!{2,}', '!', text)
    text = re.sub(r'\?{2,}', '?', text)
    # Collapse whitespace runs to a single space and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    'unknown' is returned when ``text`` is missing, when it has fewer than
    10 characters after stripping surrounding whitespace, or when langdetect
    raises LangDetectException.
    """
    # Guard clauses: nothing to detect on missing or very short input.
    if pd.isna(text):
        return 'unknown'
    candidate = str(text)
    if len(candidate.strip()) < 10:
        return 'unknown'
    try:
        return detect(candidate)
    except LangDetectException:
        return 'unknown'
def preprocess_uber_reviews(input_path, output_path, review_length_limit=5):
    """
    Preprocess the Uber reviews by loading, cleaning, and filtering the data.

    Notes:
    - No language detection is applied (slow, and unreliable on short
      informal text); the data is labelled as English but contains
      non-English text.
    - The input CSV must contain the columns "review_description" and
      "rating".
    - A missing input file is not handled here; the error from
      pd.read_csv() propagates to the caller.

    Steps:
    1. Load from csv with pd.read_csv()
    2. Remove rows with missing descriptions
    3. Clean text with clean_text()
    4. Calculate word count for each cleaned review
    5. Remove reviews with fewer than ``review_length_limit`` words
    6. Remove duplicate cleaned reviews
    7. Save the cleaned dataset to ``output_path``

    A progress report, summary statistics and a random sample of reviews per
    rating are printed to stdout along the way.

    Inputs:
        input_path (str): Path to uber_reviews.csv
        output_path (str): Path to the cleaned CSV uber_reviews_cleaned.csv
        review_length_limit (int): Minimum number of words a cleaned review
            must have to be kept (default 5)
    Outputs:
        pd.DataFrame: the cleaned reviews with columns "review", "rating"
            and "word_count"
    Raises:
        KeyError: if a required column is missing from the input CSV
    """
    print("="*50)
    print("PREPROCESSING UBER REVIEWS")
    print("="*50)

    # 1. Load data
    print("\n1. Loading data...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" Original size: {len(df):,} reviews")

    # Fail fast on a wrong schema instead of after the cleaning work is done.
    missing_columns = [
        column for column in ('review_description', 'rating')
        if column not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Input CSV is missing required column(s): {missing_columns}"
        )

    # 2. Remove missing reviews
    print("\n2. Removing missing reviews...")
    df = df.dropna(subset=['review_description'])
    print(f" After removing nulls: {len(df):,} reviews")

    # 3. Clean text
    print("\n3. Cleaning text...")
    df['review_clean'] = df['review_description'].apply(clean_text)

    # 4. Calculate word count
    df['word_count'] = df['review_clean'].str.split().str.len()

    # 5. Remove short reviews
    print(f"\n4. Removing short reviews (< {review_length_limit})...")
    print(" Rationale: Insufficient context for classification")
    before = len(df)
    df = df[df['word_count'] >= review_length_limit]
    removed = before - len(df)
    # Guard the percentage: `before` is 0 when no row survived step 2.
    removed_pct = removed / before * 100 if before else 0.0
    print(f" Removed: {removed:,} reviews ({removed_pct:.1f}%)")
    print(f" Remaining: {len(df):,} reviews")

    # 6. Remove duplicates (compared on the cleaned text)
    print("\n5. Removing duplicates...")
    before = len(df)
    df = df.drop_duplicates(subset=['review_clean'])
    removed = before - len(df)
    print(f" Removed: {removed:,} duplicates")
    print(f" Remaining: {len(df):,} reviews")

    # 7. Final dataset
    df_clean = df[['review_clean', 'rating', 'word_count']].copy()
    df_clean.rename(columns={'review_clean': 'review'}, inplace=True)
    df_clean = df_clean.reset_index(drop=True)

    # 8. Save
    print(f"\n6. Saving to {output_path}...")
    df_clean.to_csv(output_path, index=False)

    # Summary
    print("\n" + "="*50)
    print("PREPROCESSING COMPLETE")
    print("="*50)
    print(f"\nFinal dataset: {len(df_clean):,} reviews")
    print("Data source: Indian Uber market (predominantly English)")
    print(f"Quality filters: word_count >= {review_length_limit}, duplicates removed")

    print("\nRating distribution:")
    rating_dist = df_clean['rating'].value_counts().sort_index()
    for rating, count in rating_dist.items():
        percentage = count / len(df_clean) * 100
        # NOTE(review): the original appended `"" * rating` here, an empty
        # string repeated `rating` times. It printed nothing, used nested
        # f-string quotes (a SyntaxError before Python 3.12) and raised
        # TypeError for float ratings. Presumably a star glyph was lost;
        # restore it here if that was the intent.
        print(f" {rating}: {count:,} ({percentage:.1f}%)")

    print("\nWord count statistics:")
    print(f" Mean: {df_clean['word_count'].mean():.1f} words")
    print(f" Median: {df_clean['word_count'].median():.1f} words")
    print(f" Min: {df_clean['word_count'].min()} words")
    print(f" Max: {df_clean['word_count'].max()} words")

    print("\nVerify New Data:")
    # Report a count (expected 0) rather than dumping a whole DataFrame.
    short_count = (df_clean['word_count'] < review_length_limit).sum()
    print(f" Short reviews: {short_count}")
    print(f" Null values: {df_clean.isnull().sum().to_dict()}")
    print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")

    # lang detection takes 5+ mins
    #df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
    #print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")

    # Sample reviews from each rating
    print("\n" + "="*50)
    print("SAMPLE CLEANED REVIEWS")
    print("="*50)
    for rating in [1, 2, 3, 4, 5]:
        # Filter once per rating and reuse the subset.
        rated = df_clean[df_clean['rating'] == rating]
        if rated.empty:
            continue
        sample = rated.sample(min(2, len(rated)))
        # Two spaces kept on purpose: the original interpolated an empty
        # string between them (see the note in the rating distribution loop).
        print(f"\n{rating}  REVIEWS:")
        for _, row in sample.iterrows():
            print(f" • ({row['word_count']} words) {row['review'][:100]}")

    # Note about language
    print("Language detection not applied due to unreliability on short")
    print("informal text. Dataset is from the Indian market, labeled as English.")
    print("Manual annotation phase will identify any non-English reviews. And put aside.")
    return df_clean
if __name__ == "__main__":
    # Script entry point: run the preprocessing pipeline on the default
    # dataset locations and report how many reviews remain.
    input_file, output_file = "data/uber_reviews.csv", "data/uber_reviews_cleaned.csv"
    df_clean = preprocess_uber_reviews(input_path=input_file, output_path=output_file)
    print("\nPreprocessing complete!")
    print(f"Clean dataset: {len(df_clean):,} reviews ready for sampling")