Added multitag.py (65% complete), preprocess.py (complete), sampler.py (80% complete)

This commit is contained in:
2025-11-09 01:45:09 +00:00
parent 4d6e2511e6
commit a178284ffc
5 changed files with 179 additions and 1517 deletions

View File

@@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path):
df['word_count'] = df['review_clean'].str.split().str.len()
# 5. Remove short reviews
review_length_limit = 5
print(f"\n4. Removing short reviews (< {review_length_limit})...")
print(" Rationale: Insufficient context for classification")
review_length_limit = 5 ### limit review length ###
print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...")
# One-word reviews provide little to draw conclusions from and bloat the
# dataset considerably (nearly 50% of reviews!).
# display changes
before = len(df)
df = df[df['word_count'] >= review_length_limit]
removed = before - len(df)
@@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path):
print("PREPROCESSING COMPLETE")
print("="*50)
print(f"\nFinal dataset: {len(df_clean):,} reviews")
print(f"Data source: Indian Uber market (predominantly English)")
print(f"Quality filters: word_count >= 5, duplicates removed")
print(f"Quality filters: word_count >= 5, duplicates removed")
# While this does remove some legitimate reviews that would be useful for classification,
# it also allows us to find a higher total amount of useful reviews. After seeing the results of limits 1, 2, 3, 4 and 5,
# a limit of 5 showed the largest number of informative reviews without seeming excessive in data removal.
print("\nRating distribution:")
rating_dist = df_clean['rating'].value_counts().sort_index()
@@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path):
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
# lang detection takes 5+ mins
# lang detection takes 5+ mins so leaving it commented for now
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
@@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path):
if len(df_clean[df_clean['rating'] == rating]) > 0:
sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
print(f"\n{rating} {"" * rating} REVIEWS:")
for idx, row in sample.iterrows():
for index, row in sample.iterrows():
print(f" • ({row['word_count']} words) {row['review'][:100]}")
# Note about language
print("Language detection not applied due to unreliability on short")
print("informal text. Dataset is from the Indian market, labeled as English.")
print("Manual annotation phase will identify any non-English reviews. And put aside.")
print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
print(" ...Manual annotation phase will identify any non-English reviews")
return df_clean