Added multitag.py (65% complete), preprocess.py (complete), sampler.py (80% complete)
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,4 @@
|
|||||||
multitag/data/*.csv
|
multitag/data/*.csv
|
||||||
multitag/raw_data/
|
multitag/raw_data/
|
||||||
|
multitag/.ipynb_checkpoints
|
||||||
|
multitag/.vscode
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,3 +1,14 @@
|
|||||||
|
# TODO: Refactor, especially change the expected names, as I jumped the gun when first making this without sampling properly
|
||||||
|
# TODO: Add button labels and finalise the categories of aspects
|
||||||
|
# TODO: Ensure there is a persistent progress-tracking implementation before labelling
|
||||||
|
# TODO: Finalise keybinds
|
||||||
|
# TODO: Display progress e.g. review 1020 of 5000
|
||||||
|
# TODO: Validate saving progress
|
||||||
|
# TODO: Loop instead of pressing enter
|
||||||
|
# TODO: Autosave ? / confirm quit at least
|
||||||
|
# TODO: More visual q's
|
||||||
|
|
||||||
|
|
||||||
import tkinter as tk
|
import tkinter as tk
|
||||||
from tkinter import ttk
|
from tkinter import ttk
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|||||||
@@ -88,9 +88,12 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
df['word_count'] = df['review_clean'].str.split().str.len()
|
df['word_count'] = df['review_clean'].str.split().str.len()
|
||||||
|
|
||||||
# 5. Remove short reviews
|
# 5. Remove short reviews
|
||||||
review_length_limit = 5
|
review_length_limit = 5 ### limit review length ###
|
||||||
print(f"\n4. Removing short reviews (< {review_length_limit})...")
|
print(f"\n4. Removing short reviews so reviews have better context / (usefulness) (< {review_length_limit})...")
|
||||||
print(" Rationale: Insufficient context for classification")
|
# 1 word reviews provide little to draw conclusions from and bloat the
|
||||||
|
# dataset a lot, nearly 50% of reviews!
|
||||||
|
|
||||||
|
# display changes
|
||||||
before = len(df)
|
before = len(df)
|
||||||
df = df[df['word_count'] >= review_length_limit]
|
df = df[df['word_count'] >= review_length_limit]
|
||||||
removed = before - len(df)
|
removed = before - len(df)
|
||||||
@@ -119,8 +122,10 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
print("PREPROCESSING COMPLETE")
|
print("PREPROCESSING COMPLETE")
|
||||||
print("="*50)
|
print("="*50)
|
||||||
print(f"\nFinal dataset: {len(df_clean):,} reviews")
|
print(f"\nFinal dataset: {len(df_clean):,} reviews")
|
||||||
print(f"Data source: Indian Uber market (predominantly English)")
|
|
||||||
print(f"Quality filters: word_count >= 5, duplicates removed")
|
print(f"Quality filters: word_count >= 5, duplicates removed")
|
||||||
|
# while this does remove some legitimate reviews which would be useful in classification
|
||||||
|
# it also allows us to find a higher total amount of useful reviews, after seeing the results of 1, 2, 3, 4, 5
|
||||||
|
# it showed the most amount of formative reviews without seeming excessive in data removal
|
||||||
|
|
||||||
print("\nRating distribution:")
|
print("\nRating distribution:")
|
||||||
rating_dist = df_clean['rating'].value_counts().sort_index()
|
rating_dist = df_clean['rating'].value_counts().sort_index()
|
||||||
@@ -138,7 +143,7 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
|
print(f" Short reviews: {df_clean[df_clean['word_count'] < 5]}")
|
||||||
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
|
print(f" Null values: {df_clean.isnull().sum().to_dict()}")
|
||||||
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
|
print(f" Duplicate reviews: {df_clean.duplicated(subset=['review']).sum()}")
|
||||||
# lang detection takes 5+ mins
|
# lang detection takes 5+ mins so leaving it commented for now
|
||||||
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
|
#df_clean['detected_lang'] = df_clean['review'].apply(detect_language)
|
||||||
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
|
#print(f" Detected languages:\n {df_clean['detected_lang'].value_counts( )}")
|
||||||
|
|
||||||
@@ -150,13 +155,13 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
if len(df_clean[df_clean['rating'] == rating]) > 0:
|
if len(df_clean[df_clean['rating'] == rating]) > 0:
|
||||||
sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
|
sample = df_clean[df_clean['rating'] == rating].sample(min(2, len(df_clean[df_clean['rating'] == rating])))
|
||||||
print(f"\n{rating} {"✭" * rating} REVIEWS:")
|
print(f"\n{rating} {"✭" * rating} REVIEWS:")
|
||||||
for idx, row in sample.iterrows():
|
for index, row in sample.iterrows():
|
||||||
print(f" • ({row['word_count']} words) {row['review'][:100]}")
|
print(f" • ({row['word_count']} words) {row['review'][:100]}")
|
||||||
|
|
||||||
# Note about language
|
# Note about language
|
||||||
print("Language detection not applied due to unreliability on short")
|
print("Language detection not applied due to unreliability on short")
|
||||||
print("informal text. Dataset is from the Indian market, labeled as English.")
|
print("informal text. The Uber Reviews Dataset is from the Indian market, labeled as English.")
|
||||||
print("Manual annotation phase will identify any non-English reviews. And put aside.")
|
print(" ...Manual annotation phase will identify any non-English reviews")
|
||||||
|
|
||||||
return df_clean
|
return df_clean
|
||||||
|
|
||||||
|
|||||||
@@ -1,22 +1,45 @@
|
|||||||
|
# TODO: Fix get_stratified_sample() replace broken x() with actual working logic
|
||||||
|
# TODO: Add verification comparison between ratings
|
||||||
|
# TODO: implement sample_with_keywords() add to lists, and implement logic
|
||||||
|
# TODO: Clean up the logging print statements
|
||||||
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
print(pd.__version__)
|
print(pd.__version__)
|
||||||
print(np.__version__)
|
print(np.__version__)
|
||||||
|
|
||||||
path = "data/uber_reviews.csv"
|
path = "multitag/data/uber_reviews_cleaned.csv"
|
||||||
sampled_path = "data/uber_reviews_sampled.csv"
|
sampled_path = "multitag/data/uber_reviews_sampled.csv"
|
||||||
|
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
|
||||||
class Sampler:
|
class Sampler:
|
||||||
def __init__(self, data_path):
|
def __init__(self, data_path, target_samples):
|
||||||
|
|
||||||
self.data_path = data_path
|
self.data_path = data_path
|
||||||
|
self.target_samples = 5000 # target number of samples
|
||||||
|
self.stratify_column = "rating" # column to stratify by (another sampleset will use keyword boosting to aid feature request / bug report numbers)
|
||||||
|
|
||||||
|
self.original_data = pd.read_csv(original_path, low_memory=False)
|
||||||
self.data = pd.read_csv(self.data_path, low_memory=False)
|
self.data = pd.read_csv(self.data_path, low_memory=False)
|
||||||
self.total = len(self.data) # total number of records in the dataset
|
self.total = len(self.data) # total number of records in the dataset
|
||||||
self.target_samples = 5000 # target number of samples
|
|
||||||
self.stratify_column = "rating" # column to stratify by
|
|
||||||
|
|
||||||
|
print("="*50)
|
||||||
|
print("SAMPLER INITIALIZED")
|
||||||
|
print("="*50,"\n")
|
||||||
|
|
||||||
|
|
||||||
|
print(f"Total records in dataset: {self.total}")
|
||||||
print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
|
print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
|
||||||
print(self.data.head())
|
#print(self.data.head())
|
||||||
|
#print(f"\nCurrent distribution:")
|
||||||
|
#print(self.data[self.stratify_column].value_counts().sort_index())
|
||||||
|
#print(f"\nColumns: {self.data.columns.tolist()}")
|
||||||
|
print(f"Percentage distribution (working data):")
|
||||||
|
print((self.data[self.stratify_column].value_counts(normalize=True).sort_index() * 100).round(1),"\n")
|
||||||
|
_origdist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
|
||||||
|
print(f"Original Distribution from {original_path}:")
|
||||||
|
print((_origdist*100).round(1),"\n")
|
||||||
|
|
||||||
self.data.info()
|
self.data.info()
|
||||||
|
|
||||||
@@ -31,36 +54,128 @@ class Sampler:
|
|||||||
2 3.9% (41707)
|
2 3.9% (41707)
|
||||||
Name: proportion, dtype: object
|
Name: proportion, dtype: object
|
||||||
"""
|
"""
|
||||||
|
"""
|
||||||
|
|
||||||
def get_stratified_sample(self):
|
Sample size by rating
|
||||||
stratified_sample = self.data.groupby(self.stratify_column).apply(
|
Redundant calculation, kept for clarity
|
||||||
lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)),
|
Doesn't factor that the distribution changed greatly after preprocessing
|
||||||
# include_groups=False
|
|
||||||
)
|
"""
|
||||||
|
def get_stratified_sample(self) -> pd.DataFrame:
    """Draw a sample whose rating mix mirrors the loaded (working) data.

    Every group of ``self.stratify_column`` contributes
    ``int(len(group) / self.total * self.target_samples)`` rows, capped at
    the number of rows the group actually holds.

    Returns:
        A DataFrame with the sampled rows, a fresh 0..n-1 index and the same
        columns as ``self.data`` (the stratify column stays a regular column
        so the result can be summarised and written out per rating).
    """
    pieces = []
    for _, group in self.data.groupby(self.stratify_column):
        # Proportional share of the target size, truncated towards zero.
        wanted = int(len(group) / self.total * self.target_samples)
        # DataFrame.sample raises when asked for more rows than exist.
        pieces.append(group.sample(n=min(wanted, len(group))))

    if not pieces:
        # pd.concat([]) raises ValueError; empty input gives an empty sample.
        return self.data.iloc[0:0].copy()
    return pd.concat(pieces, ignore_index=True)
|
||||||
sampler = Sampler("data/uber_reviews.csv")
|
|
||||||
|
|
||||||
|
|
||||||
|
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
||||||
to_sample = input("Do you want to create a stratified sample of the data? (y/n): ")
|
def x(self, group=None):
    """Proportional per-group sampler used for rating stratification.

    Args:
        group: optional DataFrame holding the rows of one group. When it is
            supplied the sampled rows are returned directly, which lets the
            bound method be used as a ``groupby(...).apply`` callback.

    Returns:
        The sampled DataFrame when ``group`` is given; otherwise a callable
        that accepts a group and returns its sample (the previous behaviour
        of calling ``x()`` with no argument).
    """
    def _take(frame):
        # Group's proportional share of the target size, truncated.
        share = int(len(frame) / self.total * self.target_samples)
        # Never request more rows than the group has; sample() would raise.
        return frame.sample(n=min(share, len(frame)))

    if group is None:
        return _take
    return _take(group)
|
||||||
if to_sample == 'y':
|
|
||||||
sampled = sampler.get_stratified_sample()
|
|
||||||
sampled.to_csv("data/uber_reviews_sampled.csv", index=False)
|
|
||||||
print("Original columns:", sampler.data.columns.tolist())
|
|
||||||
print("Sampled columns:", sampled.columns.tolist())
|
|
||||||
print("Stratified sample saved to data/uber_reviews_sampled.csv")
|
|
||||||
elif to_sample == 'n':
|
|
||||||
sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False)
|
|
||||||
"""
|
"""
|
||||||
debug to check sampled data matches original columns
|
get_proportional_sample()
|
||||||
print("Original columns:", sampler.data.columns.tolist())
|
|
||||||
print("Sampled columns:", sampled_data.columns.tolist())
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
print("Original data distribution:")
|
"""
|
||||||
print(sampler.data["rating"].value_counts())
|
original_distribution_sample()
|
||||||
print("Sampled data distribution:")
|
The main sampling method for our labelling as it
|
||||||
print(sampled_data["rating"].value_counts())
|
keeps composition of the original uber dataset
|
||||||
else:
|
which is a fairer comparison, may also work better in general
|
||||||
print("Invalid input, please enter 'y' or 'n'")
|
|
||||||
|
inputs:
|
||||||
|
|
||||||
|
outputs:
|
||||||
|
|
||||||
|
"""
|
||||||
|
def original_distribution_sample(self):
    """Sample ``self.data`` using a fixed per-rating target distribution.

    The per-rating quota is ``round(share * self.target_samples)``. Rounding
    (rather than truncating) keeps the quotas from drifting below the
    requested total when a product lands just under a whole number.
    If a rating has fewer rows than its quota, all of its rows are taken and
    a message naming the rating is printed.

    Returns:
        A DataFrame of the sampled rows, concatenated in the order
        5, 1, 4, 3, 2 with a fresh 0..n-1 index. Sampling is seeded
        (``random_state=33``) so repeated runs give the same rows.
    """
    # Hard-coded rating shares; presumably the rating percentages of the
    # original (pre-cleaning) dataset — TODO confirm against that file.
    original_shares = {5: 0.571, 1: 0.265, 4: 0.078, 3: 0.047, 2: 0.039}
    original_dist = {
        rating: round(share * self.target_samples)
        for rating, share in original_shares.items()
    }
    print("Target Distribution =", original_dist)

    samples = []
    for rating, num_samples in original_dist.items():
        rating_data = self.data[self.data[self.stratify_column] == rating]
        available = len(rating_data)
        if available < num_samples:
            print(
                f"Missing samples available for rating {rating}: "
                f"wanted {num_samples}, only {available} available"
            )
            num_samples = available
        samples.append(rating_data.sample(n=num_samples, random_state=33))

    return pd.concat(samples, ignore_index=True)
|
||||||
|
|
||||||
|
"""
|
||||||
|
sample_with_keywords()
|
||||||
|
|
||||||
|
In order to train on more bugs and features data in
|
||||||
|
future this method was created
|
||||||
|
- 2000 balanced by rating (400 per)
|
||||||
|
- 1500 likely bugs using bug_keywords list
|
||||||
|
- 1500 likely features using feature_keywords list
|
||||||
|
|
||||||
|
inputs:
|
||||||
|
outputs:
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def sample_with_keywords():
    """Placeholder for the keyword-influenced / rating-stratified sample.

    Currently only prints a banner and returns ``None``; no sampling is done
    yet. The bug keyword list below is defined but not used so far.

    NOTE(review): this takes no ``self``; if it is meant to be a method of
    the sampler class it cannot be called on an instance as written — confirm
    the intended placement before implementing the logic.
    """
    # TODO add keywords for feature classification
    # Single quotes inside the f-string keep this parseable before
    # Python 3.12 and match the banner style used when saving samples.
    print(f"\n{'=' * 50}")
    print("Keyword influenced / rating stratified set")
    print(f"\n{'=' * 50}")

    # Terms intended to flag likely bug reports (not consumed yet).
    bug_keywords = [
        "crash", "crashes", "freeze", "freezes", "error",
        "stops", "doesnt work", "doesn't work", "loading",
        "blank", "stuck", "load", "loads", "broken", "breaks",
        "glitch", "glitches", "issue", "could you", "fix",
        "failed",
    ]

    return
|
||||||
|
|
||||||
|
def save_sample(self, sample_df, output_path):
    """Write the sample to CSV, then print a short summary.

    Args:
        sample_df: DataFrame holding the sampled rows; it must contain the
            column named by ``self.stratify_column``.
        output_path: destination of the CSV file (written without the index).
    """
    sample_df.to_csv(output_path, index=False)

    banner = f"{'='*50}"
    row_total = len(sample_df)
    print(f"\n{banner}")
    print("SAMPLE SAVED")
    print(banner)
    print(f"Location: {output_path}")
    print(f"Total samples: {row_total:,}")
    print(f"\nDistribution:")

    # One line per distinct value of the stratify column, in ascending order.
    labels = sample_df[self.stratify_column]
    for label in sorted(labels.unique()):
        hits = (labels == label).sum()
        share = hits / row_total * 100
        print(f" {label}★: {hits:,} ({share:.1f}%)")
|
||||||
|
|
||||||
|
def main():
    """Interactively pick a sampling strategy, build the sample and save it.

    Loads the cleaned reviews into a ``Sampler``, prints the menu, reads the
    choice from stdin and writes the chosen sample to ``sampled_path``.
    Option 3 and unrecognised input print a message and save nothing.
    """
    # Module-level ``path`` / ``sampled_path`` hold the same locations that
    # were previously repeated here as string literals.
    sampler = Sampler(path, target_samples=5000)

    # Choose sampling strategy
    print(f"\n{'='*50}")
    print("SAMPLING STRATEGY OPTIONS")
    print(f"{'='*50}")
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    print("3. get_keyword_boosted_sample() stratified using original distribution but also using a keyword dictionary")

    choice = input("\nEnter choice (1-3): ").strip()

    # Map each implemented menu entry to the method that builds its sample.
    strategies = {
        "1": sampler.get_stratified_sample,
        "2": sampler.original_distribution_sample,
    }

    if choice == "3":
        # No get_keyword_boosted_sample() method exists and the keyword
        # sampler is still a stub, so there is nothing to save yet.
        print("Keyword-boosted sampling is not implemented yet")
        return

    build_sample = strategies.get(choice)
    if build_sample is None:
        print("Invalid input, please enter 1, 2 or 3")
        return

    sampler.save_sample(build_sample(), sampled_path)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user