# TODO: Fix get_stratified_sample(): replace the broken x() call with actual working logic.
# TODO: Add a verification comparison between ratings.
# TODO: Implement sample_with_keywords(): add to the keyword lists and implement the logic.
# TODO: Clean up the logging print statements.

import pandas as pd
import numpy as np

# Report the versions of both libraries, one per line.
print(pd.__version__, np.__version__, sep="\n")

# Dataset locations, relative to the working directory.
path = "multitag/data/uber_reviews_cleaned.csv"
sampled_path = "multitag/data/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv"  # only for distribution comparison


class Sampler:
    """Draw labelling samples from a review dataset loaded from CSV.

    Three strategies are available:

    * ``get_stratified_sample()`` - proportional to the rating distribution
      of the working data.
    * ``original_distribution_sample()`` - proportional to the rating
      distribution recorded for the original dataset.
    * ``sample_with_keywords()`` - balanced by rating, then topped up with
      reviews whose text matches bug / feature keyword lists.

    Every draw uses ``random_state=42`` so repeated runs give the same rows.

    NOTE: a ``get_proportional_sample()`` strategy was planned but is not
    implemented.
    """

    # Rating shares recorded for the original dataset (counts in brackets):
    #   5    57.1% (611133)
    #   1    26.5% (283895)
    #   4     7.8% (82953)
    #   3     4.7% (49928)
    #   2     3.9% (41707)
    # Insertion order fixes the row order of original_distribution_sample().
    ORIGINAL_RATING_SHARES = {5: 0.571, 1: 0.265, 4: 0.078, 3: 0.047, 2: 0.039}

    # Substrings that flag a review as a likely bug report.
    BUG_KEYWORDS = (
        "crash", "freeze", "error",
        "stop", "doesnt work", "doesn't work", "loading",
        "blank", "stuck", "load", "broken", "break",
        "glitch", "issue", "fix", "needs", "please repair",
        "failed", "responding",
    )

    # Substrings that flag a review as a likely feature request.
    # TODO: add keywords for feature classification
    FEATURE_KEYWORDS = (
        "need", "should", "add", "wish", "would", "benefit",
        "please add", "should have", "want", "missing",
        "require", "suggestion", "request", "could you",
        "include", "hope", "why not", "greatly", "option",
        "new", "system",
    )

    def __init__(self, data_path, target_samples):
        """Load the working and original datasets and print their rating mix.

        Args:
            data_path: CSV file holding the working data to sample from.
            target_samples: Number of rows the proportional strategies aim for.
        """
        self.data_path = data_path
        # Honour the caller's value; this used to be hard-coded to 5000.
        self.target_samples = target_samples
        # Column to stratify by (sample_with_keywords() additionally uses
        # keyword boosting to raise feature request / bug report numbers).
        self.stratify_column = "rating"

        # original_path is the module-level constant; it is loaded only so
        # the two distributions can be compared in the printout below.
        self.original_data = pd.read_csv(original_path, low_memory=False)
        self.data = pd.read_csv(self.data_path, low_memory=False)
        self.total = len(self.data)  # total number of records in the dataset

        print("=" * 50)
        print("SAMPLER INITIALIZED")
        print("=" * 50, "\n")

        print(f"Total records in dataset: {self.total}")
        print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
        print("Percentage distribution (working data):")
        working_dist = self.data[self.stratify_column].value_counts(normalize=True).sort_index()
        print((working_dist * 100).round(1), "\n")
        original_dist = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
        print(f"Original Distribution from {original_path}:")
        print((original_dist * 100).round(1), "\n")

        self.data.info()

    def get_stratified_sample(self) -> pd.DataFrame:
        """Sample each rating in proportion to its share of the working data.

        Per-rating sizes come from ``x()``. Because each size is floored (and
        raised to at least 1), the total may differ slightly from
        ``self.target_samples``.

        Returns:
            A new DataFrame with a fresh 0..n-1 index; empty when the working
            data has no rows.
        """
        # Iterate the groups explicitly. DataFrame.apply (without a groupby)
        # would hand x() one column at a time instead of one rating's rows.
        parts = [self.x(group) for _, group in self.data.groupby(self.stratify_column)]
        if not parts:
            return self.data.iloc[0:0].copy()
        return pd.concat(parts, ignore_index=True)

    def x(self, x):
        """Return the proportional share of rows from one rating group.

        The share is ``len(x) / self.total * self.target_samples`` floored to
        an int, at least 1, and never more than the group holds.

        Args:
            x: DataFrame containing the rows of a single group.

        Returns:
            A random sample of ``x`` (random_state=42).
        """
        n = int(len(x) / self.total * self.target_samples)
        # Clamp so a target larger than the dataset cannot over-request rows.
        n = min(max(n, 1), len(x))
        return x.sample(n=n, random_state=42)

    def original_distribution_sample(self):
        """Sample so the rating mix follows ORIGINAL_RATING_SHARES.

        Intended as the main sampling method for labelling because it keeps
        the composition of the original dataset. Note this ignores how much
        the distribution changed after preprocessing.

        Returns:
            A new DataFrame with a fresh index. A rating with fewer rows than
            its quota contributes every row it has.
        """
        original_dist = {
            rating: int(share * self.target_samples)
            for rating, share in self.ORIGINAL_RATING_SHARES.items()
        }
        print("Target Distribution =", original_dist)

        samples = []
        for rating, num_samples in original_dist.items():
            rating_data = self.data[self.data[self.stratify_column] == rating]
            if len(rating_data) < num_samples:
                print("Missing samples available for rating")
                num_samples = len(rating_data)
            samples.append(rating_data.sample(n=num_samples, random_state=42))
        return pd.concat(samples, ignore_index=True)

    def sample_with_keywords(self, *, per_rating=400, n_bugs=1500, n_features=1500):
        """Build a rating-balanced sample boosted with keyword matches.

        Created so later training sees more bug and feature data. With the
        defaults and five ratings this yields:

        - 2000 rows balanced by rating (400 per rating)
        - 1500 likely bugs matched with BUG_KEYWORDS
        - 1500 likely features matched with FEATURE_KEYWORDS

        The three parts never share a row. This relies on ``self.data`` having
        a unique index, which a freshly read CSV has.

        Args:
            per_rating: Rows to draw from each rating for the balanced part.
            n_bugs: Maximum number of keyword-matched bug rows to add.
            n_features: Maximum number of keyword-matched feature rows to add.

        Returns:
            A new DataFrame with the same columns as ``self.data`` and a
            fresh index. ``self.data`` itself is left unmodified.
        """
        # Single quotes inside the braces keep this valid before Python 3.12.
        print(f"\n{'=' * 50}")
        print("Keyword influenced / rating stratified set")
        print(f"\n{'=' * 50}")

        # Lower-case once, then test both keyword lists against the result.
        lowered = self.data["review"].map(lambda text: str(text).lower())
        likely_bug = lowered.map(
            lambda text: any(keyword in text for keyword in self.BUG_KEYWORDS)
        ).astype(bool)
        likely_feature = lowered.map(
            lambda text: any(keyword in text for keyword in self.FEATURE_KEYWORDS)
        ).astype(bool)
        print(f"Reviews with bug_keywords = {likely_bug.sum():,}")
        print(f"Reviews with feature_keywords = {likely_feature.sum():,}")

        n_ratings = self.data[self.stratify_column].nunique()
        print(f"Sampling {per_rating * n_ratings} reviews balanced ({per_rating} per rating)...")
        # Keep the original index (no reset) so the exclusions below refer to
        # the rows actually drawn, and keep the rating column in the result.
        balanced_parts = [
            group.sample(n=min(per_rating, len(group)), random_state=42)
            for _, group in self.data.groupby(self.stratify_column)
        ]
        base_sample = pd.concat(balanced_parts) if balanced_parts else self.data.iloc[0:0]

        print(f"Sampling {n_bugs} possible bug reports...")
        unused = ~self.data.index.isin(base_sample.index)
        bugs = self.data[likely_bug & unused]
        bug_sample = bugs.sample(n=min(n_bugs, len(bugs)), random_state=42)

        print(f"Sampling {n_features} possible feature requests...")
        unused = unused & ~self.data.index.isin(bug_sample.index)
        features = self.data[likely_feature & unused]
        feature_sample = features.sample(n=min(n_features, len(features)), random_state=42)

        # Combine all samples
        keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)

        print(f"\n Total samples: {len(keyword_sample):,}")
        return keyword_sample

    def save_sample(self, sample_df, output_path):
        """Save sample and display statistics.

        Args:
            sample_df: DataFrame to write; must contain the stratify column.
            output_path: Destination CSV path (written without the index).
        """
        sample_df.to_csv(output_path, index=False)

        print(f"\n{'='*50}")
        print("SAMPLE SAVED")
        print(f"{'='*50}")
        print(f"Location: {output_path}")
        print(f"Total samples: {len(sample_df):,}")
        print("\nDistribution:")
        for rating in sorted(sample_df[self.stratify_column].unique()):
            count = (sample_df[self.stratify_column] == rating).sum()
            pct = count / len(sample_df) * 100
            print(f"  {rating}★: {count:,} ({pct:.1f}%)")


def main():
    """Ask which sampling strategy to run, draw the sample and save it.

    Reads the choice (1-3) from standard input. An unrecognised choice is
    reported and nothing is written.
    """
    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)

    # Choose sampling strategy
    print(f"\n{'='*50}")
    print("SAMPLING STRATEGY OPTIONS")
    print(f"{'='*50}")
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    # The label now names the method that actually exists on Sampler.
    print("3. sample_with_keywords() balanced by rating and boosted using keyword lists")

    choice = input("\nEnter choice (1-3): ").strip()

    # Dispatch table: every strategy saves to the same output file.
    strategies = {
        "1": sampler.get_stratified_sample,
        "2": sampler.original_distribution_sample,
        "3": sampler.sample_with_keywords,
    }
    strategy = strategies.get(choice)
    if strategy is None:
        print(f"Unrecognised choice {choice!r}; nothing was sampled.")
        return

    sampler.save_sample(strategy(), "multitag/data/uber_reviews_sampled.csv")


# Run the interactive sampler only when this file is executed as a script.
if __name__ == "__main__":
    main()