Added multitag.py (65% complete), preprocess.py (complete), sampler.py (80% complete)

This commit is contained in:
2025-11-09 01:45:09 +00:00
parent 4d6e2511e6
commit a178284ffc
5 changed files with 179 additions and 1517 deletions

View File

@@ -1,22 +1,45 @@
# TODO: Fix get_stratified_sample(): replace the broken x() helper with actual working logic
# TODO: Add a verification comparison between ratings
# TODO: Implement sample_with_keywords(): add the keyword lists and implement the sampling logic
# TODO: Clean up the logging print statements
import pandas as pd
import numpy as np
# Print the pandas and NumPy versions when the module is loaded.
print(pd.__version__)
print(np.__version__)
# NOTE(review): the next two assignments are overwritten by the
# "multitag/..." values directly below, so they have no lasting effect.
path = "data/uber_reviews.csv"
sampled_path = "data/uber_reviews_sampled.csv"
# Effective values: the working CSV and the sampled-output CSV
# (file names suggest cleaned input / sampled output -- confirm with callers).
path = "multitag/data/uber_reviews_cleaned.csv"
sampled_path = "multitag/data/uber_reviews_sampled.csv"
original_path = "multitag/data/uber_reviews.csv" ### only for distribution comparison
class Sampler:
def __init__(self, data_path, target_samples=5000):
    """Load the working and original datasets and print a summary.

    Args:
        data_path: Path of the CSV file to sample from.
        target_samples: Total number of rows the sampling methods aim
            for. Defaults to 5000, the value that used to be hard-coded
            here, so one-argument construction keeps working.
    """
    self.data_path = data_path
    # Honour the caller's value; previously the argument was accepted and
    # then overwritten with a literal 5000.
    self.target_samples = target_samples
    # Column to stratify by (another sample set will use keyword boosting
    # to aid feature request / bug report numbers).
    self.stratify_column = "rating"
    # The original dataset is loaded only for the distribution comparison
    # printed at the end of this method.
    self.original_data = pd.read_csv(original_path, low_memory=False)
    self.data = pd.read_csv(self.data_path, low_memory=False)
    self.total = len(self.data)  # total number of records in the dataset

    banner = "=" * 50
    print(banner)
    print("SAMPLER INITIALIZED")
    print(banner, "\n")
    print(f"Total records in dataset: {self.total}")
    print(f"Data loaded from {self.data_path}, total records: {len(self.data)}")
    print(self.data.head())
    # Optional debugging output, left disabled:
    # print(f"\nCurrent distribution:")
    # print(self.data[self.stratify_column].value_counts().sort_index())
    # print(f"\nColumns: {self.data.columns.tolist()}")

    working_share = self.data[self.stratify_column].value_counts(normalize=True).sort_index()
    print("Percentage distribution (working data):")
    print((working_share * 100).round(1), "\n")

    original_share = self.original_data[self.stratify_column].value_counts(normalize=True).sort_index()
    print(f"Original Distribution from {original_path}:")
    print((original_share * 100).round(1), "\n")

    self.data.info()
@@ -31,36 +54,128 @@ class Sampler:
2 3.9% (41707)
Name: proportion, dtype: object
"""
def get_stratified_sample(self):
    """Return a per-rating sample sized by each group's share of the data.

    Every group of ``self.stratify_column`` contributes
    ``int(len(group) / self.total * self.target_samples)`` randomly chosen
    rows; the pieces are combined by ``groupby().apply``.
    """
    def _proportional_draw(group):
        # Same arithmetic as before: share first, then scale, then truncate.
        quota = int(len(group) / self.total * self.target_samples)
        return group.sample(n=quota)

    by_rating = self.data.groupby(self.stratify_column)
    # include_groups=False was considered here but is left disabled.
    return by_rating.apply(_proportional_draw)
# Interactive driver: either create and save a stratified sample, or load the
# previously saved sample and compare rating counts.
# NOTE(review): this call passes only data_path to Sampler.
sampler = Sampler("data/uber_reviews.csv")
# 'y' draws and saves a new sample; 'n' inspects the sample already on disk.
to_sample = input("Do you want to create a stratified sample of the data? (y/n): ")
if to_sample == 'y':
# Draw the sample and write it to CSV without the index.
sampled = sampler.get_stratified_sample()
sampled.to_csv("data/uber_reviews_sampled.csv", index=False)
# Print both column lists so they can be compared by eye.
print("Original columns:", sampler.data.columns.tolist())
print("Sampled columns:", sampled.columns.tolist())
print("Stratified sample saved to data/uber_reviews_sampled.csv")
elif to_sample == 'n':
# Load the sample written by an earlier 'y' run.
sampled_data = pd.read_csv("data/uber_reviews_sampled.csv", low_memory=False)
# The bare string below is disabled debug code; it is evaluated and discarded.
"""
debug to check sampled data matches original columns
print("Original columns:", sampler.data.columns.tolist())
print("Sampled columns:", sampled_data.columns.tolist())
"""
# Show per-rating row counts for the full data and for the saved sample.
print("Original data distribution:")
print(sampler.data["rating"].value_counts())
print("Sampled data distribution:")
print(sampled_data["rating"].value_counts())
else:
# Any answer other than 'y' or 'n' is reported and nothing else happens.
print("Invalid input, please enter 'y' or 'n'")
Sample size by rating
Redundant calculation, kept for clarity
Doesn't factor that the distribution changed greatly after preprocessing
"""
def get_stratified_sample(self) -> pd.DataFrame:
    """Sample every rating group in proportion to its share of ``self.data``.

    Each group of ``self.stratify_column`` contributes
    ``int(len(group) / self.total * self.target_samples)`` rows, capped at
    the number of rows the group actually has. Because every quota is
    truncated, the result can hold slightly fewer rows than
    ``self.target_samples``.

    Returns:
        A DataFrame with a fresh 0..n-1 index that still contains the
        stratify column; empty when ``self.data`` has no groups.
    """
    # Sample group by group rather than ``groupby().apply(self.x)``:
    # ``self.x`` accepts no group argument, so pandas' call ``self.x(group)``
    # raised TypeError. Concatenating explicitly also keeps the stratify
    # column in the result regardless of how ``apply`` treats group keys.
    parts = [
        group.sample(
            n=min(len(group), int(len(group) / self.total * self.target_samples))
        )
        for _, group in self.data.groupby(self.stratify_column)
    ]
    if not parts:
        # pd.concat refuses an empty list; return an empty frame that keeps
        # the original columns instead.
        return self.data.iloc[0:0].copy()
    return pd.concat(parts, ignore_index=True)
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
def x(self):
    """Build and return a function that samples one group proportionally.

    The returned function takes a DataFrame group and draws
    ``int(len(group) / self.total * self.target_samples)`` random rows
    from it. ``self.total`` and ``self.target_samples`` are read when the
    returned function is called, not when it is created.
    """
    def sample_group(group):
        quota = int(len(group) / self.total * self.target_samples)
        return group.sample(n=quota)

    return sample_group
"""
get_proportional_sample()
"""
"""
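original_distribution_sample()
The main sampling method for our labelling: it keeps the
rating composition of the original Uber dataset, which makes
for a fairer comparison and may also work better in general.
inputs: none (reads self.data, self.stratify_column and self.target_samples)
outputs: DataFrame containing the sampled rows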
"""
def original_distribution_sample(self, distribution=None):
    """Sample ``self.data`` so ratings follow a fixed target distribution.

    Args:
        distribution: Optional mapping of rating -> share of
            ``self.target_samples`` (e.g. ``{5: 0.571, 1: 0.265}``).
            Defaults to the hard-coded rating shares used so far
            (57.1% / 26.5% / 7.8% / 4.7% / 3.9% for ratings 5/1/4/3/2).

    Returns:
        A DataFrame with a fresh 0..n-1 index. Rows appear grouped by
        rating in the order the distribution lists them. When a rating has
        fewer rows than requested, all of its rows are used.
    """
    if distribution is None:
        distribution = {5: 0.571, 1: 0.265, 4: 0.078, 3: 0.047, 2: 0.039}

    # round() instead of int(): a product such as 0.047 * 5000 may land a
    # hair below the intended integer in floating point, and truncation
    # would then silently drop a row from that rating.
    original_dist = {
        rating: round(share * self.target_samples)
        for rating, share in distribution.items()
    }
    print("Target Distribution =", original_dist)

    samples = []
    for rating, num_samples in original_dist.items():
        rating_data = self.data[self.data[self.stratify_column] == rating]
        available = len(rating_data)
        if available < num_samples:
            print(
                f"Not enough rows for rating {rating}: "
                f"wanted {num_samples}, only {available} available"
            )
            num_samples = available
        # Fixed seed keeps the labelling sample reproducible between runs.
        samples.append(rating_data.sample(n=num_samples, random_state=33))

    if not samples:
        # An empty distribution would make pd.concat raise.
        return self.data.iloc[0:0].copy()
    return pd.concat(samples, ignore_index=True)
"""
sample_with_keywords()
Created so that future training runs see more bug and
feature-request data. Planned composition:
- 2000 balanced by rating (400 per rating)
- 1500 likely bugs, matched using the bug_keywords list
- 1500 likely features, matched using the feature_keywords list
inputs: none
outputs: none yet (the sampling logic is not implemented)
"""
def sample_with_keywords(self):
    """Print the banner for the keyword-boosted sample (logic not written yet).

    The sampling itself is still a TODO: only the header is printed and the
    bug keyword list is defined. ``self`` is accepted so the method can be
    called on a Sampler instance like its sibling methods.

    Returns:
        None, until the sampling logic is implemented.
    """
    # TODO add keywords for feature classification
    # Build the rule once; nesting double quotes inside a double-quoted
    # f-string is a SyntaxError before Python 3.12.
    rule = "=" * 50
    print(f"\n{rule}")
    print("Keyword influenced / rating stratified set")
    print(f"\n{rule}")
    # Terms intended to flag likely bug reports; not used yet.
    bug_keywords = ["crash", "crashes", "freeze", "freezes", "error",
                    "stops", "doesnt work", "doesn't work", "loading",
                    "blank", "stuck", "load", "loads", "broken", "breaks",
                    "glitch", "glitches", "issue", "could you", "fix",
                    "failed"]
    return None
def save_sample(self, sample_df, output_path):
    """Write the sample to CSV, then print where it went and its rating mix.

    The CSV is written without the index. The printed summary lists the
    total row count and, for each distinct value of
    ``self.stratify_column`` in ascending order, its count and percentage.
    """
    sample_df.to_csv(output_path, index=False)

    rule = "=" * 50
    row_count = len(sample_df)
    print(f"\n{rule}")
    print("SAMPLE SAVED")
    print(f"{rule}")
    print(f"Location: {output_path}")
    print(f"Total samples: {row_count:,}")
    print("\nDistribution:")

    ratings = sample_df[self.stratify_column]
    for rating in sorted(ratings.unique()):
        count = (ratings == rating).sum()
        pct = count / row_count * 100
        print(f" {rating}★: {count:,} ({pct:.1f}%)")
def main():
    """Ask which sampling strategy to run, run it, and save the result.

    Options 1 and 2 draw a sample and save it to the sampled-reviews CSV.
    Option 3 (keyword-boosted sampling) is reported as not implemented
    instead of calling a method that does not exist. Any other input is
    reported as invalid and nothing is saved.
    """
    sampler = Sampler("multitag/data/uber_reviews_cleaned.csv", target_samples=5000)
    output_path = "multitag/data/uber_reviews_sampled.csv"

    # Choose sampling strategy
    rule = "=" * 50
    print(f"\n{rule}")
    print("SAMPLING STRATEGY OPTIONS")
    print(rule)
    print("1. get_stratified_sample() stratified by current distribution")
    print("2. original_distribution_sample() stratified by the original data distribution")
    print("3. sample_with_keywords() stratified using original distribution but also using a keyword dictionary (not implemented yet)")
    choice = input("\nEnter choice (1-3): ").strip()

    if choice == '3':
        # The menu used to call get_keyword_boosted_sample(), which is not
        # defined; the keyword sampler has no logic yet, so bail out cleanly.
        print("Keyword-boosted sampling is not implemented yet; nothing was saved.")
        return

    strategies = {
        '1': sampler.get_stratified_sample,
        '2': sampler.original_distribution_sample,
    }
    strategy = strategies.get(choice)
    if strategy is None:
        print(f"Invalid choice {choice!r}; please enter 1, 2 or 3.")
        return

    sampler.save_sample(strategy(), output_path)
# Run the interactive sampler only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
main()