Fixed get_stratified_sample() and replaced the broken x() helper with working logic; added sample_with_keywords().

This commit is contained in:
charlie-rasberry
2025-11-12 02:05:20 +00:00
parent a178284ffc
commit 2cbdd55243
2 changed files with 66 additions and 16 deletions

View File

@@ -166,8 +166,8 @@ def preprocess_uber_reviews(input_path, output_path):
return df_clean return df_clean
if __name__ == "__main__": if __name__ == "__main__":
input_file = "data/uber_reviews.csv" input_file = "multitag/data/uber_reviews.csv"
output_file = "data/uber_reviews_cleaned.csv" output_file = "multitag/data/uber_reviews_cleaned.csv"
df_clean = preprocess_uber_reviews(input_file, output_file) df_clean = preprocess_uber_reviews(input_file, output_file)
print("\nPreprocessing complete!") print("\nPreprocessing complete!")

View File

@@ -61,14 +61,22 @@ class Sampler:
Doesn't factor that the distribution changed greatly after preprocessing Doesn't factor that the distribution changed greatly after preprocessing
""" """
def get_stratified_sample(self) -> pd.Series: def get_stratified_sample(self) -> pd.DataFrame:
stratified_sample = self.data.groupby(self.stratify_column).apply(self.x) stratified_sample = (
self.data
.reset_index(drop=True)
.apply(self.x)
.sample(n=self.target_samples, random_state=42)
)
return stratified_sample return stratified_sample
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX= # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
def x(self, ): def x(self, x):
return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)) n = int(len(x) / self.total * self.target_samples)
n = max(n,1)
return x.sample(n=n, random_state=42)
""" """
get_proportional_sample() get_proportional_sample()
@@ -100,7 +108,7 @@ class Sampler:
if len(rating_data) < num_samples: if len(rating_data) < num_samples:
print("Missing samples available for rating") print("Missing samples available for rating")
num_samples = len(rating_data) num_samples = len(rating_data)
sample = rating_data.sample(n = num_samples,random_state=33) sample = rating_data.sample(n = num_samples,random_state=42)
samples.append(sample) samples.append(sample)
original_sample = pd.concat(samples, ignore_index=True) original_sample = pd.concat(samples, ignore_index=True)
return original_sample return original_sample
@@ -119,20 +127,62 @@ class Sampler:
""" """
def sample_with_keywords(): def sample_with_keywords(self):
#TODO add keywords for feature classification #TODO add keywords for feature classification
print(f"\n{"="*50}") print(f"\n{"="*50}")
print("Keyword influenced / rating stratified set") print("Keyword influenced / rating stratified set")
print(f"\n{"="*50}") print(f"\n{"="*50}")
bug_keywords = ["crash","crashes", "freeze", "freezes", "error", bug_keywords = ["crash","freeze", "error",
"stops", "doesnt work", "doesn't work","loading", "stop", "doesnt work", "doesn't work","loading",
"blank", "stuck", "load", "loads", "broken", "breaks", "blank", "stuck", "load", "broken", "break",
"glitch", "glitches", "issue", "could you", "fix", "glitch", "issue", "fix", "needs","please repair",
"failed"] "failed", "responding"
]
feature_keywords = ["need","should","add","wish","would","benefit",
"please add","should have", "want", "missing",
"require", "suggestion", "request", "could you",
"include", "hope", "why not", "greatly", "option",
"new","system"
]
self.data['likely_bug'] = self.data['review'].apply(
lambda x:any(keyword in str(x).lower() for keyword in bug_keywords)
)
self.data['likely_feature'] = self.data['review'].apply(
lambda x: any (keyword in str(x).lower() for keyword in feature_keywords)
)
print(f"Reviews with bug_keywords = {self.data['likely_bug'].sum():,}")
print(f"Reviews with feature_keywords = {self.data['likely_feature'].sum():,}")
print(f"Sampling 2000 reviews balanced (400 per rating)...")
base_sample = self.data.groupby(self.stratify_column).apply(
lambda x: x.sample(n=min(400, len(x)), random_state=42),
include_groups = False
).reset_index(drop=True)
print(f"Sampling 1500 possible bug reports...")
bugs = self.data[self.data['likely_bug'] & ~self.data.index.isin(base_sample.index)]
bug_sample = bugs.sample(n=min(1500, len(bugs)), random_state=42)
print(f"Sampling 1500 possible feature requests...")
features = self.data[
self.data['likely_feature'] &
~self.data.index.isin(base_sample.index) &
~self.data.index.isin(bug_sample.index)
]
feature_sample = features.sample(n=min(1500, len(features)), random_state=42)
# Combine all samples
keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)
# Drop helper columns
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
print(f"\n Total samples: {len(keyword_sample):,}")
return keyword_sample
return
def save_sample(self, sample_df,output_path): def save_sample(self, sample_df,output_path):
"""Save sample and display statistics""" """Save sample and display statistics"""
@@ -172,7 +222,7 @@ def main():
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
elif choice == '3': elif choice == '3':
sample = sampler.get_keyword_boosted_sample() sample = sampler.sample_with_keywords()
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")