diff --git a/multitag/preprocess.py b/multitag/preprocess.py index 646378b..dcdffe0 100644 --- a/multitag/preprocess.py +++ b/multitag/preprocess.py @@ -166,8 +166,8 @@ def preprocess_uber_reviews(input_path, output_path): return df_clean if __name__ == "__main__": - input_file = "data/uber_reviews.csv" - output_file = "data/uber_reviews_cleaned.csv" + input_file = "multitag/data/uber_reviews.csv" + output_file = "multitag/data/uber_reviews_cleaned.csv" df_clean = preprocess_uber_reviews(input_file, output_file) print("\nPreprocessing complete!") diff --git a/multitag/sampler.py b/multitag/sampler.py index 25a16ae..cff9ab2 100644 --- a/multitag/sampler.py +++ b/multitag/sampler.py @@ -61,14 +61,22 @@ class Sampler: Doesn't factor that the distribution changed greatly after preprocessing """ - def get_stratified_sample(self) -> pd.Series: - stratified_sample = self.data.groupby(self.stratify_column).apply(self.x) - return stratified_sample + def get_stratified_sample(self) -> pd.DataFrame: + stratified_sample = ( + self.data + .reset_index(drop=True) + .apply(self.x) + .sample(n=self.target_samples, random_state=42) + ) + return stratified_sample + # x(self): helper function for get_proportional_sample and get_stratified_sample =FIX= - def x(self, ): - return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples)) + def x(self, x): + n = int(len(x) / self.total * self.target_samples) + n = max(n,1) + return x.sample(n=n, random_state=42) """ get_proportional_sample() @@ -100,7 +108,7 @@ class Sampler: if len(rating_data) < num_samples: print("Missing samples available for rating") num_samples = len(rating_data) - sample = rating_data.sample(n = num_samples,random_state=33) + sample = rating_data.sample(n = num_samples,random_state=42) samples.append(sample) original_sample = pd.concat(samples, ignore_index=True) return original_sample @@ -119,20 +127,62 @@ class Sampler: """ - def sample_with_keywords(): + def sample_with_keywords(self): #TODO add keywords for feature classification print(f"\n{"="*50}") print("Keyword influenced / rating stratified set") print(f"\n{"="*50}") - bug_keywords = ["crash","crashes", "freeze", "freezes", "error", - "stops", "doesnt work", "doesn't work","loading", - "blank", "stuck", "load", "loads", "broken", "breaks", - "glitch", "glitches", "issue", "could you", "fix", - "failed"] + bug_keywords = ["crash","freeze", "error", + "stop", "doesnt work", "doesn't work","loading", + "blank", "stuck", "load", "broken", "break", + "glitch", "issue", "fix", "needs","please repair", + "failed", "responding" + ] + feature_keywords = ["need","should","add","wish","would","benefit", + "please add","should have", "want", "missing", + "require", "suggestion", "request", "could you", + "include", "hope", "why not", "greatly", "option", + "new","system" + ] + self.data['likely_bug'] = self.data['review'].apply( + lambda x:any(keyword in str(x).lower() for keyword in bug_keywords) + ) + self.data['likely_feature'] = self.data['review'].apply( + lambda x: any (keyword in str(x).lower() for keyword in feature_keywords) + ) + print(f"Reviews with bug_keywords = {self.data['likely_bug'].sum():,}") + print(f"Reviews with feature_keywords = {self.data['likely_feature'].sum():,}") + + print(f"Sampling 2000 reviews balanced (400 per rating)...") + base_sample = self.data.groupby(self.stratify_column).apply( + lambda x: x.sample(n=min(400, len(x)), random_state=42), + include_groups = False + ).reset_index(drop=True) + + print(f"Sampling 1500 possible bug reports...") + bugs = self.data[self.data['likely_bug'] & ~self.data.index.isin(base_sample.index)] + bug_sample = bugs.sample(n=min(1500, len(bugs)), random_state=42) + + print(f"Sampling 1500 possible feature requests...") + features = self.data[ + self.data['likely_feature'] & + ~self.data.index.isin(base_sample.index) & + ~self.data.index.isin(bug_sample.index) + ] + feature_sample = features.sample(n=min(1500, len(features)), random_state=42) + + # Combine all samples + keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True) + + # Drop helper columns + keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature']) + + print(f"\n Total samples: {len(keyword_sample):,}") + return keyword_sample - return + def save_sample(self, sample_df,output_path): """Save sample and display statistics""" @@ -172,7 +222,7 @@ def main(): sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv") elif choice == '3': - sample = sampler.get_keyword_boosted_sample() + sample = sampler.sample_with_keywords() sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")