Fixed get_stratified_sample() and replace broken x() with actual working logic, added sample_with_keywords().
This commit is contained in:
@@ -166,8 +166,8 @@ def preprocess_uber_reviews(input_path, output_path):
|
|||||||
return df_clean
|
return df_clean
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_file = "data/uber_reviews.csv"
|
input_file = "multitag/data/uber_reviews.csv"
|
||||||
output_file = "data/uber_reviews_cleaned.csv"
|
output_file = "multitag/data/uber_reviews_cleaned.csv"
|
||||||
|
|
||||||
df_clean = preprocess_uber_reviews(input_file, output_file)
|
df_clean = preprocess_uber_reviews(input_file, output_file)
|
||||||
print("\nPreprocessing complete!")
|
print("\nPreprocessing complete!")
|
||||||
|
|||||||
@@ -61,14 +61,22 @@ class Sampler:
|
|||||||
Doesn't factor that the distribution changed greatly after preprocessing
|
Doesn't factor that the distribution changed greatly after preprocessing
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def get_stratified_sample(self) -> pd.Series:
|
def get_stratified_sample(self) -> pd.DataFrame:
|
||||||
stratified_sample = self.data.groupby(self.stratify_column).apply(self.x)
|
stratified_sample = (
|
||||||
|
self.data
|
||||||
|
.reset_index(drop=True)
|
||||||
|
.apply(self.x)
|
||||||
|
.sample(n=self.target_samples, random_state=42)
|
||||||
|
)
|
||||||
return stratified_sample
|
return stratified_sample
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
# x(self): helper function for get_proportional_sample and get_stratified_sample =FIX=
|
||||||
def x(self, ):
|
def x(self, x):
|
||||||
return lambda x: x.sample(n=int(len(x) / self.total * self.target_samples))
|
n = int(len(x) / self.total * self.target_samples)
|
||||||
|
n = max(n,1)
|
||||||
|
return x.sample(n=n, random_state=42)
|
||||||
"""
|
"""
|
||||||
get_proportional_sample()
|
get_proportional_sample()
|
||||||
|
|
||||||
@@ -100,7 +108,7 @@ class Sampler:
|
|||||||
if len(rating_data) < num_samples:
|
if len(rating_data) < num_samples:
|
||||||
print("Missing samples available for rating")
|
print("Missing samples available for rating")
|
||||||
num_samples = len(rating_data)
|
num_samples = len(rating_data)
|
||||||
sample = rating_data.sample(n = num_samples,random_state=33)
|
sample = rating_data.sample(n = num_samples,random_state=42)
|
||||||
samples.append(sample)
|
samples.append(sample)
|
||||||
original_sample = pd.concat(samples, ignore_index=True)
|
original_sample = pd.concat(samples, ignore_index=True)
|
||||||
return original_sample
|
return original_sample
|
||||||
@@ -119,20 +127,62 @@ class Sampler:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def sample_with_keywords():
|
def sample_with_keywords(self):
|
||||||
#TODO add keywords for feature classification
|
#TODO add keywords for feature classification
|
||||||
print(f"\n{"="*50}")
|
print(f"\n{"="*50}")
|
||||||
print("Keyword influenced / rating stratified set")
|
print("Keyword influenced / rating stratified set")
|
||||||
print(f"\n{"="*50}")
|
print(f"\n{"="*50}")
|
||||||
|
|
||||||
bug_keywords = ["crash","crashes", "freeze", "freezes", "error",
|
bug_keywords = ["crash","freeze", "error",
|
||||||
"stops", "doesnt work", "doesn't work","loading",
|
"stop", "doesnt work", "doesn't work","loading",
|
||||||
"blank", "stuck", "load", "loads", "broken", "breaks",
|
"blank", "stuck", "load", "broken", "break",
|
||||||
"glitch", "glitches", "issue", "could you", "fix",
|
"glitch", "issue", "fix", "needs","please repair",
|
||||||
"failed"]
|
"failed", "responding"
|
||||||
|
]
|
||||||
|
feature_keywords = ["need","should","add","wish","would","benefit",
|
||||||
|
"please add","should have", "want", "missing",
|
||||||
|
"require", "suggestion", "request", "could you",
|
||||||
|
"include", "hope", "why not", "greatly", "option",
|
||||||
|
"new","system"
|
||||||
|
]
|
||||||
|
self.data['likely_bug'] = self.data['review'].apply(
|
||||||
|
lambda x:any(keyword in str(x).lower() for keyword in bug_keywords)
|
||||||
|
)
|
||||||
|
self.data['likely_feature'] = self.data['review'].apply(
|
||||||
|
lambda x: any (keyword in str(x).lower() for keyword in feature_keywords)
|
||||||
|
)
|
||||||
|
print(f"Reviews with bug_keywords = {self.data['likely_bug'].sum():,}")
|
||||||
|
print(f"Reviews with feature_keywords = {self.data['likely_feature'].sum():,}")
|
||||||
|
|
||||||
|
print(f"Sampling 2000 reviews balanced (400 per rating)...")
|
||||||
|
base_sample = self.data.groupby(self.stratify_column).apply(
|
||||||
|
lambda x: x.sample(n=min(400, len(x)), random_state=42),
|
||||||
|
include_groups = False
|
||||||
|
).reset_index(drop=True)
|
||||||
|
|
||||||
|
print(f"Sampling 1500 possible bug reports...")
|
||||||
|
bugs = self.data[self.data['likely_bug'] & ~self.data.index.isin(base_sample.index)]
|
||||||
|
bug_sample = bugs.sample(n=min(1500, len(bugs)), random_state=42)
|
||||||
|
|
||||||
|
print(f"Sampling 1500 possible feature requests...")
|
||||||
|
features = self.data[
|
||||||
|
self.data['likely_feature'] &
|
||||||
|
~self.data.index.isin(base_sample.index) &
|
||||||
|
~self.data.index.isin(bug_sample.index)
|
||||||
|
]
|
||||||
|
feature_sample = features.sample(n=min(1500, len(features)), random_state=42)
|
||||||
|
|
||||||
|
# Combine all samples
|
||||||
|
keyword_sample = pd.concat([base_sample, bug_sample, feature_sample], ignore_index=True)
|
||||||
|
|
||||||
|
# Drop helper columns
|
||||||
|
keyword_sample = keyword_sample.drop(columns=['likely_bug', 'likely_feature'])
|
||||||
|
|
||||||
|
print(f"\n Total samples: {len(keyword_sample):,}")
|
||||||
|
return keyword_sample
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
def save_sample(self, sample_df,output_path):
|
def save_sample(self, sample_df,output_path):
|
||||||
"""Save sample and display statistics"""
|
"""Save sample and display statistics"""
|
||||||
@@ -172,7 +222,7 @@ def main():
|
|||||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
||||||
|
|
||||||
elif choice == '3':
|
elif choice == '3':
|
||||||
sample = sampler.get_keyword_boosted_sample()
|
sample = sampler.sample_with_keywords()
|
||||||
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
sampler.save_sample(sample, "multitag/data/uber_reviews_sampled.csv")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user