Starting with some notes:
References: Stats Theory Theory Code
# PREREQUISITES
import seaborn as sns
import numpy as np
import pandas as pd
%matplotlib inline
# GET DATA
# Create an empty DataFrame first so the name `df` is bound before the loader script runs.
df = pd.DataFrame()
# Get the global dataset, already cleansed and incl. features.
# NOTE(review): presumably the script below rebinds `df` to the loaded dataset in the
# notebook namespace (IPython `%run` magic) - confirm against returnCleansedDatasetGlobal.py.
%run 'returnCleansedDatasetGlobal.py'
data = df.copy() # safety copy of the dataset as it is right after loading
# Optional filter, currently disabled: look only at "smaller" deals, which make up the
# biggest bunch of our data.
# df = df.where(df['average_spending'] < 200000)
# First look at how average spending is distributed for consulting customers
# vs. non-consulting customers.
sns.displot(data=df, x="average_spending", hue="consulting_customer", bins=40)

# Define A/B groups: consulting customers form group "A", everyone else group "B".
ab = df.copy()
consulting_mask = ab["consulting_customer"] == True  # elementwise comparison, kept as-is
ab["group"] = np.where(consulting_mask, "A", "B")
ab.head()
# A/B Testing Function - Quick Solution
def AB_Test(dataframe, group, target, alpha=0.05):
    """Test whether ``target`` differs between the "A" and "B" groups.

    The statistical test is chosen from the data itself:

    1. Shapiro-Wilk checks each group for normality.
    2. If normality is rejected for neither group, Levene's test checks the
       homogeneity of variances and an independent two-sample t-test is run
       (``equal_var=True`` when variances look homogeneous, ``equal_var=False``
       otherwise).
    3. If normality is rejected for at least one group, the non-parametric
       two-sided Mann-Whitney U test is run instead.

    Every decision compares a p-value against ``alpha``. The tested
    hypotheses (H0: A == B, H1: A != B) are printed before returning.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing both the ``group`` and the ``target`` column.
    group : str
        Name of the column whose values "A" and "B" mark the two groups.
        Rows carrying any other label are ignored.
    target : str
        Name of the numeric column to compare between the groups.
    alpha : float, default 0.05
        Significance level used for the normality, variance and A/B tests.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns "Test Type", "Homogeneity" (parametric
        case only), "AB Hypothesis", "p-value" and "Comment".

    Raises
    ------
    ValueError
        If either group has fewer than three non-missing ``target`` values,
        which is too few for the Shapiro-Wilk normality check.
    """
    # Packages (imported locally so the function does not rely on file-level imports)
    import scipy.stats as stats

    # Split A/B. Missing values are dropped: they would otherwise turn the
    # p-values into NaN, which compares as "not significant" and would be
    # reported as "Fail to Reject H0" without any warning.
    groupA = dataframe.loc[dataframe[group] == "A", target].dropna()
    groupB = dataframe.loc[dataframe[group] == "B", target].dropna()

    if len(groupA) < 3 or len(groupB) < 3:
        raise ValueError(
            f"AB_Test needs at least 3 non-missing '{target}' values per group "
            f"(A: {len(groupA)}, B: {len(groupB)})."
        )

    # Assumption: Normality
    # H0: Distribution is Normal!     -> rejected flag is False
    # H1: Distribution is not Normal! -> rejected flag is True
    # NOTE: SciPy documents that Shapiro-Wilk p-values may be inaccurate for N > 5000.
    normality_rejected_a = bool(stats.shapiro(groupA)[1] < alpha)
    normality_rejected_b = bool(stats.shapiro(groupB)[1] < alpha)
    parametric = not normality_rejected_a and not normality_rejected_b

    equal_variances = None  # only meaningful on the parametric path
    if parametric:
        # Assumption: Homogeneity of variances
        # H0: variances are homogeneous; H1: variances are heterogeneous
        equal_variances = not bool(stats.levene(groupA, groupB)[1] < alpha)
        # H0: M1 == M2, H1: M1 != M2
        p_value = stats.ttest_ind(groupA, groupB, equal_var=equal_variances)[1]
    else:
        # Non-parametric test. The alternative is stated explicitly so that the
        # p-value is two-sided (matching H1: A != B) on every SciPy version.
        p_value = stats.mannwhitneyu(groupA, groupB, alternative="two-sided")[1]

    # Result
    reject_h0 = bool(p_value < alpha)
    summary = {"Test Type": ["Parametric" if parametric else "Non-Parametric"]}
    if parametric:
        summary["Homogeneity"] = ["Yes" if equal_variances else "No"]
    summary["AB Hypothesis"] = ["Reject H0" if reject_h0 else "Fail to Reject H0"]
    summary["p-value"] = [p_value]
    summary["Comment"] = [
        "A/B groups are not similar!" if reject_h0 else "A/B groups are similar!"
    ]
    temp = pd.DataFrame(summary)  # dict order fixes the column order

    # Print Hypothesis
    print("# A/B Testing Hypothesis")
    print("H0: A == B")
    print("H1: A != B", "\n")

    return temp
# Run the A/B test on the average spending per customer.
AB_Test(ab, "group", "average_spending")
# Run the A/B test on the number of solutions per customer.
AB_Test(ab, "group", "number_of_solutions")