import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# Seed the random number generators so reruns give the same answers.
# The simulations in this notebook draw from NumPy's global generator
# (np.random.*), which is independent of the standard-library `random`
# module, so both have to be seeded.
random.seed(42)
np.random.seed(42)


# Load the raw A/B test records from disk
df = pd.read_csv(filepath_or_buffer='ab_data.csv')

# Preview the first five rows
df.head(n=5)


# The length of a dataframe is its number of rows
len(df)

294478


# Number of distinct users in the raw data
df.user_id.nunique()

290584


# Share of rows with converted == 1 (the column holds 0/1 flags)
df.converted.mean()

0.11965919355605512


# Group and landing page disagree in two situations:
# (1) a 'treatment' user was shown something other than the 'new_page'
# (2) a 'control' user was shown the 'new_page'
treatment_without_new_page = (df['group'] == 'treatment') & (df['landing_page'] != 'new_page')
control_with_new_page = (df['group'] == 'control') & (df['landing_page'] == 'new_page')
len(df[treatment_without_new_page | control_with_new_page])

3893


# Count missing values per column
df.isnull().sum(axis=0)

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64


# Build df2 by removing every row whose group and landing page disagree:
# treatment users not on the new page, and control users on the new page
is_treatment = df['group'] == 'treatment'
is_control = df['group'] == 'control'
saw_new_page = df['landing_page'] == 'new_page'
df2 = df.drop(df.index[(is_treatment & ~saw_new_page) | (is_control & saw_new_page)])


# Sanity check: no remaining row may have "is treatment" differ from "saw new page" - this should be 0
len(df2[(df2['group'] == 'treatment') != (df2['landing_page'] == 'new_page')])

0


# Number of distinct users left after removing mismatched rows
df2.user_id.nunique()

290584


# Report how many user ids repeat, then show the id of the first repeated one
print('The number of duplicated user ids in df2 is {}'.format(df2['user_id'].duplicated().sum()))
print('This user id is {}'.format(df2.loc[df2['user_id'].duplicated(keep='last'), 'user_id'].iloc[0]))

The number of duplicated user ids in df2 is 1
This user id is 773192


# Show every row belonging to a repeated user id (keep=False flags all occurrences)
df2[df2['user_id'].duplicated(keep=False)]


# Drop the oldest row(s) of any repeated user_id, keeping only the most recent one.
# Timestamps are ISO-formatted strings, so sorting them as text orders them
# chronologically; after sorting, duplicated(keep='last') flags every row of a
# user except the latest. Selecting by rule rather than by a hard-coded
# timestamp keeps this cell correct if the input data changes.
is_stale = df2.sort_values('timestamp').duplicated('user_id', keep='last')
df2 = df2.drop(is_stale[is_stale].index)
# Make sure the duplicate has been deleted. This should return 0.
sum(df2['user_id'].duplicated())

0


# Overall conversion rate in the cleaned data, regardless of page
df2['converted'].mean()

0.11959708724499628


# Conversion rate among users in the control group
df2.loc[df2['group'] == 'control', 'converted'].mean()

0.1203863045004612


# Conversion rate among users in the treatment group
df2.loc[df2['group'] == 'treatment', 'converted'].mean()

0.11880806551510564


# Share of page views that were the new page:
# (rows served the new page) / (rows with any landing page recorded)
(df2['landing_page'] == 'new_page').sum() / df2['landing_page'].count()

0.50006194422266881


# Earliest and latest timestamps, i.e. the period the experiment covers
(df2.timestamp.min(), df2.timestamp.max())

('2017-01-02 13:42:05.378582', '2017-01-24 13:41:54.460509')


# Under the null, p_new is taken to be the pooled conversion rate, regardless of page.
p_new = df2.converted.mean()
p_new

0.11959708724499628


# Under the null, p_old is the same pooled conversion rate,
# so it must come out equal to p_new.
p_old = df2.converted.mean()
p_old

0.11959708724499628


# Number of rows in the treatment group
n_new = df2.loc[df2['group'] == 'treatment', 'user_id'].count()
n_new

145310


# Number of rows in the control group
n_old = df2.loc[df2['group'] == 'control', 'user_id'].count()
n_old

145274


# Simulate n_new conversion outcomes (1 = converted) with success probability p_new
new_page_converted = np.random.choice(
    a=[0, 1],
    size=n_new,
    replace=True,
    p=[1 - p_new, p_new],
)


# Simulate n_old conversion outcomes (1 = converted) with success probability p_old
old_page_converted = np.random.choice(
    a=[0, 1],
    size=n_old,
    replace=True,
    p=[1 - p_old, p_old],
)


# Difference in simulated conversion rates: new page minus old page
np.mean(new_page_converted) - np.mean(old_page_converted)

0.0011818303079598469


# Simulate 10,000 p_diff values under the null hypothesis.
# The number of conversions among n independent users who each convert with
# probability p is Binomial(n, p), so each simulated conversion rate can be
# drawn directly as binomial_count / n. This gives the same sampling
# distribution as generating every individual 0/1 outcome in a Python loop,
# without materializing roughly 290,000 draws per iteration.
simulated_new_rates = np.random.binomial(n_new, p_new, size=10000) / n_new
simulated_old_rates = np.random.binomial(n_old, p_old, size=10000) / n_old

# Each entry is (simulated new-page rate) - (simulated old-page rate)
p_diffs = simulated_new_rates - simulated_old_rates


# Make sure the simulated differences are held in a NumPy array
p_diffs = np.array(p_diffs)

# Histogram of the simulated differences (trailing semicolon hides the return value)
plt.hist(x=p_diffs);


# Observed difference in conversion rate: treatment minus control
actual_diff = (
    df2.loc[df2['group'] == 'treatment', 'converted'].mean()
    - df2.loc[df2['group'] == 'control', 'converted'].mean()
)
actual_diff

-0.0015782389853555567


# Fraction of simulated differences that exceed the observed difference
np.mean(p_diffs > actual_diff)

0.90610000000000002


import statsmodels.api as sm

# Conversions and row counts per group, used as inputs to the z-test
convert_old = df2.loc[df2['group'] == 'control', 'converted'].sum()
convert_new = df2.loc[df2['group'] == 'treatment', 'converted'].sum()
n_old = df2.loc[df2['group'] == 'control', 'user_id'].count()
n_new = df2.loc[df2['group'] == 'treatment', 'user_id'].count()

/opt/conda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools


# Print each count with its label, one per line
for template, value in (
    ('# conversions on old page: {}', convert_old),
    ('# conversions on new page: {}', convert_new),
    ('# rows associated with the old page: {}', n_old),
    ('# rows associated with the new page: {}', n_new),
):
    print(template.format(value))

# conversions on old page: 17489
# conversions on new page: 17264
# rows associated with the old page: 145274
# rows associated with the new page: 145310


# One-sided two-proportion z-test; the new page is listed first, so
# alternative='larger' tests whether its conversion rate is the larger one
z_score, p_value = sm.stats.proportions_ztest(
    count=[convert_new, convert_old],
    nobs=[n_new, n_old],
    alternative='larger',
)
print('z-score: {}'.format(z_score))
print('p-value: {}'.format(p_value))

z-score: -1.3109241984234394
p-value: 0.9050583127590245


# Add the intercept column
df2['intercept'] = 1

# Indicator for the treatment group: 1 for 'treatment' rows, 0 otherwise.
# Building it with an explicit comparison avoids relying on the alphabetical
# column order of pd.get_dummies and guarantees an integer column (newer
# pandas versions return boolean dummies, which the Logit model cannot use
# next to the integer intercept).
df2['ab_page'] = (df2['group'] == 'treatment').astype(int)

# Look at the first few rows of our resulting dataframe
df2.head()


# Logistic regression of conversion on an intercept and the treatment indicator
logit_mod = sm.Logit(
    endog=df2['converted'],
    exog=df2[['intercept', 'ab_page']],
)

# Estimate the coefficients
results = logit_mod.fit()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


# Summarize the fitted logistic regression (coefficients, standard errors, z and p-values)
results.summary2()


# Load the table that maps each user_id to a country
df_countries = pd.read_csv(filepath_or_buffer='countries.csv')

# Preview the first five rows
df_countries.head(n=5)


# Attach each user's country to the experiment rows (left join on user_id)
df3 = df2.join(
    other=df_countries.set_index('user_id'),
    on='user_id',
    how='left',
)

# Preview the first five rows
df3.head(n=5)


# List the distinct country values so we know which dummy columns are needed
df3.country.unique()

array(['US', 'CA', 'UK'], dtype=object)


# Add 0/1 indicator columns for UK and US; CA is the baseline and gets no column.
# Explicit comparisons avoid relying on the alphabetical column order of
# pd.get_dummies and guarantee integer columns (newer pandas versions return
# boolean dummies, which the Logit model cannot use next to the integer intercept).
df3['UK'] = (df3['country'] == 'UK').astype(int)
df3['US'] = (df3['country'] == 'US').astype(int)

# Print the first few rows
df3.head()


# Logistic regression of conversion on page and country (CA is the baseline)
logit_mod = sm.Logit(
    endog=df3['converted'],
    exog=df3[['intercept', 'ab_page', 'UK', 'US']],
)

# Estimate the coefficients
results = logit_mod.fit()

# Summarize the fitted model
results.summary2()

Optimization terminated successfully.
         Current function value: 0.366113
         Iterations 6


# Page-by-country interaction terms: 1 only for new-page users in that country
df3['ab_page_UK'] = df3['ab_page'].mul(df3['UK'])
df3['ab_page_US'] = df3['ab_page'].mul(df3['US'])


# Logistic regression of conversion on page, country and their interactions
logit_mod = sm.Logit(
    endog=df3['converted'],
    exog=df3[[
        'intercept',
        'ab_page',
        'UK',
        'US',
        'ab_page_UK',
        'ab_page_US',
    ]],
)

# Estimate the coefficients
results = logit_mod.fit()

# Summarize the fitted model
results.summary2()

Optimization terminated successfully.
         Current function value: 0.366109
         Iterations 6

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2021-05-20 17:53	AIC:	212780.3502
No. Observations:	290584	BIC:	212801.5095
Df Model:	1	Log-Likelihood:	-1.0639e+05
Df Residuals:	290582	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-1.9888	0.0081	-246.6690	0.0000	-2.0046	-1.9730
ab_page	-0.0150	0.0114	-1.3109	0.1899	-0.0374	0.0074

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2021-05-20 17:55	AIC:	212781.1253
No. Observations:	290584	BIC:	212823.4439
Df Model:	3	Log-Likelihood:	-1.0639e+05
Df Residuals:	290580	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-2.0300	0.0266	-76.2488	0.0000	-2.0822	-1.9778
ab_page	-0.0149	0.0114	-1.3069	0.1912	-0.0374	0.0075
UK	0.0506	0.0284	1.7835	0.0745	-0.0050	0.1063
US	0.0408	0.0269	1.5161	0.1295	-0.0119	0.0934

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2021-05-20 17:55	AIC:	212782.6602
No. Observations:	290584	BIC:	212846.1381
Df Model:	5	Log-Likelihood:	-1.0639e+05
Df Residuals:	290578	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

Analyze A/B Test Results

Table of Contents

Introduction

Part I - Probability

Part II - A/B Test

Part III - A regression approach

Conclusion

	user_id	timestamp	group	landing_page	converted
0	851104	2017-01-21 22:11:48.556739	control	old_page	0
1	804228	2017-01-12 08:01:45.159739	control	old_page	0
2	661590	2017-01-11 16:55:06.154213	treatment	new_page	0
3	853541	2017-01-08 18:28:03.143765	treatment	new_page	0
4	864975	2017-01-21 01:52:26.210827	control	old_page	1

	user_id	timestamp	group	landing_page	converted
1899	773192	2017-01-09 05:37:58.781806	treatment	new_page	0
2893	773192	2017-01-14 02:55:59.590927	treatment	new_page	0

	user_id	country
0	834778	UK
1	928468	US
2	822059	UK
3	711597	UK
4	710616	UK

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-2.0040	0.0364	-55.0077	0.0000	-2.0754	-1.9326
ab_page	-0.0674	0.0520	-1.2967	0.1947	-0.1694	0.0345
UK	0.0118	0.0398	0.2957	0.7674	-0.0663	0.0899
US	0.0175	0.0377	0.4652	0.6418	-0.0563	0.0914
ab_page_UK	0.0783	0.0568	1.3783	0.1681	-0.0330	0.1896
ab_page_US	0.0469	0.0538	0.8718	0.3833	-0.0585	0.1523