Central Limit Theorem on the binomial distribution from scratch with Python
Here I am playing with the Central Limit Theorem.
I am running bernoulli trials to generate binomial distributions.
I was interested in how the results approximate the normal distribution, so I labelled their status at particular iteration rounds and plotted the result.
The code is based upon the respective example from Data Science from Scratch.
import random
import pandas as pd
import numpy as np
import altair as alt
import math as m
from collections import Counter
alt.data_transformers.disable_max_rows()
p = .35
binomial_trials = 10000
total_rounds = 5000
step = 100
def bernoulli_trial(p: float) -> int:
return 1 if random.random() < p else 0
Let's run the trials many times.
trials = []
for _ in range(binomial_trials):
trials.append(bernoulli_trial(p))
print("First thousand trials:")
print(" ".join([str(t) for t in trials[:1000]]))
print(f"\nOnes: {sum([t == 1 for t in trials])}, Zeros: {sum([t == 0 for t in trials])}")
print("\nMean value of 'success' (ones):", np.mean(trials))
def binomial(n: int, p: float) -> int:
return sum(bernoulli_trial(p) for _ in range(n))
trials = binomial(binomial_trials, p)
trials
def iterate_binomial(n, p, num_points):
return [binomial(n, p) for _ in range(num_points)]
simulations = iterate_binomial(binomial_trials, p, total_rounds)
print("First hunded values:")
print(" ".join([str(results) for results in simulations[:100]]))
print("\nMean:", np.mean(simulations), "Standard deviation:", np.std(simulations))
As we are interested in the intermediate states, we add new rounds iteratively and labeling them in the way.
def add_iterations(probability, trials, total_rounds, step):
iteration_results = pd.DataFrame()
for round in range(step, total_rounds + 1, step):
simulations = iterate_binomial(trials, probability, round)
counts = Counter(simulations)
iteration_results = pd.concat(
[
iteration_results,
pd.DataFrame({'count': counts.values(), 'value': counts.keys(), 'iterations': round})
]
)
return iteration_results
iteration_results = add_iterations(p, binomial_trials, total_rounds, step)
iteration_results
As we are interested in the total counts at each step we need to cumulatively add the value counts together over the iteration rounds.
def sum_counts(value_df):
return value_df.sort_values('iterations')['count'].cumsum()
iteration_results = iteration_results.sort_values(['value', 'iterations'])
iteration_results['cumulative count'] = iteration_results.groupby('value').apply(sum_counts).values
iteration_results
We also generate a normal distribution for reference
def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) /2
def calc_normal_pdf(x: float, mu: float = 0, sigma: float=1) -> float:
return m.exp(-(x-mu)**2 / (2 * sigma **2)) * 1/(m.sqrt(2 * m.pi) * sigma)
n = binomial_trials
mu = p * n
sigma = m.sqrt(n * p * (1 - p))
data = iteration_results.loc[iteration_results['iterations'] == iteration_results['iterations'].max(), 'value']
xs = range(min(data), max(data) + 1)
ys = [calc_normal_cdf(i + .5, mu, sigma) - calc_normal_cdf(i - .5, mu, sigma) for i in xs]
normal_dist = pd.DataFrame({'x': xs, 'y':ys})
normal_dist
With the slider you can see the intermediate results at each iteration.
label = alt.selection_single(
encodings=['x'], on='mouseover', nearest=True, empty='none'
)
iteration_bounds = alt.binding_range(min=iteration_results['iterations'].min(), max=iteration_results['iterations'].max(), step=step)
iteration_slider = alt.selection_single(bind=iteration_bounds, fields=['iterations'], name='Iteration', init={'iterations': iteration_results['iterations'].median()})
bar_chart = alt.Chart(iteration_results).mark_bar().encode(
alt.X('value:Q', scale=alt.Scale(domain=(iteration_results['value'].min(), iteration_results['value'].max()))),
alt.Y('cumulative count:Q', scale=alt.Scale(domain=(iteration_results['cumulative count'].min(), iteration_results['cumulative count'].max()))),
opacity=alt.value(0.75)
)
line_chart = alt.Chart(normal_dist).mark_line().encode(
alt.X('x'), alt.Y('y',),color=alt.value('darkred')
)
bar_chart = alt.layer(
bar_chart.add_selection(iteration_slider).transform_filter(iteration_slider),
bar_chart.mark_circle().encode(opacity=alt.condition(label, alt.value(1), alt.value(0))).add_selection(label).transform_filter(iteration_slider),
bar_chart.mark_text(dx=5, dy=-5, align='left', strokeWidth=.5, stroke='black').encode(text=alt.Text('cumulative count:Q')).transform_filter(label).transform_filter(iteration_slider)
)
alt.layer(
bar_chart,
line_chart,
).resolve_scale(y='independent').properties(width=800)