Here I am playing with the Central Limit Theorem.

I am running bernoulli trials to generate binomial distributions.

I was interested in how the results approximate the normal distribution, so I labelled their status at particular iteration rounds and plotted the result.

The code is based upon the respective example from Data Science from Scratch.

Libraries

import random
import pandas as pd
import numpy as np
import altair as alt
import math as m
from collections import Counter
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')

Parameters

p = .35
binomial_trials = 10000
total_rounds = 5000
step = 100

Bernoulli Trials

def bernoulli_trial(p: float) -> int:
    return 1 if random.random() < p else 0

Let's run the trials many times.

trials =  []
for _ in range(binomial_trials):
    trials.append(bernoulli_trial(p))

print("First thousand trials:")
print(" ".join([str(t) for t in trials[:1000]]))
print(f"\nOnes: {sum([t == 1 for t in trials])}, Zeros: {sum([t == 0 for t in trials])}")
print("\nMean value of 'success' (ones):", np.mean(trials))
First thousand trials:
0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1

Ones: 3594, Zeros: 6406

Mean value of &#39;success&#39; (ones): 0.3594
def binomial(n: int, p: float) -> int:
    return sum(bernoulli_trial(p) for _ in range(n))

trials = binomial(binomial_trials, p)
trials
3441

Iterating the binomial

def iterate_binomial(n, p, num_points):
    return [binomial(n, p) for _ in range(num_points)]

simulations = iterate_binomial(binomial_trials, p, total_rounds)

print("First hunded values:")
print(" ".join([str(results) for results in simulations[:100]]))
print("\nMean:", np.mean(simulations), "Standard deviation:", np.std(simulations))
First hunded values:
3483 3596 3442 3470 3479 3570 3521 3512 3470 3499 3446 3495 3530 3527 3485 3537 3445 3538 3478 3542 3510 3444 3478 3384 3452 3566 3538 3548 3485 3425 3547 3456 3411 3515 3466 3454 3428 3485 3514 3524 3411 3530 3559 3401 3459 3513 3580 3533 3527 3502 3476 3465 3478 3475 3493 3537 3558 3499 3511 3496 3572 3555 3480 3541 3519 3477 3460 3496 3501 3524 3489 3499 3537 3527 3557 3508 3503 3530 3489 3497 3519 3509 3502 3484 3499 3488 3537 3567 3395 3469 3537 3440 3542 3493 3385 3539 3490 3504 3398 3555

Mean: 3500.871 Standard deviation: 47.232848304966744

As we are interested in the intermediate states, we add new rounds iteratively and labeling them in the way.

def add_iterations(probability, trials, total_rounds, step):
    iteration_results = pd.DataFrame()

    for round in range(step, total_rounds + 1, step):
        simulations = iterate_binomial(trials, probability, round)

        counts = Counter(simulations)
        iteration_results = pd.concat(
            [
                iteration_results,
                pd.DataFrame({'count': counts.values(), 'value': counts.keys(), 'iterations': round})
            ]
        )

    return iteration_results


iteration_results = add_iterations(p, binomial_trials, total_rounds, step)
iteration_results
count value iterations
0 1 3447 100
1 1 3457 100
2 1 3530 100
3 1 3496 100
4 1 3551 100
... ... ... ...
265 2 3410 5000
266 1 3663 5000
267 1 3372 5000
268 1 3644 5000
269 1 3622 5000

11949 rows × 3 columns

As we are interested in the total counts at each step we need to cumulatively add the value counts together over the iteration rounds.

def sum_counts(value_df):
    return value_df.sort_values('iterations')['count'].cumsum()

iteration_results = iteration_results.sort_values(['value', 'iterations'])

iteration_results['cumulative count'] = iteration_results.groupby('value').apply(sum_counts).values

iteration_results
count value iterations cumulative count
233 1 3285 4400 1
200 1 3286 4700 1
161 1 3288 1900 1
244 1 3313 2200 1
156 1 3314 3200 1
... ... ... ... ...
130 1 3691 4700 1
240 1 3692 4100 1
144 1 3695 5000 1
155 1 3700 2400 1
271 1 3704 4500 1

11949 rows × 4 columns

Normal distributions

We also generate a normal distribution for reference

def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    return (1 + m.erf((x - mu) / m.sqrt(2) / sigma)) /2

def calc_normal_pdf(x: float, mu: float = 0, sigma: float=1) -> float:
    return m.exp(-(x-mu)**2 / (2 * sigma **2)) * 1/(m.sqrt(2 * m.pi) * sigma)
n = binomial_trials

mu = p * n
sigma = m.sqrt(n * p * (1 - p))

data = iteration_results.loc[iteration_results['iterations'] == iteration_results['iterations'].max(), 'value']

xs = range(min(data), max(data) + 1)
ys = [calc_normal_cdf(i + .5, mu, sigma) - calc_normal_cdf(i - .5, mu, sigma) for i in xs]

normal_dist = pd.DataFrame({'x': xs, 'y':ys})
normal_dist
x y
0 3347 0.000049
1 3348 0.000052
2 3349 0.000056
3 3350 0.000060
4 3351 0.000064
... ... ...
344 3691 0.000003
345 3692 0.000003
346 3693 0.000002
347 3694 0.000002
348 3695 0.000002

349 rows × 2 columns

Visualization

With the slider you can see the intermediate results at each iteration.

label = alt.selection_single(
    encodings=['x'], on='mouseover', nearest=True, empty='none'
)


iteration_bounds = alt.binding_range(min=iteration_results['iterations'].min(), max=iteration_results['iterations'].max(), step=step)
iteration_slider = alt.selection_single(bind=iteration_bounds, fields=['iterations'], name='Iteration', init={'iterations': iteration_results['iterations'].median()})

bar_chart = alt.Chart(iteration_results).mark_bar().encode(
    alt.X('value:Q', scale=alt.Scale(domain=(iteration_results['value'].min(), iteration_results['value'].max()))),
    alt.Y('cumulative count:Q', scale=alt.Scale(domain=(iteration_results['cumulative count'].min(), iteration_results['cumulative count'].max()))),
    opacity=alt.value(0.75)
)


line_chart = alt.Chart(normal_dist).mark_line().encode(
    alt.X('x'), alt.Y('y',),color=alt.value('darkred')
)

bar_chart = alt.layer(
    bar_chart.add_selection(iteration_slider).transform_filter(iteration_slider),
    bar_chart.mark_circle().encode(opacity=alt.condition(label, alt.value(1), alt.value(0))).add_selection(label).transform_filter(iteration_slider),
    bar_chart.mark_text(dx=5, dy=-5, align='left', strokeWidth=.5, stroke='black').encode(text=alt.Text('cumulative count:Q')).transform_filter(label).transform_filter(iteration_slider)
)

alt.layer(
    bar_chart,
    line_chart,    
).resolve_scale(y='independent').properties(width=800)