Probability distributions in Python 'from scratch'
Here I write up some functions that compute the PDF and CDF of the uniform and normal probability distributions, based on the book Data Science from Scratch.
import math as m
import altair as alt
import numpy as np
import pandas as pd
# Turn off Altair's max-rows check for the data passed to the charts below.
alt.data_transformers.disable_max_rows()
def uniform_pdf(x: float) -> float:
    """Return the density of the uniform distribution on [0, 1) at ``x``.

    The density is 1.0 for ``0 <= x < 1`` and 0.0 everywhere else.

    The result is always a ``float`` so that it agrees with the annotated
    return type (the values compare equal to the ints 1 and 0).
    """
    return 1.0 if 0 <= x < 1 else 0.0
def uniform_cdf(x: float) -> float:
    """Return the cumulative probability of the uniform [0, 1) distribution.

    The value is 0.0 for ``x < 0``, ``x`` itself for ``0 <= x < 1`` and 1.0
    for ``x >= 1``.

    Every branch returns a ``float`` so the result type does not depend on
    which branch was taken or on whether an ``int`` was passed in.
    """
    if x < 0:
        return 0.0
    if x < 1:
        # Inside the support the CDF is the identity; coerce so that an
        # integer 0 still yields a float.
        return float(x)
    return 1.0
Example values
# Tab-separated table of the uniform PDF and CDF at a handful of points:
# below, inside, on the edges of, and above the [0, 1) interval.
# The header string ends in "\n", so a blank line follows the header.
print("x\tpdf\tcdf\n")
for x in (-2, 0, .2, .8, 1, 1.5):
    print(
        f"{x}\t{uniform_pdf(x)}\t{uniform_cdf(x)}"
    )
For plotting we generate both cdf and pdf values in a tidy format.
# Evaluate both uniform functions on an evenly spaced grid from -1 to 2,
# then melt the wide frame into long ("tidy") form: the former column names
# 'pdf' and 'cdf' end up in `variable`, their values in `value`.
x = pd.Series(np.linspace(-1, 2, num=1000))
uniform = pd.melt(
    pd.DataFrame(
        {
            'x': x,
            'pdf': x.apply(uniform_pdf),
            'cdf': x.apply(uniform_cdf),
        }
    ),
    id_vars='x',
)
uniform
# Summary statistics of `value` for each curve, transposed so that each
# curve is a column.
uniform.groupby('variable').describe().loc[:, ('value', slice(None))].T
# Single-point selection driven by mouse-over; it snaps to the nearest
# point along x and selects nothing while empty.
label = alt.selection_single(
    encodings=['x'],
    on='mouseover',
    nearest=True,
    empty='none',
)

# Base line chart, one line per `variable`. It carries no data of its own;
# the data is supplied once to `alt.layer` below.
chart = alt.Chart().mark_line().encode(
    x='x:Q',
    y='value:Q',
    color='variable:N',
)

alt.layer(
    # The curves themselves.
    chart,
    # Points that are fully transparent unless selected; the selection is
    # registered on this layer.
    chart.mark_circle().encode(
        opacity=alt.condition(label, alt.value(1), alt.value(0)),
    ).add_selection(label),
    # Rule drawn at the selected x position.
    alt.Chart().mark_rule(color='darkgray').encode(
        x='x:Q',
    ).transform_filter(label),
    # Text read-out of the selected values, formatted to four decimals.
    chart.mark_text(align='left', dx=5, dy=-5, strokeWidth=0.5).encode(
        text=alt.Text('value:Q', format=',.4f'),
    ).transform_filter(label),
    data=uniform,
).properties(width=600, title='Uniform PDF and CDF')
def calc_normal_pdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Return the density of the normal distribution N(mu, sigma**2) at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution; must be positive.

    Returns:
        ``exp(-z**2 / 2) / (sigma * sqrt(2 * pi))`` with
        ``z = (x - mu) / sigma``.

    Raises:
        ValueError: If ``sigma`` is zero or negative. A non-positive
            standard deviation does not define a density.
    """
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")
    # Standardise first and square with `*` rather than `**`: float `**`
    # raises OverflowError for huge arguments, whereas the product saturates
    # to inf and exp(-inf) is the correct limit 0.0.
    z = (x - mu) / sigma
    return m.exp(-0.5 * z * z) / (sigma * m.sqrt(2 * m.pi))
# Evaluate the normal density on an evenly spaced grid from -5 to 5 for
# three standard deviations sharing the mean 0, then melt to long form so
# that each curve is identified by its label in `variable`.
x = pd.Series(np.linspace(-5, 5, num=1000))
normal_pdf = pd.melt(
    pd.DataFrame(
        {
            'x': x,
            'mu=0, sigma=1': x.apply(calc_normal_pdf),
            'mu=0, sigma=2': x.apply(lambda v: calc_normal_pdf(v, 0, 2)),
            'mu=0, sigma=3': x.apply(lambda v: calc_normal_pdf(v, 0, 3)),
        }
    ),
    id_vars='x',
)
normal_pdf
# Summary statistics of `value` per curve, one column per curve.
normal_pdf.groupby('variable').describe()['value'].T
# Single-point selection driven by mouse-over; it snaps to the nearest
# point along x and selects nothing while empty.
label = alt.selection_single(
    encodings=['x'],
    on='mouseover',
    nearest=True,
    empty='none',
)

# Base line chart, one line per `variable`; data is supplied at layer level.
chart = alt.Chart().mark_line().encode(
    x='x:Q',
    y='value:Q',
    color='variable:N',
)

alt.layer(
    # The density curves.
    chart,
    # Points that are fully transparent unless selected; the selection is
    # registered on this layer.
    chart.mark_circle().encode(
        opacity=alt.condition(label, alt.value(1), alt.value(0)),
    ).add_selection(label),
    # Rule drawn at the selected x position.
    alt.Chart().mark_rule(color='darkgray').encode(
        x='x:Q',
    ).transform_filter(label),
    # Text read-out of the selected values, formatted to six decimals.
    chart.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.Text('value:Q', format=',.6f'),
    ).transform_filter(label),
    data=normal_pdf,
).properties(width=600, title="Normal PDF")
def calc_normal_cdf(x: float, mu: float = 0, sigma: float = 1) -> float:
    """Return P(X <= x) for X distributed as N(mu, sigma**2).

    Args:
        x: Point at which to evaluate the cumulative distribution.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution; must be positive.

    Returns:
        The cumulative probability, a value in [0, 1].

    Raises:
        ValueError: If ``sigma`` is zero or negative. A negative value would
            otherwise silently mirror the curve.
    """
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")
    z = (x - mu) / (sigma * m.sqrt(2))
    # (1 + erf(z)) / 2 and erfc(-z) / 2 are the same quantity, but the erf
    # form cancels to exactly 0.0 far in the lower tail, while erfc keeps
    # the small probabilities there.
    return 0.5 * m.erfc(-z)
# Evaluate the normal CDF on the existing grid `x` for three standard
# deviations sharing the mean 0, then melt to long form so that each curve
# is identified by its label in `variable`.
normal_cdf = pd.DataFrame(
    {
        'x': x,
        'mu=0, sigma=1': x.apply(calc_normal_cdf),
        'mu=0, sigma=2': x.apply(lambda v: calc_normal_cdf(v, 0, 2)),
        'mu=0, sigma=3': x.apply(lambda v: calc_normal_cdf(v, 0, 3)),
    }
).melt(id_vars='x')
normal_cdf
# Summarise each curve separately. Without the groupby the statistics would
# pool the values of all three curves into one column, unlike the
# per-curve summaries produced for the other frames in this file.
normal_cdf.groupby('variable').describe()['value'].T
# Single-point selection driven by mouse-over; it snaps to the nearest
# point along x and selects nothing while empty.
label = alt.selection_single(
    encodings=['x'],
    on='mouseover',
    nearest=True,
    empty='none',
)

# Base line chart, one line per `variable`; data is supplied at layer level.
chart = alt.Chart().mark_line().encode(
    x='x:Q',
    y='value:Q',
    color='variable:N',
)

alt.layer(
    # The cumulative curves.
    chart,
    # Points that are fully transparent unless selected; the selection is
    # registered on this layer.
    chart.mark_circle().encode(
        opacity=alt.condition(label, alt.value(1), alt.value(0)),
    ).add_selection(label),
    # Rule drawn at the selected x position.
    alt.Chart().mark_rule(color='darkgray').encode(
        x='x:Q',
    ).transform_filter(label),
    # Text read-out of the selected values, formatted to six decimals.
    chart.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.Text('value:Q', format=',.6f'),
    ).transform_filter(label),
    data=normal_cdf,
).properties(width=600, title="Normal CDFs")