Principal Component Analysis from scratch - preparations
From the Data Science from Scratch book.
import math as m
import random
import pandas as pd
import numpy as np
import altair as alt
from typing import List
# A vector is represented as a plain list of floats.
Vector = List[float]
def add(vector1: Vector, vector2: Vector) -> Vector:
    """Add two equally long vectors component by component.

    Fails an assertion when the lengths differ.
    """
    assert len(vector1) == len(vector2)
    totals = []
    for left, right in zip(vector1, vector2):
        totals.append(left + right)
    return totals
def subtract(vector1: Vector, vector2: Vector) -> Vector:
    """Subtract ``vector2`` from ``vector1`` component by component.

    Fails an assertion when the lengths differ.
    """
    assert len(vector1) == len(vector2)
    differences = []
    for left, right in zip(vector1, vector2):
        differences.append(left - right)
    return differences
def vector_sum(vectors: List[Vector]) -> Vector:
    """Sum a non-empty collection of equally long vectors component-wise.

    Fails an assertion when ``vectors`` is empty or when the vectors do not
    all share the length of the first one.
    """
    assert vectors
    width = len(vectors[0])
    assert not any(len(candidate) != width for candidate in vectors)

    # Start from the zero vector and fold every input into it.
    running_total = [0] * width
    for candidate in vectors:
        running_total = add(running_total, candidate)
    return running_total
def scalar_multiply(c: float, vector: Vector) -> Vector:
    """Scale every component of ``vector`` by the factor ``c``."""
    scaled = []
    for component in vector:
        scaled.append(c * component)
    return scaled
def vector_mean(vector: Vector) -> float:
    """Return the arithmetic mean of the components of ``vector``.

    The previous body returned every component scaled by ``1 / n`` (a whole
    vector), which contradicted the declared ``float`` return type; the
    components are now summed before dividing.

    Note: this is the mean of a single vector's components. For a
    component-wise mean across several vectors, combine ``vector_sum`` with
    ``scalar_multiply``.

    Raises:
        ZeroDivisionError: if ``vector`` is empty.
    """
    n = len(vector)
    return sum(vector) / n
def dot(vector1: Vector, vector2: Vector) -> float:
    """Compute the dot product of two equally long vectors.

    Fails an assertion when the lengths differ.
    """
    assert len(vector1) == len(vector2)
    products = [left * right for left, right in zip(vector1, vector2)]
    return sum(products)
def sum_of_squares(v: Vector) -> float:
    """Return the sum of the squared components of ``v``.

    Computed as the dot product of ``v`` with itself, so the result is a
    scalar (the return annotation previously claimed a ``Vector``).
    """
    return dot(v, v)
def magnitude(v: Vector) -> float:
    """Return the length of ``v``: the square root of its sum of squares.

    The result is a scalar (the return annotation previously claimed a
    ``Vector``).
    """
    return m.sqrt(sum_of_squares(v))
def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Move ``v`` by ``step_size`` times ``gradient`` and return the result.

    The gradient is scaled first and the scaled step is then added to ``v``
    component by component.
    """
    return add(v, scalar_multiply(step_size, gradient))
# Randomly drawn line parameters; in this section they are only printed.
# NOTE(review): intercept and coefficient never feed into xs or ys, so the
# sample is not generated from this line -- confirm whether that is intended.
intercept = random.randint(-30, 30)
coefficient = random.uniform(-1, 1)

# Sample size, used by both draws so the two columns always have equal length.
n = 30
# The upper bound of np.random.randint is exclusive, hence the "+ 1".
xs = np.random.randint(-50, 10 + 1, n)
ys = np.random.randint(-20, 50 + 1, n)
df = pd.DataFrame({'x': xs, 'y': ys})
print(intercept, coefficient)

# Point chart of the raw sample, with both values shown in the tooltip.
alt.Chart(df).mark_point().encode(
    alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y'])
)
def de_mean(data: List[Vector]) -> List[Vector]:
    """Shift every vector in ``data`` so that its own mean becomes zero.

    Each vector is centred independently: its mean is subtracted from each
    of its components. Inputs are coerced with ``np.asarray`` so that plain
    Python lists work as well as NumPy arrays (list minus scalar would
    otherwise raise ``TypeError``). The inputs are not modified.

    Returns:
        One NumPy array per input vector, in the same order.
    """
    return [np.asarray(vector) - np.mean(vector) for vector in data]
# Centre both coordinate samples on zero by subtracting each sample's mean.
xs_demean, ys_demean = de_mean([xs, ys])
# df is rebound here, replacing the frame built from the raw samples.
df = pd.DataFrame({'x': xs_demean, 'y': ys_demean})
# Point chart of the de-meaned columns, with both values in the tooltip.
# The chart object is not assigned; presumably the notebook renders it as
# the cell output -- confirm if this is run as a plain script.
alt.Chart(df).mark_point().encode(
alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y'])
)
def direction(w: Vector) -> Vector:
    """Rescale ``w`` to unit length by dividing each component by its magnitude."""
    length = magnitude(w)
    unit = []
    for component in w:
        unit.append(component / length)
    return unit
# Rescale the raw x sample to unit length; the result is not stored.
direction(xs)
# Rescale each de-meaned sample to unit length (every component divided by
# the sample's magnitude) before plotting.
xs_dir = direction(xs_demean)
ys_dir = direction(ys_demean)
# df is rebound again, now holding the rescaled samples.
df = pd.DataFrame({'x': xs_dir, 'y': ys_dir})
# Point chart of the rescaled columns, with both values in the tooltip.
alt.Chart(df).mark_point().encode(
alt.X('x:Q'), alt.Y('y:Q'), alt.Tooltip(['x', 'y'])
)