Libraries and helper functions

import math as m
import random
from typing import List, Callable

import pandas as pd
import altair as alt
Vector = List[float]
Vector
typing.List[float]
def add(vector1: Vector, vector2: Vector) -> Vector:
    assert len(vector1) == len(vector2)
    return [v1 + v2 for v1, v2 in zip(vector1, vector2)]
def scalar_multiply(c: float, vector: Vector) -> Vector:
    return [c * v for v in vector]
def dot(vector1: Vector, vector2: Vector) -> float:
    assert len(vector1) == len(vector2)

    return sum(v1 * v2 for v1, v2 in zip(vector1, vector2))
def sum_of_squares(v: Vector) -> float:
    return dot(v, v)
def magnitude(v: Vector) -> float:
    return m.sqrt(sum_of_squares(v))
def subtract(vector1: Vector, vector2: Vector) -> Vector:
    assert len(vector1) == len(vector2)
    return [v1 - v2 for v1, v2 in zip(vector1, vector2)]
def distance(vector1: Vector, vector2: Vector) -> float:
    return magnitude(subtract(vector1, vector2))
def vector_sum(vectors: List[Vector]) -> Vector:
    assert vectors
    
    vector_length = len(vectors[0])
    assert all(len(v) == vector_length for v in vectors)

    sums = [0] * vector_length
    for vector in vectors:
        sums = add(sums, vector)

    return sums
def vector_mean(vectors: List[Vector]) -> Vector:
    n = len(vectors)
    return scalar_multiply(1/n, vector_sum(vectors))
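
A quick sanity check of the helpers; the expected values follow directly from the definitions above:

add([1, 2], [3, 4]), dot([1, 2], [3, 4]), vector_mean([[1, 2], [3, 4]])
# expected: ([4, 6], 11, [2.0, 3.0])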

Estimate gradient

def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector, i: int, h: float) -> float:
    """Estimate the i-th partial derivative of f at v via a forward difference."""
    w = [v_j + (h if j == i else 0) for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h
def estimate_gradient(
    f: Callable[[Vector], float],
    v: Vector,
    h: float = 0.0001
) -> Vector:
    return [
        partial_difference_quotient(f, v, i, h)
        for i in range(len(v))
    ]
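
For sum_of_squares the exact gradient is 2 * v, and each forward-difference quotient works out to exactly 2 * v[i] + h, so the estimate should land within h of the true gradient, a quick check:

estimate_gradient(sum_of_squares, [1.0, 2.0, 3.0])
# expected: approximately [2, 4, 6]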
v = [random.uniform(-10, 10) for i in range(3)]
v
[-9.684378319623278, 4.813838863175313, 3.1311841279856303]

Adjusting with gradients

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Return v moved `step_size` along `gradient`.
    Pass a negative step_size to descend."""
    step = scalar_multiply(step_size, gradient)
    return add(v, step)

v, gradient_step(v, [1, 1, 1], step_size=0.1)
([-9.684378319623278, 4.813838863175313, 3.1311841279856303],
 [-9.584378319623278, 4.913838863175313, 3.2311841279856304])
def sum_of_squares_gradient(v: Vector) -> Vector:
    return [2 * v_i for v_i in v]

sum_of_squares_gradient(v)
[-19.368756639246556, 9.627677726350626, 6.262368255971261]
rows = []

for epoch in range(1000):
    grad = sum_of_squares_gradient(v)
    v = gradient_step(v, grad, -0.01)  # step against the gradient
    rows.append({'x': v[0], 'y': v[1], 'z': v[2]})

steps = pd.DataFrame(rows)

steps
x y z
0 4.718042e+00 -7.666161e+00 -5.877492e-01
1 4.623681e+00 -7.512838e+00 -5.759942e-01
2 4.531207e+00 -7.362581e+00 -5.644743e-01
3 4.440583e+00 -7.215329e+00 -5.531848e-01
4 4.351771e+00 -7.071023e+00 -5.421211e-01
... ... ... ...
995 8.784299e-09 -1.427326e-08 -1.094302e-09
996 8.608613e-09 -1.398780e-08 -1.072416e-09
997 8.436440e-09 -1.370804e-08 -1.050968e-09
998 8.267712e-09 -1.343388e-08 -1.029949e-09
999 8.102357e-09 -1.316520e-08 -1.009350e-09

1000 rows × 3 columns

alt.Chart(steps.reset_index().melt(id_vars='index')).mark_point().encode(
    alt.X('index:Q'), alt.Y('value:Q'), alt.Color('variable:N')
).properties(title='Gradient steps')

Changing the step size

inputs = [(x, 20 * x + 5) for x in range(-50, 50)]
inputs[:20]
[(-50, -995),
 (-49, -975),
 (-48, -955),
 (-47, -935),
 (-46, -915),
 (-45, -895),
 (-44, -875),
 (-43, -855),
 (-42, -835),
 (-41, -815),
 (-40, -795),
 (-39, -775),
 (-38, -755),
 (-37, -735),
 (-36, -715),
 (-35, -695),
 (-34, -675),
 (-33, -655),
 (-32, -635),
 (-31, -615)]

We fit the line by gradient descent on the squared error. With error = (slope * x + intercept) - y, the squared error is error ** 2, and its partial derivatives are 2 * error * x with respect to the slope and 2 * error with respect to the intercept; linear_gradient returns exactly this pair.

def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    slope, intercept = theta
    predicted = slope * x + intercept
    error = predicted - y  # squared error is error ** 2
    return [2 * error * x, 2 * error]

x, y = inputs[0][0], inputs[0][1]
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

x, y, theta, linear_gradient(x, y, theta)    
(-50,
 -995,
 [-0.2222082595361492, -0.7215025177939673],
 [-100538.89104590134, 2010.777820918027])
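
As a cross-check, the analytic gradient should closely agree with the numerical estimate from estimate_gradient; the lambda and theta_check below are illustrative, expressing the squared error at fixed x and y as a function of theta:

x0, y0 = inputs[0]
theta_check = [0.5, -0.5]  # arbitrary test point
linear_gradient(x0, y0, theta_check), estimate_gradient(
    lambda t: (t[0] * x0 + t[1] - y0) ** 2, theta_check
)
# the two gradient vectors should match up to small numerical error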
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(20):
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)
0 [33.27214426853234, 0.961944418352982]
1 [11.143441717307283, 0.9832926737848086]
2 [25.903307667229832, 0.9824695301545462]
3 [16.05847625548786, 0.996407898761467]
4 [22.624992745488356, 1.0004735592194318]
5 [18.245130312318487, 1.0110976048464813]
6 [21.16650917928842, 1.0173205399491068]
7 [19.217955697954576, 1.026452408048497]
8 [20.517650001872347, 1.0336174589303544]
9 [19.650761066210077, 1.042067874014366]
10 [20.228984436711894, 1.0496344993325475]
11 [19.8433170152125, 1.0577642147705943]
12 [20.100565315068035, 1.0654920033562656]
13 [19.928988426852978, 1.0734615846646212]
14 [20.04343818087373, 1.081243649922145]
15 [19.967107977007146, 1.0891246008031743]
16 [20.018028103937038, 1.096913459578575]
17 [19.984072168133576, 1.104737660763355]
18 [20.00672860151567, 1.1125122576099618]
19 [19.991624535046657, 1.1202939616962575]
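
After twenty epochs the slope is already close to 20, but the intercept is still far from 5. Since the inputs were generated from y = 20 * x + 5, theta should approach [20, 5] given more epochs; a sketch of continuing the run:

for epoch in range(5000):
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    theta = gradient_step(theta, grad, -learning_rate)

theta
# expected: close to [20, 5]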