Copenhagen Denmark
Gradient Descent: The Ultimate Optimizer Erik Meijer
@headinthebox
Gradient Descent: The Ultimate Optimizer Erik Meijer @headinthebox - - PowerPoint PPT Presentation
Gradient Descent: The Ultimate Optimizer Erik Meijer @headinthebox Copenhagen Denmark We all want to write cool apps like this ... Software 1.0 Augustin-Louis Cauchy 1789 -1857 What if we feed the examples/tests to a mathematician
Copenhagen Denmark
@headinthebox
We all want to write cool apps like this ...
Software 1.0
Augustin-Louis Cauchy 1789 -1857
What if we feed the examples/tests to a mathematician
and have it deduce the code for us?
Physicists and Mathematicians have been doing curve fitting and function approximation for centuries — as the saying goes:
Galileo Galilei 1564-1642 Joseph Fourier 1768-1830 Henri Padé 1863-1953
“Everything interesting in CS has already been invented by mathematicians at least 100 years ago.” @headinthebox
Fourier(x) = a₀ + (∑ᵢ aᵢ·cos(iπx/L)) + (∑ᵢ bᵢ·sin(iπx/L))    Padé_N,M(x) = (∑_{i∈0…N} aᵢxⁱ) / (1 + ∑_{i∈1…M} bᵢxⁱ)
We’ll jump on the latest Computer Science bandwagon; Deep Learning, using Artificial Neural Networks!!!!!!!!!!!!!!!!!!
George Cybenko, 1989
Activation function weights/ parameters Linear algebra/ map-reduce
One input, one weight, identity
strong assumption
var a: ℝ = … val η: ℝ = … some tiny value of your choosing … fun model(x: ℝ): ℝ = a*x fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ,ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ,ℝ>>) { samples.foreach{ (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x²-2*x*y a -= η*de/da } }
syntax cheat syntax cheat
var a: ℝ = … val η: ℝ = … some tiny value of your choosing … fun model(x: ℝ): ℝ = a*x fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ,ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ,ℝ>>) { samples.foreach{ (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x²-2*x*y a -= η*de/da } }
Differentiable Programming, I told you so!
f(x) = 3x²+4    f′(x) = 6x
Read ⇦ that again!
High school math review: Sum Rule
The derivative of (u+v) with respect to x
High school math review: Product Rule
The derivative of (u*v*w) with respect to x
High school math review: Chain Rule
The derivative
respect to x
Sum Rule Product Rule Chain Rule
(a+(da/dx)ε) + (c+(dc/dx)ε) ={ dual number } (a+c)+ (da/dx+dc/dx)ε = { sum rule } (a+c)+(d(a+c)/dx)ε (a+(da/dx)ε) * (b+(db/dx)ε) ={ dual number } (a*b)+ (a*(db/dx)+(da/dx)*b)ε = { product rule } (a*b)+(d(a*b)/dx)ε f(a+(da/dx)ε) ={ dual number } f(a)+ (df(a)/da)(da/dx)ε = { chain rule } f(a)+(df(a)/dx)ε
Your high school education was a total waste of time!
class 𝔼(val r: ℝ, val ε: ℝ=1.0) fun sin(x: 𝔼): 𝔼 = 𝔼(r=sin(x.r), ε=cos(x.r)*x.ε) fun cos(x: 𝔼): 𝔼 = 𝔼(r=cos(x.r), ε=-sin(x.r)*x.ε)
𝔼(r=this.r*that.r, ε=this.ε*that.r + this.r*that.ε)
(this.r/that.r).let{ 𝔼(r=it, ε=(this.ε/that.r - it*that.ε/that.r)) }
var a: ℝ = … val η: ℝ = … fun model(x: ℝ): ℝ = a*x fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ,ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ,ℝ>>) { samples.foreach{ (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x²-2*x*y a -= η*de/da } }
var a: 𝔼 = 𝔼(…) val η: ℝ = … def model(x: ℝ): 𝔼 = a*x def loss(y: ℝ, ŷ: 𝔼): 𝔼 = (y-ŷ)² def epoch(samples: List[(ℝ,ℝ)]) { samples.foreach{ case (x,y) ➝ { val e = loss(y,model(x)) val de/da: ℝ =e.ε a -= η*(de/da) } }
class 𝔼(val r: ℝ, val ε: List<ℝ>) fun sin(x: 𝔼): 𝔼 = 𝔼(r=sin(x.r), ε=cos(x.r)*x.ε)
𝔼(r=this.r*that.r, ε=this.ε*that.r + this.r*that.ε)
That’s all that needs to change
Mathematically, by changing numbers to lists, we upgraded from dual numbers to synthetic differential geometry and deep category theory
var κ: ⅅ = ⅅ(1e-20, 0.th) var η: ⅅ = ⅅ(1e-20, 1.th) var a = ⅅ(Math.random(), 2.th) val 𝛿 = 1e-80 fun model(x: ℝ): ⅅ = a*x fun loss(y: ℝ, ŷ: ⅅ): ⅅ = (y-ŷ)² fun epoch(samples: List<Pair<ℝ,ℝ>>) { lateinit var e: ⅅ samples.forEach { (x,y) ➝ val (∂e/∂κ, ∂e/∂η, ∂e/∂a) = loss(y, model(x)) κ -= 𝛿 * ∂e/∂κ η -= κ * ∂e/∂η a -= η * ∂e/∂a }}
Error η κ a
Yes we can!
Choosing the correct hyper parameter is essential Lower is better
How do we pick the meta-step size?
Stack a small number of hyper-...-hyper parameters layers and pick a tiny number for the last fixed one.
https://arxiv.org/pdf/1909.13371.pdf
fun id(x: ⅅ) = ⅅ(r=x.r, ε=1.0*x.ε) var x = ⅅ(0.0, n.th); repeat(n){ x = id(x) }; x.ε ∂id(xn)/∂xn*(…*(∂id(xn)/∂xn*[…, ∂xn/∂xn])…)
(...([] ++ x₁) ++ ...) ++ xₙ is slow O(n²)    x₁++(... ++(xₙ++[])...) is fast O(n) = ((x₁++)∘…∘(xₙ++)) []
Thinking Fast not Slow
Represent lists by functions !%#@&?
(...([] ++ x₁) ++ ...) ++ xₙ is slow O(n²)    x₁++(... ++(xₙ++[])...) is fast O(n)    ∂id(xₙ)/∂xₙ*(…*(∂id(x₁)/∂x₁*[…, ∂x₁/∂x₁])…) is slow O(n²)    (…(∂id(xₙ)/∂xₙ*∂id(xₙ₋₁)/∂xₙ₋₁)*…)*[…, ∂x₁/∂x₁] is fast O(n)
Thinking Fast not Slow
Chain Rule Product Rule 〚f’(x.r)*x.ε〛(c) ={ 〚a〛(b) = b*a } c*(f’(x.r)*x.ε) ={ associativity } (c*f’(x.r))*x.ε ={ 〚a〛(b) = b*a } 〚x.ε〛(c*f’(x.r)) ={ 〚x.ε〛= x.ɞ } x.ɞ(c*f’(x.r)) ={ abstraction } { x.ɞ(it*f’(x.r)) }(c) 〚this.ε*that.r + this.r*that.ε〛(c) ={ commutativity } 〚that.r*this.ε + this.r*that.ε〛(c) ={〚a〛(b) = b*a } c*(that.r*this.ε + this.r*that.ε) ={ distributivity } c*(that.r*this.ε) + c*(this.r*that.ε) ={ associativity } (c*that.r)*this.ε + (c*this.r)*that.ε ={ definition of 〚〛} 〚this.ε〛(c*that.r) + 〚that.ε〛(c*this.r) ={ 〚x.ε〛= x.ɞ } this.ɞ(c*that.r) + that.ɞ(c*this.r) ={ abstraction } { this.ɞ(it*that.r) + that.ɞ(it*this.r) }(c)
class ⅅ(val r: ℝ, val ɞ: (ℝ)->ℝ={it}) /* df(a)/dx = (df(a)/da)*(da/dx) */ fun sin(x: ⅅ): ⅅ = ⅅ(r = sin(x.r), ɞ = { x.ɞ(it*cos(x.r)) }) /* d(a*b)/dx = (da/dx)*b + a*(db/dx) */
ⅅ(r = this.r * that.r, ɞ = { this.ɞ(it*that.r) +that.ɞ(it*this.r) })
repeat(n) { x = x*x }; x.ɞ(1.0)
class 𝔼(val r: ℝ, var ε: ℝ = 0.0, var n: Int = 0, val ɞ: (ℝ)➝ℝ = { it }) { fun ɞ(d: ℝ): ℝ { ε += d if(--n == 0) { return ɞ.invoke(ε) } else { return 0.0 } } } fun 𝔼.backward(d: ℝ = 1.0) { this.n++; this.ɞ(d) } fun sin(x: 𝔼): 𝔼 = 𝔼(r=sin(x.r), ɞ={ x.ɞ(it*cos(x.r)) }).also{ x.n++ }
𝔼(r=this.r*that.r, ɞ={ this.ɞ(it*that.r); that.ɞ(it*this.r) }) .also{ this.n++; that.n++; }
data class 𝔼(val r: ℝ, var ε: ℝ = 0.0, var n: Int = 0, val ɞ: (ℝ)➝Unit={it}) { fun ɞ(d: ℝ) { ε += d; if(--n == 0) { ɞ.invoke(ε) } } } fun sin(x: 𝔼): 𝔼 = 𝔼(r=sin(x.r), ɞ={ x.ɞ(it*cos(x.r)) }).also{ x.n++ }
𝔼(r=this.r*that.r, ɞ={ this.ɞ(it*that.r); that.ɞ(it*this.r) }) .also{ this.n++; that.n++; } val x:𝔼 = …; val y:𝔼 = … val z = e[x,y]; z.backward() val ∂z/∂x = x.ε; val ∂z/∂y = y.ε
typealias Cont = ⅅ.(()->Unit)->Unit class ⅅ(val r: ℝ, var ε: ℝ = 1.0, val ɞ: Cont = { κ ➝ κ() }) fun sin(x: ⅅ): ⅅ = ⅅ(r = sin(x.r), ɞ = { κ ➝ x.ɞ { this.ε = cos(x.r)*x.ε; κ() }})
{ κ ➝ this@times.ɞ { that.ɞ { this.ε = this@times.ε*that.r + this@times.r*that.ε; κ() }}})
class ⅅ(val r: ℝ, var ε: ℝ = 0.0, val ɞ: Cont = { κ ➝ κ() }) fun sin(x: ⅅ): ⅅ = ⅅ(r = sin(x.r), ɞ = { κ ➝ x.ɞ{ κ(); x.ε += this.ε*cos(x.r) }})
κ ➝ this@times.ɞ { that.ɞ { κ(); this@times.ε += this.ε*that.r; that.ε += this.ε*this@times.r }}})
#KotlinConf
Erik Meijer @headinthebox