Note: This equation sheet will be provided alongside the midterm exam. Outside copies of this or any other aids are not permitted.
Conditional Gaussian Equations
\begin{align*} \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}, \boldsymbol{\Sigma}) &= \mathcal{N}\left(\begin{bmatrix} \mathbf{x}_a \\ \mathbf{x}_b \end{bmatrix} \middle| \begin{bmatrix} \boldsymbol{\mu}_a \\ \boldsymbol{\mu}_b \end{bmatrix} , \begin{bmatrix} \boldsymbol{\Sigma}_{aa} & \boldsymbol{\Sigma}_{ab} \\ \boldsymbol{\Sigma}_{ba} & \boldsymbol{\Sigma}_{bb} \end{bmatrix}\right)\\ p(\mathbf{x}_a | \mathbf{x}_b) &= \mathcal{N} \left( \mathbf{x}_a | \boldsymbol{\mu}_{a|b}, \boldsymbol{\Sigma}_{a|b} \right) \\ \boldsymbol{\mu}_{a|b} &= \boldsymbol{\mu}_a + \boldsymbol{\Sigma}_{ab}\boldsymbol{\Sigma}_{bb}^{-1}(\mathbf{x}_b - \boldsymbol{\mu}_b) \\ \boldsymbol{\Sigma}_{a|b} &= \boldsymbol{\Sigma}_{aa} - \boldsymbol{\Sigma}_{ab} \boldsymbol{\Sigma}_{bb}^{-1} \boldsymbol{\Sigma}_{ba} \end{align*}
Linear Dynamical Systems
The Kalman filtering and smoothing equations are given by:
\begin{align*} \boldsymbol{\mu}_{t|t-1} &= \mathbf{A} \boldsymbol{\mu}_{t-1|t-1} \\ \boldsymbol{\Sigma}_{t|t-1} &= \mathbf{Q} + \mathbf{A} \boldsymbol{\Sigma}_{t-1|t-1}\mathbf{A}^T \\ \boldsymbol{\mu}_{t|t} &= \boldsymbol{\mu}_{t|t-1} + \mathbf{K}_t (\mathbf{x}_t - \mathbf{C}\boldsymbol{\mu}_{t|t-1}) \\ \boldsymbol{\Sigma}_{t|t} &= \boldsymbol{\Sigma}_{t|t-1} - \mathbf{K}_t \mathbf{C}\boldsymbol{\Sigma}_{t|t-1} \\ \mathbf{K}_t &= \boldsymbol{\Sigma}_{t|t-1} \mathbf{C}^T(\mathbf{C} \boldsymbol{\Sigma}_{t|t-1}\mathbf{C}^T + \mathbf{R})^{-1} \\ \boldsymbol{\mu}_{t|T} &= \boldsymbol{\mu}_{t|t} + \mathbf{F}_t(\boldsymbol{\mu}_{t+1|T} - \boldsymbol{\mu}_{t+1|t}) \\ \boldsymbol{\Sigma}_{t|T} &= \mathbf{F}_t (\boldsymbol{\Sigma}_{t+1|T} - \boldsymbol{\Sigma}_{t+1|t})\mathbf{F}_t^T + \boldsymbol{\Sigma}_{t|t} \\ \mathbf{F}_t &= \boldsymbol{\Sigma}_{t|t} \mathbf{A}^T \boldsymbol{\Sigma}_{t+1|t}^{-1} \end{align*}
Prediction is given by:
\begin{align*} \boldsymbol{\mu}_{T+k|T} &=\mathbf{A} \boldsymbol{\mu}_{T+k-1|T} \\ \boldsymbol{\Sigma}_{T+k|T} &=\mathbf{A} \boldsymbol{\Sigma}_{T+k-1|T} \mathbf{A}^T + \mathbf{Q} \end{align*}
The expectation-maximization equations are:
\begin{align*} \mathbb{E}_{q_n}[\mathbf{z}_t|\mathbf{x}_{1:T}] &= \boldsymbol{\mu}_{t|T} \\ \mathbb{E}_{q_n}[\mathbf{z}_t\mathbf{z}_t^T|\mathbf{x}_{1:T}] &= \boldsymbol{\Sigma}_{t|T} + \boldsymbol{\mu}_{t|T}\boldsymbol{\mu}_{t|T}^T\\ \mathbb{E}_{q_n}[\mathbf{z}_{t}\mathbf{z}_{t-1}^T|\mathbf{x}_{1:T}] &= \mathbf{F}_{t-1}\boldsymbol{\Sigma}_{t|T} + \boldsymbol{\mu}_{t|T}\boldsymbol{\mu}_{t-1|T}^T \\ \mathbf{A}_n &= \left( \sum_{t=1}^T \mathbb{E} [\mathbf{z}_t \mathbf{z}_{t-1}^T] \right) \left( \sum_{t=1}^T \mathbb{E} [\mathbf{z}_{t-1} \mathbf{z}_{t-1}^T] \right)^{-1} \\ \mathbf{C}_n &= \left( \sum_{t=1}^T \mathbf{x}_t \mathbb{E} [\mathbf{z}_{t}^T] \right) \left( \sum_{t=1}^T \mathbb{E} [\mathbf{z}_{t} \mathbf{z}_{t}^T] \right)^{-1} \\ \mathbf{Q}_n &= \frac{1}{T} \left( \sum_{t=1}^T \mathbb{E} [\mathbf{z}_t \mathbf{z}_{t}^T] - \mathbf{A}_n \mathbb{E} [\mathbf{z}_{t-1} \mathbf{z}_{t}^T] - \mathbb{E} [\mathbf{z}_{t} \mathbf{z}_{t-1}^T] \mathbf{A}_n^T + \mathbf{A}_n \mathbb{E} [\mathbf{z}_{t-1} \mathbf{z}_{t-1}^T]\mathbf{A}_n^T \right) \\ \mathbf{R}_n &= \frac{1}{T} \left( \sum_{t=1}^T \mathbf{x}_t \mathbf{x}_{t}^T - \mathbf{C}_n \mathbb{E} [\mathbf{z}_{t}] \mathbf{x}_{t}^T - \mathbf{x}_{t} \mathbb{E} [\mathbf{z}_{t}^T] \mathbf{C}_n^T + \mathbf{C}_n \mathbb{E} [\mathbf{z}_{t} \mathbf{z}_{t}^T]\mathbf{C}_n^T \right) \end{align*}
Discrete HMMs
For a discrete HMM with \(K\) discrete classes, the transition probability is fully described by the probability of transitioning from state \(k\) to \(k'\):
\begin{align*} p(z_{t+1,k'} = 1|z_{t,k} = 1) &= A_{k,k'} \\ 0 \leq A_{k,k'} &\leq 1 \\ \sum_{k'=1}^K A_{k,k'} &= 1 \end{align*}
The initial latent node \(\mathbf{z}_0\) has a prior that is fully determined by a vector of probabilities \(\boldsymbol{\pi}\):
\[ p(\mathbf{z}_0) = \prod_{k=1}^K \left(\pi_k \right)^{z_{0,k}}. \]
The renormalized alpha and beta are:
\begin{align*} \hat{\alpha}(\mathbf{z}_t) &= p(\mathbf{z}_t|\mathbf{x}_{1:t}) = \frac{\alpha(\mathbf{z}_t)}{p(\mathbf{x}_{1:t})} \\ \hat{\beta}(\mathbf{z}_t) &= \frac{p(\mathbf{x}_{t+1:T}|\mathbf{z}_t)}{p(\mathbf{x}_{t+1:T}|\mathbf{x}_{1:t})} \\ c_t &= p(\mathbf{x}_t|\mathbf{x}_{1:t-1}) \end{align*}
The recursion equations are:
\begin{align*} c_t \hat{\alpha}(\mathbf{z}_t) &= p(\mathbf{x}_t | \mathbf{z}_t) \sum_{\mathbf{z}_{t-1}} \hat{\alpha}(\mathbf{z}_{t-1}) p(\mathbf{z}_{t}|\mathbf{z}_{t-1}) \\ c_{t+1} \hat{\beta}(\mathbf{z}_t) &= \sum_{\mathbf{z}_{t+1}} \hat{\beta}(\mathbf{z}_{t+1}) p(\mathbf{x}_{t+1}|\mathbf{z}_{t+1}) p(\mathbf{z}_{t+1}|\mathbf{z}_{t}) \end{align*}
The EM update equations for discrete HMMs are:
\begin{align*} \pi_{k,n} &= p(z_{0,k} = 1 | \mathbf{x}_{1:T}) \\ A_{k,k',n} &= \frac{\sum_{t=0}^{T-1} p(z_{t,k} = 1, z_{t+1,k'} = 1 | \mathbf{x}_{1:T})}{\sum_{t=0}^{T-1} \sum_{j=1}^K p(z_{t,k} = 1, z_{t+1,j} = 1 | \mathbf{x}_{1:T})} \end{align*}
Extended Kalman Filter
For nonlinear dynamics \(\mathbf{z}_t = f(\mathbf{z}_{t-1}) + \mathbf{w}_t\) and observations \(\mathbf{x}_t = h(\mathbf{z}_t) + \mathbf{v}_t\):
\begin{align*} \boldsymbol{\mu}_{t|t-1} &= f(\boldsymbol{\mu}_{t-1|t-1}) \\ \boldsymbol{\Sigma}_{t|t-1} &= \mathbf{G}_t \boldsymbol{\Sigma}_{t-1|t-1} \mathbf{G}_t^T + \mathbf{Q} \\ \boldsymbol{\mu}_{t|t} &= \boldsymbol{\mu}_{t|t-1} + \mathbf{K}_t(\mathbf{x}_t - h(\boldsymbol{\mu}_{t|t-1})) \\ \boldsymbol{\Sigma}_{t|t} &= \boldsymbol{\Sigma}_{t|t-1} - \mathbf{K}_t \mathbf{H}_t \boldsymbol{\Sigma}_{t|t-1} \\ \mathbf{K}_t &= \boldsymbol{\Sigma}_{t|t-1} \mathbf{H}_t^T (\mathbf{H}_t \boldsymbol{\Sigma}_{t|t-1} \mathbf{H}_t^T + \mathbf{R})^{-1} \end{align*}
where \(\mathbf{G}_t = \nabla f(\boldsymbol{\mu}_{t-1|t-1})\) is the Jacobian of \(f\) evaluated at \(\boldsymbol{\mu}_{t-1|t-1}\), and \(\mathbf{H}_t = \nabla h(\boldsymbol{\mu}_{t|t-1})\) is the Jacobian of \(h\) evaluated at \(\boldsymbol{\mu}_{t|t-1}\).
Gaussian Processes
In parametric models, we infer parameters \(\boldsymbol{\theta}\) and predict via:
\begin{align*} p(\boldsymbol{\theta} \mid \mathbf{X}) &= \frac{p(\mathbf{X} \mid \boldsymbol{\theta}) p(\boldsymbol{\theta})}{p(\mathbf{X})} \\ p(\mathbf{X}_\star \mid \mathbf{X}) &= \int p(\mathbf{X}_\star \mid \boldsymbol{\theta}) p(\boldsymbol{\theta} \mid \mathbf{X}) d\boldsymbol{\theta} \end{align*}
In function-space inference with Gaussian processes (GPs), we marginalize over functions:
\[ p(\mathbf{X}_\star \mid \mathbf{X}) = \int p(\mathbf{X}_\star \mid f) p(f \mid \mathbf{X}) \, df \]
A GP is defined by a mean function \(\mu(\mathbf{x})\) and covariance (kernel) function \(\kappa(\mathbf{x}, \mathbf{x}')\):
\begin{align*} f(\mathbf{x}) &\sim \mathcal{GP}(\mu(\mathbf{x}), \kappa(\mathbf{x}, \mathbf{x}')) \\ p(\mathbf{f}) &= \mathcal{N}(\mathbf{f} | \boldsymbol{\mu}, \mathbf{K}) \end{align*}
where \(K_{ij} = \kappa(\mathbf{x}_i, \mathbf{x}_j)\).
The joint distribution over observed and test points is:
\[ p\left(\begin{bmatrix} \mathbf{f}(\mathbf{X}) \\ \mathbf{f}(\mathbf{X}_\star) \end{bmatrix}\right) = \mathcal{N}\left(\begin{bmatrix} \boldsymbol{\mu}(\mathbf{X}) \\ \boldsymbol{\mu}(\mathbf{X}_\star) \end{bmatrix}, \begin{bmatrix} \mathbf{K} & \mathbf{K}_\star \\ \mathbf{K}_\star^T & \mathbf{K}_{\star\star} \end{bmatrix}\right) \]
Given observations \(\mathbf{y} = \mathbf{f} + \boldsymbol{\epsilon}\) where \(\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \sigma_w^2 \mathbf{I})\), the predictive distribution at \(\mathbf{x}_*\) is:
\begin{align*} p(f_* | \mathbf{x}_*, \mathbf{X}, \mathbf{y}) &= \mathcal{N}(f_* | \mu_*, \sigma_*^2) \\ \mu_* &= \mu(\mathbf{x}_*) + \mathbf{k}_*^T (\mathbf{K} + \sigma_w^2 \mathbf{I})^{-1} (\mathbf{y} - \boldsymbol{\mu}(\mathbf{X})) \\ \sigma_*^2 &= \kappa(\mathbf{x}_*, \mathbf{x}_*) - \mathbf{k}_*^T (\mathbf{K} + \sigma_w^2 \mathbf{I})^{-1} \mathbf{k}_* \end{align*}
where \(\mathbf{k}_* = [\kappa(\mathbf{x}_*, \mathbf{x}_1), \ldots, \kappa(\mathbf{x}_*, \mathbf{x}_n)]^T\), \(\mu(\mathbf{x}_*)\) is the prior mean at \(\mathbf{x}_*\), and \(\boldsymbol{\mu}(\mathbf{X})\) is the vector of prior means at the training points.
Some common kernel functions are:
\begin{align*} \kappa_{\text{SE}}(\mathbf{x}, \mathbf{x}') &= \sigma^2 \exp\left(-\frac{\|\mathbf{x} - \mathbf{x}'\|^2}{2\ell^2}\right) \\ \kappa_{\text{per}}(\mathbf{x}, \mathbf{x}') &= \sigma^2 \exp\left(-\frac{2\sin^2(\pi|\mathbf{x} - \mathbf{x}'|/p)}{\ell^2}\right) \\ \kappa_{\text{linear}}(\mathbf{x}, \mathbf{x}') &= \mathbf{x}^T \mathbf{x}' \\ \kappa_{+}(\mathbf{x}, \mathbf{x}') &= \kappa_1(\mathbf{x}, \mathbf{x}') + \kappa_2(\mathbf{x}, \mathbf{x}') \\ \kappa_{\times}(\mathbf{x}, \mathbf{x}') &= \kappa_1(\mathbf{x}, \mathbf{x}') \times \kappa_2(\mathbf{x}, \mathbf{x}') \end{align*}
A kernel function \(\kappa\) is valid if and only if it can be written as:
\[ \kappa(\mathbf{x}, \mathbf{x}') = \boldsymbol{\phi}(\mathbf{x})^T \boldsymbol{\phi}(\mathbf{x}') \]
for some feature representation \(\boldsymbol{\phi}\) (which may be infinite-dimensional).
A linear model \(y(\mathbf{x}) = \mathbf{w}^T \boldsymbol{\phi}(\mathbf{x})\) with prior \(p(\mathbf{w}) = \mathcal{N}(\mathbf{0}, \sigma_w^2 \mathbf{I})\) induces a GP with kernel:
\[ \kappa(\mathbf{x}_i, \mathbf{x}_j) = \sigma_w^2 \boldsymbol{\phi}(\mathbf{x}_i)^T \boldsymbol{\phi}(\mathbf{x}_j) \]
Algorithms
Expectation-Maximization
Input: Initial estimate \(\boldsymbol{\theta}_0\)
For \(n=1\) to \(N_\text{iter}\):
E-step: \(q_n = \underset{q}{\text{argmin}} \ \mathrm{KL}(q(\mathbf{Z}) \,\|\, p(\mathbf{Z}|\mathbf{X},\boldsymbol{\theta}_{n-1}))\)
M-step: \(\boldsymbol{\theta}_n = \underset{\boldsymbol{\theta}}{\text{argmax}} \ \mathcal{L}(q_n, \boldsymbol{\theta})\)
Output: Parameter estimate \(\boldsymbol{\theta}_{N_\text{iter}}\)
Particle Filtering
Input: \(N\) samples \(\mathbf{z}_{0,i} \sim p(\mathbf{z}_0)\) with uniform weights \(w_{0,i} = 1/N\)
For \(t=1\) to \(T+k\):
- Resample: For \(i=1\) to \(N\):
- Draw \(\mathbf{z}'_{t-1,i} = \mathbf{z}_{t-1,j}\) with probability \(w_{t-1,j}\)
- Propagate: For \(i=1\) to \(N\):
- Sample \(\mathbf{z}_{t,i} = g(\mathbf{z}'_{t-1,i})\)
- Reweight: For \(i=1\) to \(N\):
- If \(t \leq T\): \(w_{t,i} = \frac{p(\mathbf{x}_t|\mathbf{z}_{t,i})}{\sum_{j=1}^N p(\mathbf{x}_t|\mathbf{z}_{t,j})}\)
- Else: \(w_{t,i} = \frac{1}{N}\)
Output: Samples \(\mathbf{z}_{t,i}\) and weights \(w_{t,i}\)