
NeurIPS 2020 | Denoising Diffusion Probabilistic Model #180

Open ChufanSuki opened 1 month ago

ChufanSuki commented 1 month ago

https://arxiv.org/abs/2006.11239

https://github.com/hojonathanho/diffusion

ChufanSuki commented 1 month ago

The essential idea is to gradually corrupt the data with Gaussian noise (the diffusion process) and learn a model that reverses the corruption step by step (the denoising process).

Diffusion

\boldsymbol{x}_t=\alpha_t \boldsymbol{x}_{t-1}+\beta_t \boldsymbol{\varepsilon}_t, \quad \varepsilon_t \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})

where $\alpha_t, \beta_t >0$, $\alpha_t^2+\beta_t^2=1$.

\begin{equation}
\begin{aligned}
\boldsymbol{x}_t & =\alpha_t \boldsymbol{x}_{t-1}+\beta_t \varepsilon_t \\
& =\alpha_t\left(\alpha_{t-1} \boldsymbol{x}_{t-2}+\beta_{t-1} \varepsilon_{t-1}\right)+\beta_t \varepsilon_t \\
& =\cdots \\
& =\left(\alpha_t \cdots \alpha_1\right) \boldsymbol{x}_0+\underbrace{\left(\alpha_t \cdots \alpha_2\right) \beta_1 \varepsilon_1+\left(\alpha_t \cdots \alpha_3\right) \beta_2 \varepsilon_2+\cdots+\alpha_t \beta_{t-1} \varepsilon_{t-1}+\beta_t \varepsilon_t}_{\text {sum of independent Gaussian noise terms}}
\end{aligned}
\end{equation}
\begin{equation}
\boldsymbol{x}_t=\underbrace{\left(\alpha_t \cdots \alpha_1\right)}_{\text {denote } \bar{\alpha}_t} \boldsymbol{x}_0+\underbrace{\sqrt{1-\left(\alpha_t \cdots \alpha_1\right)^2}}_{\text {denote } \bar{\beta}_t} \bar{\varepsilon}_t, \quad \bar{\varepsilon}_t \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})
\end{equation}
\begin{equation}
\begin{aligned}
\boldsymbol{x}_t&=\alpha_t \boldsymbol{x}_{t-1}+\beta_t \varepsilon_t \\
&=\alpha_t\left(\bar{\alpha}_{t-1} \boldsymbol{x}_0+\bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}\right)+\beta_t \varepsilon_t \\
&=\bar{\alpha}_t \boldsymbol{x}_0+\alpha_t \bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}+\beta_t \varepsilon_t
\end{aligned}
\end{equation}
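
The closed-form marginal means we can jump from $x_0$ to any $x_t$ in a single step. A minimal NumPy sketch, assuming an illustrative noise schedule (linearly spaced $\beta_t^2$; this particular schedule is an assumption, not taken from the paper):

```python
import numpy as np

# Illustrative schedule: beta_t^2 linearly spaced, with alpha_t^2 + beta_t^2 = 1 (an assumption).
T = 1000
beta = np.sqrt(np.linspace(1e-4, 0.02, T))   # beta_t
alpha = np.sqrt(1.0 - beta**2)               # alpha_t
alpha_bar = np.cumprod(alpha)                # alpha_bar_t = alpha_t * ... * alpha_1
beta_bar = np.sqrt(1.0 - alpha_bar**2)       # beta_bar_t

def q_sample(x0, t, rng=np.random):
    """Sample x_t ~ q(x_t | x_0) in one shot: x_t = alpha_bar_t x_0 + beta_bar_t eps."""
    eps = rng.standard_normal(x0.shape)
    return alpha_bar[t] * x0 + beta_bar[t] * eps, eps
```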

Denoise

\begin{equation}
L = \Vert x_{t-1} - \mu(x_t)\Vert^2
\end{equation}
\begin{equation}
\mu(x_t) = \frac{1}{\alpha_t}(x_t - \beta_t \epsilon_\theta(x_t, t))
\end{equation}
\begin{equation}
\begin{aligned}
L &= \frac{\beta_t^2}{\alpha_t^2}\Vert \varepsilon_t - \epsilon_\theta(x_t, t) \Vert^2 \\
&=  \frac{\beta_t^2}{\alpha_t^2}\Vert \varepsilon_t - \epsilon_\theta(\bar{\alpha}_t \boldsymbol{x}_0+\alpha_t \bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}+\beta_t \varepsilon_t,t)\Vert^2
\end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
\alpha_t \bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}+\beta_t \varepsilon_t &\Leftrightarrow \bar{\beta}_t \varepsilon \mid \varepsilon \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I}) \\
\text{because } \left(\alpha_t \bar{\beta}_{t-1}\right)^2+\beta_t^2 &= \bar{\beta}_t^2
\end{aligned}
\end{equation}
\beta_t \bar{\varepsilon}_{t-1}-\alpha_t \bar{\beta}_{t-1} \varepsilon_t \Leftrightarrow  \bar{\beta}_t \boldsymbol{\omega} \mid \boldsymbol{\omega} \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I}) 

$\varepsilon$ and $\omega$ are independent standard Gaussians: they are jointly Gaussian, and the two linear combinations above are orthogonal (their covariance is zero).

\begin{equation}
\begin{aligned}
\alpha_t \bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}+\beta_t \varepsilon_t & =\bar{\beta}_t \varepsilon \\
\beta_t \bar{\varepsilon}_{t-1}-\alpha_t \bar{\beta}_{t-1} \varepsilon_t & =\bar{\beta}_t \omega
\end{aligned}
\end{equation}
\begin{equation}
\varepsilon_t=\frac{\left(\beta_t \varepsilon-\alpha_t \bar{\beta}_{t-1} \omega\right) \bar{\beta}_t}{\beta_t^2+\alpha_t^2 \bar{\beta}_{t-1}^2}=\frac{\beta_t \varepsilon-\alpha_t \bar{\beta}_{t-1} \omega}{\bar{\beta}_t}
\end{equation}
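
A quick numerical sanity check of this change of variables, using the same illustrative schedule as in the earlier sketch (the timestep $t=500$ is arbitrary):

```python
import numpy as np

# Same illustrative schedule as above (an assumption, not the paper's exact values).
T = 1000
beta = np.sqrt(np.linspace(1e-4, 0.02, T))
alpha = np.sqrt(1.0 - beta**2)
alpha_bar = np.cumprod(alpha)
beta_bar = np.sqrt(1.0 - alpha_bar**2)

t = 500
a, b = alpha[t] * beta_bar[t - 1], beta[t]              # coefficients of eps_bar_{t-1} and eps_t
assert np.isclose(a**2 + b**2, beta_bar[t]**2)          # (alpha_t beta_bar_{t-1})^2 + beta_t^2 = beta_bar_t^2

rng = np.random.default_rng(0)
eps_bar, eps_t = rng.standard_normal((2, 1_000_000))
eps = (a * eps_bar + b * eps_t) / beta_bar[t]           # combined noise eps
omega = (b * eps_bar - a * eps_t) / beta_bar[t]         # orthogonal combination omega
print(eps.var(), omega.var(), np.corrcoef(eps, omega)[0, 1])    # approx 1, 1, 0
assert np.allclose((b * eps - a * omega) / beta_bar[t], eps_t)  # eps_t recovered as in the formula
```

With this change of variables, the loss can be rewritten as: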
\begin{equation}
\begin{aligned}
& \mathbb{E}_{\bar{\varepsilon}_{t-1}, \varepsilon_t \sim \mathcal{N}(0, I)}\left[\left\|\varepsilon_t-\boldsymbol{\epsilon}_{\boldsymbol{\theta}}\left(\bar{\alpha}_t \boldsymbol{x}_0+\alpha_t \bar{\beta}_{t-1} \bar{\varepsilon}_{t-1}+\beta_t \varepsilon_t, t\right)\right\|^2\right] \\
= & \mathbb{E}_{\boldsymbol{\omega}, \varepsilon \sim \mathcal{N}(0, I)}\left[\left\|\frac{\beta_t \varepsilon-\alpha_t \bar{\beta}_{t-1} \boldsymbol{\omega}}{\bar{\beta}_t}-\boldsymbol{\epsilon}_{\boldsymbol{\theta}}\left(\bar{\alpha}_t \boldsymbol{x}_0+\bar{\beta}_t \varepsilon, t\right)\right\|^2\right]
\end{aligned}
\end{equation}
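
In practice this analysis motivates the standard simplified training step: sample a timestep and noise, form $x_t$ in closed form, and regress the network's prediction directly onto the sampled noise (per-$t$ weights dropped, as in the paper's simplified objective). A minimal PyTorch sketch; `eps_model` is a hypothetical noise-prediction network (e.g. a U-Net) taking $(x_t, t)$:

```python
import torch
import torch.nn.functional as F

def ddpm_loss(eps_model, x0, alpha_bar, beta_bar):
    """Simplified noise-prediction loss.
    x0: (B, ...) data batch; alpha_bar, beta_bar: (T,) tensors in this comment's notation."""
    B, T = x0.shape[0], alpha_bar.shape[0]
    t = torch.randint(0, T, (B,), device=x0.device)               # random timestep per sample
    eps = torch.randn_like(x0)                                    # target noise
    view = (B,) + (1,) * (x0.dim() - 1)                           # broadcast coefficients over data dims
    x_t = alpha_bar[t].view(view) * x0 + beta_bar[t].view(view) * eps   # closed-form q(x_t | x_0)
    return F.mse_loss(eps_model(x_t, t), eps)                     # per-t weight dropped (simplified loss)
```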
ChufanSuki commented 1 month ago

Diffusion Models as Variational Encoder-Decoders

Assume we want to model the data distribution with a decoder $p_\theta(x \vert z)$ and a Gaussian latent $z \sim \mathcal{N}(0, I)$.

Maximum Likelihood

Dataset: $\{x^1,x^2,\dots,x^m\} \sim p_{\text{data}}(x)$


\begin{equation}
\begin{aligned}
\theta^* &= \arg\max_\theta \prod_{i=1}^{m} p_\theta(x^i)\\
&= \arg\max_\theta \sum_{i=1}^{m}\log p_\theta(x^i) \\
&\approx \arg\max_\theta \mathbb{E}_{x\sim p_{\text{data}}}[\log p_\theta(x)] \\
&= \arg\min_\theta \int p_{\text{data}}(\mathbf{x}) \log \frac{p_{\text{data}}(\mathbf{x})}{p_\theta(\mathbf{x})} \mathrm{d} \mathbf{x} \\
&= \arg\min_\theta \mathrm{D}_{\mathrm{KL}}\left(p_{\text{data}} \| p_\theta\right)
\end{aligned}
\end{equation}
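
A toy numerical illustration of this equivalence (entirely illustrative: a 1-D model $p_\theta = \mathcal{N}(\theta, 1)$ fit to Gaussian data), showing that the average log-likelihood and the KL to $p_{\text{data}}$ are extremized by the same $\theta$:

```python
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(loc=2.0, scale=1.0, size=100_000)        # p_data = N(2, 1)

def avg_log_likelihood(theta):
    # log N(x; theta, 1) averaged over the dataset
    return np.mean(-0.5 * (data - theta) ** 2 - 0.5 * np.log(2 * np.pi))

def kl_to_model(theta):
    # D_KL(N(2, 1) || N(theta, 1)) = (2 - theta)^2 / 2 in closed form
    return 0.5 * (2.0 - theta) ** 2

thetas = np.linspace(0.0, 4.0, 401)
best_ll = thetas[np.argmax([avg_log_likelihood(th) for th in thetas])]
best_kl = thetas[np.argmin([kl_to_model(th) for th in thetas])]
print(best_ll, best_kl)   # both approximately 2.0
```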

It is hard to compute the marginal log-likelihood $\log p(x)=\log\int p(x,z)\,dz$ directly.

We introduce an encoder $q(z\vert x)$.

\begin{equation}
\begin{aligned}
\log p(\mathbf{x})&=\log p(\mathbf{x}) \int q(\mathbf{z} \mid \mathbf{x}) \mathrm{d} \mathbf{z} \\
&=\mathbb{E}_{q(\mathbf{z} \mid \mathbf{x})}[\log p(\mathbf{x})] \\
&=\mathbb{E}_{q(\mathbf{z} \mid \mathbf{x})}\left[\log \frac{p(\mathbf{x}, \mathbf{z})}{q(\mathbf{z} \mid \mathbf{x})}\right]+\mathrm{D}_{\mathrm{KL}}(q(\mathbf{z} \mid \mathbf{x}) \| p(\mathbf{z} \mid \mathbf{x}))
\end{aligned}
\end{equation}

Evidence Lower Bound (ELBO): since $\mathrm{D}_{\mathrm{KL}}(q(\mathbf{z} \mid \mathbf{x}) \,\|\, p(\mathbf{z} \mid \mathbf{x})) \geq 0$,

\log p(\mathbf{x}) \geq \mathbb{E}_{q(\mathbf{z} \mid \mathbf{x})}\left[\log \frac{p(\mathbf{x}, \mathbf{z})}{q(\mathbf{z} \mid \mathbf{x})}\right]

In a diffusion model, the latent variables are $x_1,\dots,x_T$.

\begin{equation}
\text{Maximize } \mathbb{E}_{q\left(\mathbf{x}_{1:T} \mid \mathbf{x}_0\right)}\left[\log \frac{p\left(\mathbf{x}_{0:T}\right)}{q\left(\mathbf{x}_{1:T} \mid \mathbf{x}_0\right)}\right]
\end{equation}
q(x_{1:T}\vert x_0) = \prod_{t=1}^T q(x_t\vert x_{t-1})
\begin{equation}
q\left(\mathbf{x}_t \mid \mathbf{x}_{t-1}\right)=\mathcal{N}\left(\mathbf{x}_t ; \sqrt{\alpha_t} \mathbf{x}_{t-1},\left(1-\alpha_t\right) \mathbf{I}\right)
\end{equation}

(Here $\alpha_t$ follows the paper's notation; it corresponds to $\alpha_t^2$ in the first comment.)
p_\theta(x_{0:T})=p(x_T)\prod_{t>0}p_\theta(x_{t-1}\vert x_t)

Remark. The forward model $q(x_{1:T}\vert x_0)$ is fixed. All learnable parameters lie in the reverse model $p_\theta(x_{t-1}\vert x_t)$.

Remark. Both the forward process $q$ and the reverse model $p_\theta$ are Markovian. The reverse-time transitions of $q$ become tractable when conditioned on $x_0$: $q(x_{t-1}\vert x_t, x_0)$ has a closed form.
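
Putting the two factorizations together, generation is ancestral sampling: draw $x_T \sim \mathcal{N}(0, I)$ and apply $p_\theta(x_{t-1} \mid x_t)$ down to $x_0$. A minimal PyTorch sketch using the mean $\mu(x_t)$ from the first comment (that comment's $\alpha_t, \beta_t$ notation) and, purely as an illustrative choice, $\sigma_t = \beta_t$; `eps_model` is the same hypothetical noise-prediction network as above:

```python
import torch

@torch.no_grad()
def p_sample_loop(eps_model, shape, alpha, beta, device="cpu"):
    """Ancestral sampling. alpha, beta: (T,) tensors with alpha_t^2 + beta_t^2 = 1
    (first comment's notation); eps_model(x, t) predicts the noise (hypothetical)."""
    T = alpha.shape[0]
    x = torch.randn(shape, device=device)                          # x_T ~ N(0, I)
    for t in reversed(range(T)):
        t_batch = torch.full((shape[0],), t, device=device, dtype=torch.long)
        mean = (x - beta[t] * eps_model(x, t_batch)) / alpha[t]    # mu(x_t) from the first comment
        noise = torch.randn_like(x) if t > 0 else torch.zeros_like(x)  # no noise at the final step
        x = mean + beta[t] * noise                                 # sigma_t = beta_t (illustrative)
    return x
```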

\begin{equation}
\begin{aligned}
L_\theta(x_0)&=\mathbb{E}_{q\left(\mathbf{x}_{1:T} \mid \mathbf{x}_0\right)}\left[\log \frac{p\left(\mathbf{x}_{0:T}\right)}{q\left(\mathbf{x}_{1:T} \mid \mathbf{x}_0\right)}\right]\\
&=\mathbb{E}_{q\left(\mathbf{x}_{1: T} \mid \mathbf{x}_0\right)}\left[\log \frac{p\left(\mathbf{x}_T\right)}{q\left(\mathbf{x}_T \mid \mathbf{x}_0\right)}+\log p_\theta\left(\mathbf{x}_0 \vert \mathbf{x}_1\right)+\sum_{t>1} \log p_\theta\left(\mathbf{x}_{t-1} \mid \mathbf{x}_t\right)-\sum_{t>1} \log q\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t, \mathbf{x}_0\right)\right] \\
&=\mathbb{E}_{q\left(\mathbf{x}_{1: T} \mid \mathbf{x}_0\right)}\left[\log \frac{p\left(\mathbf{x}_T\right)}{q\left(\mathbf{x}_T \mid \mathbf{x}_0\right)}+\sum_{t>1} \log \frac{p_\theta\left(\mathbf{x}_{t-1} \mid \mathbf{x}_t\right)}{q\left(\mathbf{x}_{t-1} \mid \mathbf{x}_t, \mathbf{x}_0\right)}+\log p_\theta\left(\mathbf{x}_0 \vert \mathbf{x}_1\right)\right] \\
&=\mathbb{E}_{q\left(\mathbf{x}_T \mid \mathbf{x}_0\right)}\left[\log \frac{p\left(\mathbf{x}_T\right)}{q\left(\mathbf{x}_T \vert \mathbf{x}_0\right)}\right]+\sum_{t>1} \mathbb{E}_{q\left(\mathbf{x}_{t-1}, \mathbf{x}_t \vert \mathbf{x}_0\right)}\left[\log \frac{p_\theta\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t\right)}{q\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t, \mathbf{x}_0\right)}\right]+\mathbb{E}_{q\left(\mathbf{x}_1 \mid \mathbf{x}_0\right)}\left[\log p_\theta\left(\mathbf{x}_0 \vert \mathbf{x}_1\right)\right]\\
&=\mathbb{E}_{q(\mathbf{x}_1 \vert \mathbf{x}_0)}\left[\log p_\theta\left(\mathbf{x}_0 \vert\mathbf{x}_1\right)\right]-\sum_{t>1} \mathbb{E}_{q\left(\mathbf{x}_t \vert \mathbf{x}_0\right)}\left[D_{\mathrm{KL}}\left(q\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t, \mathbf{x}_0\right) \Vert p_\theta\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t\right)\right)\right]-D_{\mathrm{KL}}\left(q\left(\mathbf{x}_T \vert \mathbf{x}_0\right) \Vert p\left(\mathbf{x}_T\right)\right)
\end{aligned}
\end{equation}
  1. $-\mathrm{D}_{\mathrm{KL}}\left(q\left(\mathbf{x}_T \vert \mathbf{x}_0\right) \Vert p\left(\mathbf{x}_T\right)\right)$ Prior matching
  2. $\mathbb{E}_{q\left(\mathbf{x}_1 \vert \mathbf{x}_0\right)}\left[\log p_\theta\left(\mathbf{x}_0 \vert \mathbf{x}_1\right)\right]$ Reconstruction
  3. $-\sum_{t>1}^{T} \mathbb{E}_{q\left(\mathbf{x}_t \mid \mathbf{x}_0\right)}\left[\mathrm{D}_{\mathrm{KL}}\left(q\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t, \mathbf{x}_0\right) \Vert p_\theta\left(\mathbf{x}_{t-1} \vert \mathbf{x}_t\right)\right)\right]$ Denoising matching