\documentclass[12pt]{article}
\setlength{\oddsidemargin}{0.2in}
\setlength{\topmargin}{-0.5in}
\setlength{\headheight}{0.0in}
\setlength{\headsep}{0.5in}
\setlength{\textwidth}{6.0in}
\setlength{\textheight}{9.0in}
\newcommand{\params}{\theta}
\newcommand{\ba}{{\bf a}}
\newcommand{\bah}{\hat{\bf a}}
\newcommand{\bs}{{\bf s}}
\newcommand{\bsh}{\hat{\bf s}}
\newcommand{\bI}{{\bf I}}
\newcommand{\bPhi}{{\bf \Phi}}
\newcommand{\bnu}{{\bf \nu}}
\newcommand{\be}{{\bf e}}
\newcommand{\bLambdas}{{\bf \Lambda_s}}
\newcommand{\bLambdaa}{{\bf \Lambda_a}}
\newcommand{\bb}{{\bf b}}
\def\lnot{\overline}
\newcommand{\flipsi}{s_i\leftarrow\lnot{s_i}}
\newcommand{\flipsk}{s_k\leftarrow\lnot{s_k}}
\newcommand{\bH}{{\bf H}}
\newcommand{\bJ}{{\bf J}}
\newcommand{\bK}{{\bf K}}
\newcommand{\bone}{{\bf 1}}
\newcommand{\pd}[1]{\frac{\partial}{\partial #1}}
\begin{document}

\section{Image model}

Our total image model is
\begin{equation}
P(\bI|\params) = \sum_{\bs} P(\bs|\params) \int P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba
\end{equation}
where
\begin{eqnarray}
P(\bI|\ba,\params) & = & \frac{1}{Z_{\lambda_N}} e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} \\
P(\ba|\bs,\params) & = & \frac{1}{Z_{\bLambdaa(\bs)}} e^{-\frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba} \\
P(\bs|\params) & = & \frac{1}{Z_{\bLambdas}} e^{-\frac{1}{2} \bs^T \, \bLambdas\, \bs}
\end{eqnarray}
$\params$ denotes the model parameters: $\bPhi$, $\lambda_N$, $\bLambdaa(\bs)$, and $\bLambdas$. $\bLambdaa(\bs)$ and $\bLambdas$ are (for now) diagonal matrices with elements $\bLambdaa(\bs)_{ii}=\lambda_{a_i}(s_i)$ and $\bLambdas_{ii}=\lambda_{s_i}$. Variables $s_i$ are binary and $a_i$ are analog. Thus, $\lambda_{a_i}(s_i)$ takes on two values, $\lambda_{a_i}(0)$ and $\lambda_{a_i}(1)$, corresponding to the inverse variance of the Gaussian distribution over $a_i$ in the `inactive' and `active' states, respectively.

\section{Inferring an optimal representation}

Our problem is to find good estimates, $\bah$ and $\bsh$, derived from the posterior probability, $P(\ba,\bs|\bI,\params)$. There are at least two possible avenues for doing this. One is to seek a joint maximum of $P(\ba,\bs|\bI,\params)$ over $\ba$ and $\bs$. The other is to seek a maximum of $\int P(\ba,\bs|\bI,\params)\, d\ba$ over $\bs$ and then set $\bah=\arg\max_{\ba} P(\ba|\bI,\bsh)$. The latter has the possible advantage that it will find modes of $P(\ba,\bs|\bI,\params)$ with high volume with respect to $\ba$. We consider both of these routes here, labeled Method~1 and Method~2, respectively.
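To make the generative process concrete, here is a minimal NumPy sketch of ancestral sampling from the model. All sizes and parameter values are hypothetical placeholders; the prior marginal $P(s_i{=}1)=1/(1+e^{\lambda_{s_i}/2})$ follows in closed form from the binary energy $\frac{1}{2}\lambda_{s_i}s_i$.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n_pix, n_coef = 64, 128                      # hypothetical sizes
Phi   = rng.standard_normal((n_pix, n_coef)) # dictionary Phi
lam_N = 100.0                                # noise precision lambda_N
lam_s = 4.0 * np.ones(n_coef)                # prior weights lambda_s_i
lam_a = {0: 100.0, 1: 1.0}                   # lambda_a(0), lambda_a(1)

# P(s): independent binary units, P(s_i=1) = 1/(1 + exp(lam_s_i/2))
s = (rng.random(n_coef) < 1.0 / (1.0 + np.exp(lam_s / 2.0))).astype(int)

# P(a|s): zero-mean Gaussian with per-coefficient precision lambda_a(s_i)
prec = np.where(s == 1, lam_a[1], lam_a[0])
a = rng.standard_normal(n_coef) / np.sqrt(prec)

# P(I|a): dictionary expansion plus Gaussian noise of precision lambda_N
I = Phi @ a + rng.standard_normal(n_pix) / np.sqrt(lam_N)
\end{verbatim}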
\subsection{Method 1: $\max_{\ba,\bs} P(\ba,\bs|\bI,\params)$}

Here we seek a joint maximum of the posterior over $\ba$ and $\bs$:
\begin{eqnarray}
\bah,\bsh & = & \arg\max_{\ba,\bs} P(\ba,\bs|\bI,\params) \\
& = & \arg\max_{\ba,\bs} P(\ba,\bs,\bI|\params) \\
& = & \arg\max_{\ba,\bs} P(\bI|\ba,\params) P(\ba|\bs,\params) P(\bs|\params) \\
& = & \arg\min_{\ba,\bs} E(\ba,\bs)
\end{eqnarray}
where
\begin{eqnarray}
E(\ba,\bs) & = & -\log P(\ba,\bs,\bI|\params) \\
& = & \frac{\lambda_N}{2}|\bI-\bPhi \ba|^2 + \log Z_{\bLambdaa(\bs)} + \frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba + \frac{1}{2} \bs^T \, \bLambdas\, \bs + \mbox{const.} \label{eq:log-posterior-terms}
\end{eqnarray}
Separating out the terms in $E(\ba,\bs)$ that depend on both $\ba$ and $\bs$ from those that depend on $\bs$ alone, we obtain
\begin{eqnarray}
E(\ba,\bs) & = & E_{\ba|\bs}(\ba,\bs) + E_{\bs}(\bs) + \mbox{const.} \\
E_{\ba|\bs}(\ba,\bs) & = & \frac{\lambda_N}{2}|\bI-\bPhi \ba|^2 + \frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba \\
E_{\bs}(\bs) & = & \log Z_{\bLambdaa(\bs)} + \frac{1}{2} \bs^T \, \bLambdas\, \bs \; .
\end{eqnarray}
Our minimization problem then becomes
\begin{eqnarray}
\min_{\ba,\bs} E(\ba,\bs) & = & \min_{\ba,\bs} \left(E_{\ba|\bs}(\ba,\bs) + E_{\bs}(\bs) \right) \\
& = & \min_{\bs} \left( \min_{\ba} E_{\ba|\bs}(\ba,\bs) + E_{\bs}(\bs) \right) \; .
\end{eqnarray}
Let's first consider the inner minimization over $\ba$. For a given $\bs$, the minimum of $E_{\ba|\bs}(\ba,\bs)$ occurs where the gradient vanishes:
\begin{eqnarray}
0 & = & \nabla_{\ba} E_{\ba|\bs}(\ba,\bs) \\
& = & -\lambda_N \bPhi^T \bI + \lambda_N \bPhi^T\bPhi\ba + \bLambdaa(\bs)\ba \\
& = & -\lambda_N \bb + \bH(\bs)\,\ba \; ,
\end{eqnarray}
where $\bb=\bPhi^T\bI$ and $\bH(\bs)=\lambda_N\bPhi^T\bPhi + \bLambdaa(\bs)$. Thus, the coefficients $\bah$ that minimize $E_{\ba|\bs}(\ba,\bs)$ for a given $\bs$ are given by the solution to
\begin{equation}
\bH(\bs)\, \bah = \lambda_N \bb \; .
\end{equation}
The outer minimization over $\bs$ may be accomplished by flipping the $s_i$ with a probability determined by the change in $E(\bah,\bs)$. Denoting the present state as $\bs_0$ and the state with one element $s_i$ flipped as $\bs_1$, and denoting the corresponding coefficients that minimize $E_{\ba|\bs}$ in each of those states as $\bah_0$ and $\bah_1$, respectively, we have
\begin{eqnarray}
P(\flipsi) & = & \frac{P(\bah_1,\bs_1|\bI,\params)}{ P(\bah_0,\bs_0|\bI,\params) + P(\bah_1,\bs_1|\bI,\params)} \\
& = & \frac{1}{1 + \frac{P(\bah_0,\bs_0|\bI,\params)}{ P(\bah_1,\bs_1|\bI,\params)}} \\
& = & \frac{1}{1 + e^{-[E(\bah_0,\bs_0) - E(\bah_1,\bs_1)]}} \\
& = & \frac{1}{1 + e^{\Delta E(\flipsi)}} \; , \label{eq:flip-rule}
\end{eqnarray}
where $\Delta E(\flipsi)$ is the change in $E(\ba,\bs)$ due to flipping a single element $s_i$ and reminimizing over $\ba$. If we wish to minimize $E(\bah,\bs)$, then a temperature should be assigned and progressively lowered as the $s_i$ are flipped. That is,
\begin{equation}
P(\flipsi) = \frac{1}{1 + e^{\Delta E(\flipsi)/T}} \; .
\end{equation}
\vspace{0.1in}

Now we need to work out the details for computing $\Delta E(\flipsi)$ efficiently.
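As a baseline, the following NumPy sketch implements this scheme naively, re-solving $\bH(\bs)\,\bah=\lambda_N\bb$ from scratch at every candidate flip; the closed-form $\Delta E$ and rank-one updates derived below replace that inner solve. The annealing schedule and function names are hypothetical.
\begin{verbatim}
import numpy as np

def energy(I, Phi, a, s, lam_N, lam_a, lam_s):
    # E(a,s) up to an s-independent constant
    prec = np.where(s == 1, lam_a[1], lam_a[0])
    return (0.5 * lam_N * np.sum((I - Phi @ a) ** 2)
            - 0.5 * np.sum(np.log(prec))     # from log Z_{Lambda_a(s)}
            + 0.5 * a @ (prec * a)           # (1/2) a' Lambda_a(s) a
            + 0.5 * np.sum(lam_s * s))       # (1/2) s' Lambda_s s

def solve_a(I, Phi, lam_N, lam_a, s):
    # Inner minimization: solve H(s) a = lam_N b, with b = Phi' I
    prec = np.where(s == 1, lam_a[1], lam_a[0])
    H = lam_N * Phi.T @ Phi + np.diag(prec)
    return np.linalg.solve(H, lam_N * (Phi.T @ I))

def method1_anneal(I, Phi, lam_N, lam_a, lam_s,
                   T=1.0, T_min=0.01, cool=0.9, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    s = np.ones(Phi.shape[1], dtype=int)
    a = solve_a(I, Phi, lam_N, lam_a, s)
    while T > T_min:
        for i in range(len(s)):
            s1 = s.copy(); s1[i] = 1 - s1[i]
            a1 = solve_a(I, Phi, lam_N, lam_a, s1)  # re-minimize over a
            dE = (energy(I, Phi, a1, s1, lam_N, lam_a, lam_s)
                  - energy(I, Phi, a, s, lam_N, lam_a, lam_s))
            if rng.random() < 1.0 / (1.0 + np.exp(dE / T)):  # flip rule
                s, a = s1, a1
        T *= cool                            # lower the temperature
    return a, s
\end{verbatim}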
We have
\begin{equation}
\Delta E(\flipsi) = \Delta \min_{\ba} E_{\ba|\bs}(\flipsi) + \Delta E_{\bs}(\flipsi) \; .
\end{equation}
To figure out $\Delta E_{\bs}(\flipsi)$, let's first expand $E_{\bs}(\bs)$:
\begin{eqnarray}
E_{\bs}(\bs) & = & \log Z_{\bLambdaa(\bs)} + \frac{1}{2} \bs^T \, \bLambdas\, \bs \\
& = & \log\left((2\pi)^\frac{n}{2}|\det\bLambdaa(\bs)|^{-\frac{1}{2}}\right) + \frac{1}{2} \bs^T \, \bLambdas\, \bs \\
& = & \mbox{const.} - \frac{1}{2}\log\det\bLambdaa(\bs) + \frac{1}{2} \bs^T \, \bLambdas\, \bs \\
& = & \mbox{const.} - \frac{1}{2}\sum_i \log \lambda_{a_i}(s_i) + \frac{1}{2} \sum_i \lambda_{s_i} s_i \; ,
\end{eqnarray}
where the last step uses $s_i^2=s_i$ for binary $s_i$. Thus,
\begin{eqnarray}
\Delta E_{\bs}(\flipsi) & = & \frac{1}{2} \left[ \log \lambda_{a_i}(s_i) - \log \lambda_{a_i}(\lnot{s_i}) \right] + \frac{1}{2} \left[ \lnot{s_i} \lambda_{s_i} - s_i \lambda_{s_i} \right] \\
& = & \frac{1}{2} \left[ \log \frac{\lambda_{a_i}(s_i)}{\lambda_{a_i}(\lnot{s_i})} + \lambda_{s_i}(\lnot{s_i}-s_i) \right] \; .
\end{eqnarray}
Now to figure out $\Delta \min_{\ba} E_{\ba|\bs}(\flipsi)$, let's expand $E_{\ba|\bs}(\bah,\bs)$:
\begin{eqnarray}
E_{\ba|\bs}(\bah,\bs) & = & \frac{\lambda_N}{2}|\bI-\bPhi \bah|^2 + \frac{1}{2} \bah^T \, \bLambdaa(\bs)\, \bah \\
& = & \frac{\lambda_N}{2} \left[ |\bI|^2 - 2\bah^T\bPhi^T\bI + \bah^T\bPhi^T\bPhi\bah \right] + \frac{1}{2} \bah^T \, \bLambdaa(\bs)\, \bah \\
& = & -\lambda_N \bah^T\bb + \frac{1}{2} \bah^T\, \bH(\bs)\, \bah + \mbox{const.}\\
& = & -\lambda_N \bah^T\bb + \frac{1}{2} \bah^T \lambda_N \bb + \mbox{const.}\\
& = & -\frac{\lambda_N}{2} \bah^T\bb + \mbox{const.}
\end{eqnarray}
We have used here the substitution $\bH(\bs) \, \bah = \lambda_N \bb$, as this condition always holds when $E_{\ba|\bs}(\ba,\bs)$ is at its minimum with respect to $\ba$. The change in $\min_{\ba} E_{\ba|\bs}$ for a flip of $s_i$ is thus
\begin{eqnarray}
\Delta \min_\ba E_{\ba|\bs}(\flipsi) & = & -\frac{\lambda_N}{2} \bah_1^T \bb + \frac{\lambda_N}{2} \bah_0^T \bb \\
& = & -\frac{\lambda_N}{2} \Delta\ba^T \bb \; .
\end{eqnarray}
The total change in energy, $\Delta E(\flipsi)$, for a flip in $s_i$ is thus
\begin{equation}
\Delta E(\flipsi) = \frac{1}{2} \left[ \log \frac{\lambda_{a_i}(s_i)}{\lambda_{a_i}(\lnot{s_i})} + \lambda_{s_i}(\lnot{s_i}-s_i) - \lambda_N \Delta\ba^T \bb \right] \; . \label{eq:dE1}
\end{equation}
The challenge at this point is to compute $\Delta\ba$ quickly. Let's say that we have already computed the solution for $\ba$ for a current setting of $\bs$,
\begin{equation}
\bH(\bs_0) \, \bah_0 = \lambda_N \bb \, , \label{eq:asol0}
\end{equation}
and that we also have the inverse of $\bH(\bs_0)$:
\begin{equation}
\bJ(\bs_0)=\bH(\bs_0)^{-1} \; .
\end{equation}
Now if we wish to compute the change in $\bah$ that would be incurred by flipping a single binary state variable, $s_k$, then we need to solve the problem
\begin{equation}
\left[ \bH(\bs) + \Delta\bLambdaa(\flipsk) \right]\, \bah_1 = \lambda_N \bb \; , \label{eq:asol1}
\end{equation}
where $\Delta\bLambdaa(\flipsk)$ has a single non-zero element (always on the diagonal):
\begin{equation}
\Delta\bLambdaa(\flipsk)_{ij}=\left\{ \begin{array}{ll} \Delta\lambda_{a_k}=\lambda_{a_k}(\lnot{s_k})-\lambda_{a_k}(s_k) & i=k,\,j=k \\ 0 & \mbox{otherwise} \end{array} \right. \; .
\end{equation}
The change in $\bah$ is then
\begin{eqnarray}
\Delta \ba & = & \bah_1 - \bah_0 \\
& = & -\left[\frac{\Delta\lambda_{a_k}\, a_k} {1+\Delta\lambda_{a_k}\, J(\bs_0)_{kk}} \right] \bJ(\bs_0)_k \; , \label{eq:change-sol}
\end{eqnarray}
where $a_k$ is the $k$-th element of $\bah_0$ and $\bJ(\bs_0)_k$ is the $k$-th column of $\bJ(\bs_0)$ (see appendix~\ref{app:change-sol} for proof).
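In code, the rank-one coefficient update and the resulting fast $\Delta E$ for Method~1 might read as follows (a sketch; {\tt J} is $\bJ(\bs_0)$, {\tt a} is $\bah_0$, {\tt b} is $\bPhi^T\bI$, and {\tt lam\_a} maps a binary state to its $\lambda_a$ value, as in the earlier sketches):
\begin{verbatim}
import numpy as np

def delta_E_method1(J, a, b, s, k, lam_N, lam_a, lam_s):
    # Fast Delta E for flipping s_k under Method 1 (eq. dE1)
    sk, nk = s[k], 1 - s[k]
    dlam = lam_a[nk] - lam_a[sk]                            # Delta lambda_a_k
    da = -(dlam * a[k] / (1.0 + dlam * J[k, k])) * J[:, k]  # eq. change-sol
    return 0.5 * (np.log(lam_a[sk] / lam_a[nk])
                  + lam_s[k] * (nk - sk)
                  - lam_N * (da @ b))
\end{verbatim}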
Note however that we must update $\bJ(\bs)$ if the change to $s_k$ is accepted. For this we use the Sherman--Morrison formula:
\begin{equation}
\bJ(\bs_1) = \bJ(\bs_0) - \left[ \frac{\Delta\lambda_{a_k}} {1+\Delta\lambda_{a_k}\, J(\bs_0)_{kk}} \right] \bJ(\bs_0)_k \bJ(\bs_0)_k^T \; .
\end{equation}

\subsection{Method 2: $\max_{\bs} \int P(\ba,\bs|\bI,\params) d\ba$}

Here we first seek a maximum of the posterior over $\bs$ only:
\begin{eqnarray}
\bsh & = & \arg\max_{\bs} P(\bs|\bI,\params) \\
& = & \arg\max_{\bs} \int P(\ba,\bs|\bI,\params) d\ba \\
& = & \arg\max_{\bs} \int P(\ba,\bs,\bI|\params) d\ba \\
& = & \arg\max_{\bs} P(\bs|\params) \int P(\bI|\ba,\params)\, P(\ba|\bs,\params)\, d\ba \\
& = & \arg\max_{\bs} \frac{1}{Z_{\bLambdas}} e^{-\frac{1}{2} \bs^T \, \bLambdas\, \bs} \, \int \frac{1}{Z_{\lambda_N}} e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} \, \frac{1}{Z_{\bLambdaa(\bs)}} e^{-\frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba} \, d\ba\\
& = & \arg\max_{\bs} \frac{1}{Z_{\bLambdas}} e^{-\frac{1}{2} \bs^T \, \bLambdas\, \bs} \, \frac{1}{Z_{\lambda_N}} \, \frac{1}{Z_{\bLambdaa(\bs)}} \, \int e^{-E_{\ba|\bs}(\ba,\bs)} d\ba \label{eq:E-integral}
\end{eqnarray}
where $E_{\ba|\bs}(\ba,\bs)$ is the same as defined previously:
\begin{equation}
E_{\ba|\bs}(\ba,\bs) = \frac{\lambda_N}{2}|\bI-\bPhi \ba|^2 + \frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba \; .
\end{equation}
Because this term is quadratic in $\ba$, it may be rewritten exactly in the form
\begin{eqnarray}
E_{\ba|\bs}(\ba,\bs) & = & E_{\ba|\bs}(\bah,\bs) + \frac{1}{2}(\ba-\bah)^T \bH(\bs) (\ba-\bah) \\
\bah & = & \arg\min_{\ba} E_{\ba|\bs}(\ba,\bs) \\
\bH(\bs) & = & \nabla\nabla_{\ba} E_{\ba|\bs}(\ba,\bs) = \lambda_N \bPhi^T\bPhi + \bLambdaa(\bs)
\end{eqnarray}
and so the integral $\int e^{-E_{\ba|\bs}(\ba,\bs)}d\ba$ evaluates to
\begin{eqnarray}
\int e^{-E_{\ba|\bs}(\ba,\bs)} d\ba & = & e^{-E_{\ba|\bs}(\bah,\bs)} \, \int e^{-\frac{1}{2}(\ba-\bah)^T \bH(\bs) (\ba-\bah)} d\ba \\
& = & e^{-E_{\ba|\bs}(\bah,\bs)} \, (2\pi)^{\frac{n}{2}} |\det \bH(\bs)|^{-\frac{1}{2}} \; .
\end{eqnarray}
Our optimization problem is then
\begin{eqnarray}
\bsh & = & \arg\max_{\bs} \frac{1}{Z_{\bLambdas}} e^{-\frac{1}{2} \bs^T \, \bLambdas\, \bs} \, \frac{1}{Z_{\lambda_N}} \, \frac{1}{Z_{\bLambdaa(\bs)}} \, e^{-E_{\ba|\bs}(\bah,\bs)} \, (2\pi)^{\frac{n}{2}} |\det \bH(\bs)|^{-\frac{1}{2}} \\
& = & \arg\min_{\bs} E(\bs)
\end{eqnarray}
where
\begin{eqnarray}
E(\bs) & = & -\log P(\bs|\bI,\params) \\
& = & \frac{1}{2} \bs^T \, \bLambdas\, \bs + \log Z_{\bLambdaa(\bs)} + E_{\ba|\bs}(\bah,\bs) + \frac{1}{2}\log\det \bH(\bs) + \mbox{const.} \label{eq:Es}
\end{eqnarray}
(Note the sign of the $\log\det$ term: the Gaussian integral contributes $|\det\bH(\bs)|^{-1/2}$, so its negative log enters with a positive sign.) We can now perform stochastic hill-climbing on $P(\bs|\bI,\params)$ by flipping bits $s_i$ according to
\begin{equation}
P(\flipsi) = \frac{1}{1 + e^{\Delta E(\flipsi)/T}} \; .
\end{equation}
We have already calculated how the first three terms in (\ref{eq:Es}) are affected by a bit flip. The only thing remaining is to figure out the change in $\log \det \bH(\bs)$ for a single flip $\flipsi$. Here we use the relation
\begin{equation}
\det \bH(\bs_1) = \det\bH(\bs_0) \, (1 + \Delta\lambda_{a_i} J(\bs_0)_{ii}) \, , \label{eq:change-det}
\end{equation}
(see appendix~\ref{app:change-det} for proof), and so
\begin{equation}
\Delta\log\det\bH(\flipsi) = \log\left( 1 + \Delta\lambda_{a_i} J(\bs_0)_{ii} \right) \; .
\end{equation}
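In code, committing an accepted flip (the rank-one updates of $\bah$ and $\bJ$) and evaluating the determinant ratio needed by Method~2 cost $O(n^2)$ and $O(1)$ operations, respectively, instead of a fresh $O(n^3)$ solve. A sketch, with arrays updated in place:
\begin{verbatim}
import numpy as np

def accept_flip(J, a, s, k, lam_a):
    # Commit a flip of s_k: update a_hat (eq. change-sol) and
    # J = H^{-1} (Sherman-Morrison), then flip the bit itself.
    dlam = lam_a[1 - s[k]] - lam_a[s[k]]
    c = dlam / (1.0 + dlam * J[k, k])
    Jk = J[:, k].copy()
    a -= (c * a[k]) * Jk          # a_hat_1 = a_hat_0 + Delta a
    J -= c * np.outer(Jk, Jk)     # Sherman-Morrison update
    s[k] = 1 - s[k]

def delta_logdet(J, s, k, lam_a):
    # log det H(s_1) - log det H(s_0) for a flip of s_k (eq. change-det)
    dlam = lam_a[1 - s[k]] - lam_a[s[k]]
    return np.log(1.0 + dlam * J[k, k])
\end{verbatim}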
Collecting all four terms, the total change in energy for Method~2 is
\begin{equation}
\Delta E(\flipsi) = \frac{1}{2} \left[ \log \frac{\lambda_{a_i}(s_i)}{\lambda_{a_i}(\lnot{s_i})} + \lambda_{s_i}(\lnot{s_i}-s_i) -\lambda_N \Delta\ba^T \bb + \log\left( 1 + \Delta\lambda_{a_i} J(\bs_0)_{ii} \right) \right] \; . \label{eq:dE2}
\end{equation}

\subsection{Algorithm for finding an optimal representation}

\noindent 1. Initialize $\bs$ to all 1's. Compute $\bH(\bs)$ and its inverse $\bJ$, and solve for $\ba$. Set $T=1$.\\
\\
2. Loop over $k$=1:{\tt num\_coefficients}\\
\\
\indent 2a. Compute $\Delta E(\flipsk)$ according to either Method~1 or 2 (eq.~\ref{eq:dE1} or \ref{eq:dE2}). \\
\\
\indent 2b. Accept flip with probability $P(\flipsk)$. \\
\\
\indent 2c. If flip accepted, $\flipsk$, $\ba \leftarrow \bah_1$, $\bJ \leftarrow \bJ(\bs_1)$. \\
\\
\indent End loop.\\
\\
3. Lower $T$ and go to 2 if $T>T_{\mathrm{min}}$; otherwise stop.

\section{Learning}

Our strategy for learning is to adjust the parameters, $\params$, to maximize the average log-likelihood of images under the model:
\begin{equation}
\hat{\params} = \arg\max_{\params} \langle \log P(\bI|\params) \rangle \; .
\end{equation}
We shall do this via gradient ascent. In what follows, we derive the derivative for each parameter; for clarity we suppress the average over images.

\subsection{$\bLambdas$}

\begin{eqnarray}
\Delta \lambda_{s_i} & \propto & \pd{\lambda_{s_i}} \log P(\bI|\params) \\
& = & \frac{1}{P(\bI|\params)} \pd{\lambda_{s_i}} P(\bI|\params) \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} \pd{\lambda_{s_i}} P(\bs|\params) \int P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} \frac{1}{2}\left[ \langle s_i \rangle_{P(s_i|\params)} - s_i \right] P(\bs|\params) P(\bI|\bs,\params) \label{eq:si_prob-deriv} \\
& = & \frac{1}{2} \left[ \langle s_i \rangle_{P(s_i|\params)} - \langle s_i \rangle_{P(\bs|\bI,\params)} \right] \; .
\end{eqnarray}
To get (\ref{eq:si_prob-deriv}) we have used the general relation
\begin{equation}
\pd{\lambda} P(x|\lambda) = \left[ \langle f(x) \rangle_{P(x|\lambda)} - f(x) \right] P(x|\lambda)\; , \label{eq:prob-deriv}
\end{equation}
which holds when $P(x|\lambda)$ is of the form
\begin{equation}
P(x|\lambda)=\frac{1}{Z_\lambda} e^{-\lambda f(x)} \; .
\end{equation}
(See appendix~\ref{app:prob-deriv} for proof.) Applying this relation with $\lambda=\lambda_{s_i}$ and $f(\bs)=s_i/2$ (the remaining terms in the exponent do not depend on $\lambda_{s_i}$) yields the factor of $\frac{1}{2}$ in (\ref{eq:si_prob-deriv}).

\subsection{$\bLambdaa(\bs)$}

\begin{eqnarray}
\Delta \lambda_{a_i}(t) & \propto & \pd{\lambda_{a_i}(t)} \log P(\bI|\params) \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \int P(\bI|\ba,\params) \pd{\lambda_{a_i}(t)} P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \nonumber \\
& & \int P(\bI|\ba,\params) \frac{1}{2} \delta(s_i-t) \left[ \langle a_i^2 \rangle_{P(a_i|t,\lambda_{a_i}(t))} - a_i^2 \right] P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{2} \left\langle \delta(s_i-t) \left[ \langle a_i^2 \rangle_{P(a_i|t,\lambda_{a_i}(t))} - a_i^2 \right] \right\rangle_{P(\ba,\bs|\bI,\params)} \\
& = & \frac{1}{2} \left\langle \delta(s_i-t) \left\langle \left[ \frac{1}{\lambda_{a_i}(t)} - a_i^2 \right] \right\rangle_{P(\ba|\bs,\bI,\params)} \right\rangle_{P(\bs|\bI,\params)} \; ,
\end{eqnarray}
where $\delta$ is the Kronecker delta and $t$ takes on the values 0,1. We defer the problem of computing $\langle a_i^2 \rangle_{P(\ba|\bs,\bI,\params)}$ until the end of this section, as it is common to the other derivatives that follow.
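Of the gradients derived so far, the $\bLambdas$ update requires none of the deferred moments: the prior marginal of a binary unit is available in closed form as $\langle s_i \rangle_{P(s_i|\params)} = 1/(1+e^{\lambda_{s_i}/2})$. A sketch of the resulting ascent step (the learning rate and the posterior-mean estimate are placeholders):
\begin{verbatim}
import numpy as np

def update_lam_s(lam_s, s_post_mean, lr=0.01):
    # One gradient-ascent step on each lambda_s_i.
    # s_post_mean estimates <s_i> under P(s|I,theta),
    # e.g. averaged over Gibbs samples of s.
    prior_mean = 1.0 / (1.0 + np.exp(lam_s / 2.0))
    return lam_s + lr * 0.5 * (prior_mean - s_post_mean)
\end{verbatim}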
\subsection{$\lambda_N$}

\begin{eqnarray}
\Delta \lambda_N & \propto & \pd{\lambda_N} \log P(\bI|\params) \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \int \pd{\lambda_N} P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \nonumber \\
& & \int \frac{1}{2} \left[ \left\langle |\bI-\bPhi\ba|^2 \right\rangle_{P(\bI|\ba,\params)} - |\bI-\bPhi\ba|^2 \right] P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{2} \left\langle \left[ \frac{n}{\lambda_N} - |\bI-\bPhi\ba|^2 \right] \right\rangle_{P(\ba,\bs|\bI,\params)} \\
& = & \frac{1}{2} \left[ \frac{n}{\lambda_N} - \left\langle \left\langle |\bI-\bPhi\ba|^2 \right\rangle_{P(\ba|\bs,\bI,\params)} \right\rangle_{P(\bs|\bI,\params)} \right] \; ,
\end{eqnarray}
where $n$ here denotes the dimensionality of $\bI$. We need to dissect $|\bI-\bPhi\ba|^2$ in terms of the moments of $\ba$ that need to be computed:
\begin{eqnarray}
\langle |\bI-\bPhi\ba|^2 \rangle & = & \langle |\bI|^2 - 2\ba^T\bPhi^T\bI + \ba^T\bPhi^T\bPhi\ba \rangle \\
& = & |\bI|^2 - 2\langle\ba^T\rangle\bb + \bone^T\left[\langle\ba\ba^T\rangle \circ (\bPhi^T\bPhi)\right]\bone \; ,
\end{eqnarray}
where $\circ$ denotes the element-wise (Hadamard) product, so that the last term equals $\mbox{tr}\left(\langle\ba\ba^T\rangle\,\bPhi^T\bPhi\right)$, and all averages are under the distribution $P(\ba|\bs,\bI,\params)$. Again, we defer the problem of computing these moments until the end of this section.

\subsection{$\bPhi$}

\begin{eqnarray}
\Delta \bPhi & \propto & \pd{\bPhi} \log P(\bI|\params) \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \int \pd{\bPhi} P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba \\
& = & \frac{1}{P(\bI|\params)} \sum_{\bs} P(\bs|\params) \int \lambda_N \left[\bI-\bPhi\ba \right]\ba^T P(\bI|\ba,\params) P(\ba|\bs,\params) d\ba \label{eq:dPhi} \\
& = & \lambda_N \left\langle \left[\bI -\bPhi\ba \right] \ba^T \right\rangle_{P(\ba,\bs|\bI,\params)} \\
& = & \lambda_N \left\langle \left[ \bI\langle\ba^T\rangle_{P(\ba|\bs,\bI,\params)} - \bPhi\langle\ba\ba^T\rangle_{ P(\ba|\bs,\bI,\params)} \right] \right\rangle_{P(\bs|\bI,\params)} \; ,
\end{eqnarray}
where the second step is proved in appendix~\ref{app:dPhi}.

\subsection{Computing $\langle\ba\rangle_{P(\ba|\bs,\bI,\params)}$ and $\langle\ba\ba^T\rangle_{P(\ba|\bs,\bI,\params)}$}

The posterior over $\ba$, $P(\ba|\bs,\bI,\params)$, is given by
\begin{eqnarray}
P(\ba|\bs,\bI,\params) & = & k\, P(\bI|\ba,\params) P(\ba|\bs,\params) \\
& = & k\, \frac{1}{Z_{\lambda_N}} e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} \frac{1}{Z_{\bLambdaa(\bs)}} e^{-\frac{1}{2} \ba^T \, \bLambdaa(\bs)\, \ba} \\
& = & \frac{1}{Z_{\bH(\bs)}} e^{-E_{\ba|\bs}(\ba,\bs)}
\end{eqnarray}
where $k$ is a normalizing constant and
\begin{equation}
E_{\ba|\bs}(\ba,\bs) = E_{\ba|\bs}(\bah,\bs) + \frac{1}{2}(\ba-\bah)^T \bH(\bs) (\ba-\bah) \; .
\end{equation}
Thus,
\begin{equation}
\langle\ba\rangle_{P(\ba|\bs,\bI,\params)} = \bah \; .
\end{equation}
Now to compute $\langle\ba\ba^T\rangle_{P(\ba|\bs,\bI,\params)}$ we use the fact that
\begin{equation}
\langle(\ba-\bah)(\ba-\bah)^T\rangle_{P(\ba|\bs,\bI,\params)} = \bH(\bs)^{-1}
\end{equation}
and so
\begin{eqnarray}
\langle\ba\ba^T\rangle_{P(\ba|\bs,\bI,\params)} & = & \bH(\bs)^{-1} + \bah\bah^T \\
& = & \bJ + \bah\bah^T \equiv \bK \; .
\end{eqnarray}

\subsection{Algorithm for adapting parameters}

\noindent 1. Initialize parameters: $\bLambdas, \bLambdaa(\bs), \lambda_N, \bPhi$.\\
\\
2. Loop over images\\
\\
\indent 2a. Gibbs sample on $P(\bs|\bI)$. Collect statistics $\langle \bs \rangle$, $\langle \bah \rangle$, $\langle \bJ(\bs) + \bah\bah^T \rangle$ (see the sketch following this list).\\
\\
\indent 2b. Accumulate statistics.\\
\\
3. Update parameters according to statistics accumulated over images.\\
\\
4. Go to 2.
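For steps 2a--2b, the posterior moments are available in closed form for each Gibbs sample of $\bs$; the following sketch accumulates the $\bPhi$ gradient this way. Function and argument names are hypothetical, and for clarity it inverts $\bH(\bs)$ directly rather than maintaining $\bJ$ incrementally.
\begin{verbatim}
import numpy as np

def phi_gradient(I, Phi, lam_N, lam_a, s_samples):
    # For each sampled s: <a> = a_hat and <a a'> = J + a_hat a_hat' = K,
    # giving dPhi proportional to lam_N * < I <a>' - Phi <a a'> >.
    dPhi = np.zeros_like(Phi)
    b, G = Phi.T @ I, Phi.T @ Phi
    for s in s_samples:
        prec = np.where(s == 1, lam_a[1], lam_a[0])
        J = np.linalg.inv(lam_N * G + np.diag(prec))  # H(s)^{-1}
        a_hat = lam_N * (J @ b)                       # <a>
        K = J + np.outer(a_hat, a_hat)                # <a a'>
        dPhi += lam_N * (np.outer(I, a_hat) - Phi @ K)
    return dPhi / len(s_samples)
\end{verbatim}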
\vspace{0.25in}
\noindent {\bf Subroutine for Gibbs sampling on $P(\bs|\bI)$:}\\
\\
\noindent 1. Initialize $\bs$ to all 1's. Compute $\bH(\bs)$ and its inverse $\bJ$, and solve for $\ba$. \\
\\
2. Loop over $k$=1:{\tt num\_coefficients}\\
\\
\indent 2a. Compute $\Delta E(\flipsk)$ according to Method 2 (eq.~\ref{eq:dE2}). \\
\\
\indent 2b. Accept flip with probability $P(\flipsk)$. \\
\\
\indent 2c. If flip accepted, $\flipsk$, $\ba \leftarrow \bah_1$, $\bJ \leftarrow \bJ(\bs_1)$. \\
\\
\indent End loop.\\
\\
3. Go to 2 (for desired \# of sweeps).

\appendix

\section{Proof of~(\ref{eq:change-sol})}
\label{app:change-sol}

Comparing equations (\ref{eq:asol0}) and (\ref{eq:asol1}), which are to be solved for $\bah_0$ and $\bah_1$, respectively, we obtain
\begin{equation}
\left[ \bH(\bs) + \Delta\bLambdaa(\flipsk) \right]\, \bah_1 = \bH(\bs) \, \bah_0 \; ,
\end{equation}
where $\bs=\bs_0$ (the state which leads to $\bah_0$). This yields
\begin{eqnarray}
\bah_1 - \bah_0 & = & \left[ \bH(\bs) + \Delta\bLambdaa(\flipsk) \right]^{-1} \bH(\bs) \, \bah_0 - \bah_0 \\
& = & \left[ \bH(\bs) + \Delta\bLambdaa(\flipsk) \right]^{-1} \left[ \bH(\bs) - \bH(\bs) - \Delta\bLambdaa(\flipsk) \right] \bah_0 \\
& = & -\left[ \bH(\bs) + \Delta\bLambdaa(\flipsk) \right]^{-1} \Delta\bLambdaa(\flipsk) \, \bah_0 \\
& = & -\left[ \bone + \bJ(\bs_0)\, \Delta\bLambdaa(\flipsk) \right]^{-1} \bJ(\bs_0)\, \Delta\bLambdaa(\flipsk) \, \bah_0 \\
& = & -\left[\frac{\Delta\lambda_{a_k}\, a_k} {1+\Delta\lambda_{a_k}\, J(\bs_0)_{kk}} \right] \bJ(\bs_0)_k \; ,
\end{eqnarray}
where $\bone$ here denotes the identity matrix, $a_k$ is the $k$-th element of $\bah_0$, and $\bJ(\bs_0)_k$ is the $k$-th column of $\bJ(\bs_0)=\bH(\bs_0)^{-1}$. The last step follows because $\Delta\bLambdaa(\flipsk)\,\bah_0 = \Delta\lambda_{a_k}\, a_k\, \be_k$, so that $\bJ(\bs_0)\,\Delta\bLambdaa(\flipsk)\,\bah_0 = \Delta\lambda_{a_k}\, a_k\, \bJ(\bs_0)_k$, and the rank-one matrix $\bJ(\bs_0)\,\Delta\bLambdaa(\flipsk) = \Delta\lambda_{a_k}\, \bJ(\bs_0)_k \be_k^T$ allows the inverse to be applied in closed form via the Sherman--Morrison formula.

\section{Proof of~(\ref{eq:change-det})}
\label{app:change-det}

Expanding the determinant along its $i$-th row (the cofactor, or Laplace, expansion), we have
\begin{equation}
\det \bH = \sum_j H_{ij} \det \bH_{i,\be_j}
\end{equation}
where
\begin{equation}
\bH_{i,\be_j} = \left( \begin{array}{c} \bH_1 \\ \vdots \\ \be_j \\ \vdots \end{array} \right) \begin{array}{c}\phantom{\bH_1} \\ \phantom{\vdots} \\ \mbox{ $\leftarrow$ $i$-th row } \\ \phantom{\vdots} \end{array}
\end{equation}
is obtained from the matrix $\bH$ by replacing its $i$-th row by the unit vector $\be_j$, which is $1$ at the $j$-th position and zero elsewhere. Rows $\bH_k$ of $\bH$ at other positions, $k \neq i$, remain unchanged. As $\bH(\bs_1)$ differs from $\bH(\bs_0)$ only in the single diagonal element $H_{ii}$, we have
\begin{eqnarray}
\det \bH(\bs_1) & = & \sum_{j \neq i} H_{ij}(\bs_0) \det \bH_{i,\be_j}(\bs_0) + H_{ii}(\bs_1) \det \bH_{i,\be_i}(\bs_0) \nonumber \\
& & + \, H_{ii}(\bs_0) \det \bH_{i,\be_i}(\bs_0) - H_{ii}(\bs_0) \det \bH_{i,\be_i}(\bs_0) \\
& = & \det \bH(\bs_0) + (H_{ii}(\bs_1) - H_{ii}(\bs_0)) \det \bH_{i,\be_i}(\bs_0) \\
& = & \det \bH(\bs_0) \, \left(1 + \Delta \lambda_{a_i} \det \left( \bH_{i,\be_i}(\bs_0) \, \bH(\bs_0)^{-1} \right)\right) \\
& = & \det \bH(\bs_0) \, (1 + \Delta\lambda_{a_i} J(\bs_0)_{ii})
\end{eqnarray}
where we made use of $\det \bH_{i,\be_i}(\bs_0) / \det \bH(\bs_0) = \det (\bH_{i,\be_i}(\bs_0) \, \bH(\bs_0)^{-1})$; the product of these matrices is the identity matrix with its $i$-th row replaced by the $i$-th row of $\bH(\bs_0)^{-1}$, whose determinant is $J(\bs_0)_{ii}$.

\section{Proof of~(\ref{eq:prob-deriv})}
\label{app:prob-deriv}

We have
\begin{equation}
P(x|\lambda)=\frac{1}{Z_\lambda} e^{-\lambda f(x)} \;\;\;\mbox{ and }\;\;\; Z_\lambda=\int e^{-\lambda f(x)} dx\; .
\end{equation}
Then
\begin{eqnarray}
\pd{\lambda} P(x|\lambda) & = & -\frac{e^{-\lambda f(x)}}{Z_\lambda^2} \, \pd{\lambda} Z_\lambda + \frac{1}{Z_\lambda} \pd{\lambda} e^{-\lambda f(x)} \\
& = & +\frac{e^{-\lambda f(x)}}{Z_\lambda} \, \frac{\int f(x) e^{-\lambda f(x)} dx}{Z_\lambda} + \frac{-f(x) e^{-\lambda f(x)}}{Z_\lambda} \\
& = & \left[ \langle f(x) \rangle_{P(x|\lambda)} - f(x) \right] P(x|\lambda)\; .
\end{eqnarray}

\clearpage
\section{Proof of~(\ref{eq:dPhi})}
\label{app:dPhi}

Because
\begin{equation}
Z_{\lambda_N} = \int e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} d\bI
\end{equation}
does not depend on $\bPhi$, we have
\begin{eqnarray}
\pd{\bPhi} P(\bI|\ba,\params) & = & \frac{1}{Z_{\lambda_N}} \pd{\bPhi} e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} \\
& = & \frac{1}{Z_{\lambda_N}} \lambda_N \left[\bI-\bPhi\ba \right]\ba^T e^{-\frac{\lambda_N}{2}|\bI-\bPhi\ba|^2} \\
& = & \lambda_N \left[\bI-\bPhi\ba \right]\ba^T P(\bI|\ba,\params) \; .
\end{eqnarray}
The matrix-calculus details are as follows:
\[ |\bI-\bPhi\ba|^2 \; = \; (\bI-\bPhi\ba)^T (\bI - \bPhi\ba) \; = \; \bI^T \bI \, - \, 2\, \bI^T \bPhi\ba \, + \, \ba^T \bPhi^T \bPhi\ba \; , \]
where we have used the fact that the scalar $(\bPhi\ba)^T \bI = \ba^T \bPhi^T \bI$ equals its own transpose $\bI^T \bPhi\ba$. Furthermore,
\[ \pd{\bPhi} \left(\ba^T \bPhi^T \bPhi\ba\right) \; = \; 2\, \bPhi\, \ba\ba^T \;\;\;\mbox{ and }\;\;\; \pd{\bPhi} \left(\bI^T \bPhi\ba\right) \; = \; \bI \, \ba^T \; . \]
(Source: Helmut L{\"u}tkepohl, {\em Handbook of Matrices}, Wiley, 1997, ISBN 0471966886, p.~179.)
\end{document}