% I've made the source of this available September 2004 as I realised people
% might want to adapt this for their own courses. I guess it's LaTeX2E. I did
% some not very pretty fudging to fit stuff in how I wanted it. I used psnup to
% fit the four pages onto a double sided sheet of A4 paper. Not pretty, but was
% good enough.
%
% I'd be interested to see any improved or alternative versions of this
% document. Also LaTeX wizards: please feel free to show me how to do this
% better.
%
% Iain Murray, http://www.gatsby.ucl.ac.uk/~iam23/

\documentclass[a4paper]{article}
\usepackage{amsmath,amsthm,amsfonts}
\usepackage{graphicx}
\usepackage[bottom]{footmisc} % normal footnotes weren't behaving

% Text position and margin fudging
\addtolength{\oddsidemargin}{-0.7in}
\addtolength{\evensidemargin}{-0.7in}
\addtolength{\marginparwidth}{+0.7in}
\addtolength{\marginparpush}{+5pt}
\let\oldmarginpar\marginpar
\renewcommand\marginpar[1]{\-\oldmarginpar[\raggedleft\small\sf #1]{\raggedright\small\sf #1}}

% I use \prob for continuous distributions and \Prob for discrete (except when
% I forget). I can then change these to reformat all probabilities to something
% fancier if I want.
\newcommand{\prob}[1]{p\left( #1 \right)}
\newcommand{\Prob}[1]{P\left( #1 \right)}

% Common stuff for linear algebra
\newcommand{\eye}{\mathbb I}
\newcommand{\bx}{{\bf x}}
\newcommand{\by}{{\bf y}}
\newcommand{\be}{{\bf e}}

% calculus
\newcommand{\ddd}[2]{\frac{{\rm d}^{2} #1}{{\rm d} #2 ^{2}}}
\newcommand{\dd}[2]{\frac{{\rm d} #1}{{\rm d} #2}}

% Misc
\newcommand{\ith}{^{\left( i \right)}}
\newcommand{\te}{\!=\!} % thin equals

\title{Background material crib-sheet}
\author{Iain Murray $<${\tt i.murray+ta@gatsby.ucl.ac.uk}$>$, October 2003}
\date{}

\begin{document}
\maketitle

{\small \em Here is a summary of results with which you should be familiar. If
anything here is unclear you should do some further reading and exercises.}

\section{Probability Theory}

{\small \em Chapter 2, sections 2.1--2.3 of David MacKay's book covers this material:\\
\indent {\tt http://www.inference.phy.cam.ac.uk/mackay/itila/book.html}}

\bigskip \noindent%
\marginpar{Notation}\ The probability that a discrete variable $A$ takes value $a$
is:\quad $0 \leq \Prob{A\!=\!a} \leq 1$

\bigskip \noindent
\marginpar{Alternatives}\ Probabilities of alternatives add:
$\Prob{A \!=\! a~{\mathrm or}~a^\prime}=\Prob{A \!=\! a} + \Prob{A \!=\! a^\prime}$

\bigskip \noindent
\marginpar{Normalisation}\ The probabilities of all outcomes must sum to one:
$\displaystyle \sum_{\mathrm{all~possible~} a} \Prob{A\!=\!a} = 1$

\bigskip \noindent
\marginpar{Joint Probability}\ $\Prob{A\!=\!a,B\!=\!b}$ is the joint probability
that both $A\!=\!a$ and $B\!=\!b$ occur.

\bigskip \noindent
\marginpar{Marginalisation}\ Variables can be ``summed out'' of joint distributions:
\[
\Prob{A\te a} = \sum_{\mathrm{all~possible~} b} \Prob{A\!=\!a,B\!=\!b}
\]

\bigskip \noindent
\marginpar{Conditional Probability}\ $\Prob{A\!=\!a|B\!=\!b}$ is the probability
that $A\!=\!a$ occurs given the knowledge that $B\!=\!b$.

\bigskip \noindent
\marginpar{Product Rule}\
$\Prob{A\!=\!a,B\!=\!b} = \Prob{A\!=\!a}\Prob{B\!=\!b|A\!=\!a}
                        = \Prob{B\!=\!b}\Prob{A\!=\!a|B\!=\!b}$

\bigskip \noindent
\marginpar{Independence}\ The following hold, for all $a$ and $b$, {\em \bf if and
only if $A$ and $B$ are independent}:\\
\[
\begin{array}{rcl}
\Prob{A\!=\!a|B\!=\!b}&=&\Prob{A\!=\!a} \\
\Prob{B\!=\!b|A\!=\!a}&=&\Prob{B\!=\!b} \\
\Prob{A\!=\!a,B\!=\!b}&=&\Prob{A\!=\!a}\Prob{B\!=\!b}.
\end{array}
\]
Otherwise the product rule above {\em must} be used.

\bigskip \noindent
\marginpar{Bayes Rule}Bayes rule can be derived from the above:
\[
\Prob{A\te a|B\te b,{\mathcal H}} =
  \frac{\Prob{B\te b|A\te a,{\mathcal H}}\Prob{A\te a|{\mathcal H}}}{\Prob{B\te b|{\mathcal H}}}
  \propto \Prob{A\te a,B\te b|{\mathcal H}}
\]
Note that here, as with any expression, we are free to condition the whole thing
on any set of assumptions, ${\mathcal H}$, we like.
%Summing over all possible $a$ will give the ``evidence'' $\Prob{B\te b|{\mathcal H}}$,
Note that $\sum_a\Prob{A\te a,B\te b|{\mathcal H}}=\Prob{B\te b|{\mathcal H}}$ gives
the normalising constant of proportionality.
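
\bigskip \noindent
\marginpar{A worked example}As an illustration (the numbers here are invented
purely for practice), suppose $A$ and $B$ are binary with $\Prob{A\te 1}=0.2$
(so $\Prob{A\te 0}=0.8$ by normalisation), $\Prob{B\te 1|A\te 1}=0.9$ and
$\Prob{B\te 1|A\te 0}=0.3$. The rules above give:
\[
\begin{array}{rcl}
\Prob{A\te 1,B\te 1} &=& \Prob{A\te 1}\Prob{B\te 1|A\te 1} = 0.2\times 0.9 = 0.18 \\
\Prob{B\te 1} &=& \Prob{A\te 1,B\te 1} + \Prob{A\te 0,B\te 1}
                = 0.18 + 0.8\times 0.3 = 0.42 \\
\Prob{A\te 1|B\te 1} &=& \Prob{A\te 1,B\te 1}/\Prob{B\te 1} = 0.18/0.42 \approx 0.43 .
\end{array}
\]
Since $\Prob{A\te 1|B\te 1} \neq \Prob{A\te 1}$, these particular $A$ and $B$ are
not independent.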
\pagebreak

\noindent
\marginpar{Continuous variables}All the above theory basically still applies to
continuous variables if sums are converted into integrals\footnote{Integrals are
the equivalent of sums for continuous variables. Eg: $\sum_{i=1}^n f(x_i)\Delta x$
becomes the integral $\int_a^b f(x) \mathrm{d}x$ in the limit
$\Delta x \rightarrow 0$, $n\rightarrow \infty$, where $\Delta x = \frac{b-a}{n}$
and $x_i = a + i\Delta x$. Find an A-level text book with some diagrams if you
have not seen this before.}.
The probability that $X$ lies between $x$ and $x\!+\!\mathrm{d}x$ is $\prob{x}\mathrm{d}x$,
% yeah I know x+ dx isn't very rigorous. pah.
where $\prob{x}$ is a {\em probability density function} with range $[0,\infty)$.

\marginpar{\vspace{12pt}Continuous versions of some results}
\[
\Prob{x_1\!<\!X\!<\!x_2} = \int_{x_1}^{x_2} \prob{x} \mathrm{d}x \,, \;\;
\int_{-\infty}^{\infty} \prob{x} \mathrm{d}x = 1
\;\;\mbox{and}\;\;
\prob{x} = \int_{-\infty}^{\infty} \prob{x,y} \mathrm{d}y .
\]

\noindent
\marginpar{Expectations}The expectation or mean under a probability distribution is:
\[
\left< f(a) \right> = \sum_a \Prob{A\te a} f(a)
\;\;\;\mbox{or}\;\;\;
\left< f(x) \right> = \int_{-\infty}^{\infty} \prob{x} f(x) \mathrm{d}x
\]
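
\bigskip \noindent
For example (taking $f(a)\te a$), if $A$ is the roll of a fair six-sided die then
each outcome has probability $1/6$ and
\[
\left< a \right> = \sum_{a=1}^{6} \frac{1}{6}\, a = \frac{1}{6}(1+2+3+4+5+6) = 3.5 ,
\]
a value the die never actually shows; the mean need not be a possible outcome.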
\section{Linear Algebra}

{\em \small This is designed as a prequel to Sam Roweis's ``matrix identities'' sheet:\\
\indent {\tt http://www.cs.toronto.edu/\~{}roweis/notes/matrixid.pdf}}

\bigskip \noindent
Scalars are individual numbers, vectors are columns of numbers, matrices are
rectangular grids of numbers, eg:
\[
x=3.4, \;\;\;\;\;\;
\bx = \left( \begin{array}{c}x_1 \\ x_2 \\ \vdots \\ x_n \end{array} \right), \;\;\;\;\;\;
A = \left( \begin{array}{cccc}
A_{11} & A_{12} & \cdots & A_{1n} \\
A_{21} & A_{22} & \cdots & A_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
A_{m1} & A_{m2} & \cdots & A_{mn} \\
\end{array} \right)
\]

\bigskip \noindent
\marginpar{Dimensions}In the above example $x$ is $1\times 1$, $\bx$ is $n\times 1$
and $A$ is $m\times n$.

\bigskip \noindent
\marginpar{Transpose}The transpose operator, ${}^\top$ ( ${}^\prime$ in Matlab),
swaps the rows and columns:
\[
x^\top = x, \;\;\;\;\;\;
\bx^\top = \left( \begin{array}{cccc} x_1 & x_2 & \cdots & x_n \end{array} \right), \;\;\;\;\;\;
{\left(A^\top\right)}_{ij} = A_{ji}
\]

\bigskip \noindent
\marginpar{Multiplication}Quantities whose inner dimensions match may be
``multiplied'' by summing over this index. The outer dimensions give the
dimensions of the answer.
\[
A\bx \mbox{~has elements~} {(A\bx)}_i = \sum_{j=1}^n A_{ij} \bx_j
\;\;\; \mbox{and~} \;\;\;
{(AA^\top)}_{ij} = \sum_{k=1}^n A_{ik} {\left(A^\top\right)}_{kj}
                 = \sum_{k=1}^n A_{ik} A_{jk}
\]
\marginpar{Check Dimensions}All the following are allowed (the dimensions of the
answer are also shown; note that the last one, $\bx^\top A \bx$, also requires $m\te n$):
\[
\begin{array}{c} \bx^\top \bx \\ \mbox{\small $1\times 1$} \\ \mbox{\small scalar} \end{array}
\;\;\;\;\;\;
\begin{array}{c} \bx \bx^\top \\ \mbox{\small $n\times n$} \\ \mbox{\small matrix} \end{array}
\;\;\;\;\;\;
\begin{array}{c} A \bx \\ \mbox{\small $m\times 1$} \\ \mbox{\small vector} \end{array}
\;\;\;\;\;\;
\begin{array}{c} A A^\top \\ \mbox{\small $m\times m$} \\ \mbox{\small matrix} \end{array}
\;\;\;\;\;\;
\begin{array}{c} A^\top A \\ \mbox{\small $n\times n$} \\ \mbox{\small matrix} \end{array}
\;\;\;\;\;\;
\begin{array}{c} \bx^\top A \bx \\ \mbox{\small $1\times 1$} \\ \mbox{\small scalar} \end{array}
\, ,
\]
while $\bx \bx$, $AA$ and $\bx A$ {\em do not make sense} for $m\neq n\neq 1$.
Can you see why?
%{\em \bf Checking dimensions keeps you sane}.

\bigskip \noindent
\marginpar{Multiplication by scalar}An exception to the above rule is that we may
write: $xA$. Every element of the matrix $A$ is multiplied by the scalar $x$.

\bigskip \noindent
\marginpar{Easily proved results}Simple and valid manipulations:
\[
(AB)C= A(BC) \;\;\;\;\;\;
A(B+C)= AB + AC \;\;\;\;\;\;
(A+B)^\top = A^\top + B^\top \;\;\;\;\;\;
(AB)^\top = B^\top A^\top
\]
Note that $AB\neq BA$ in general.

\subsection{Square Matrices}

Now consider the square $n\times n$ matrix $B$.

\bigskip \noindent
\marginpar{Diagonal matrices, the Identity}All off-diagonal elements of diagonal
matrices are zero. The ``Identity matrix'', which leaves vectors and matrices
unchanged on multiplication, is diagonal with ones on the diagonal.
\[
\begin{array}{c}
\begin{array}{rcl}
B_{ij} = 0 \mbox{~if~} i \! \neq\! j &\Leftrightarrow& \mbox{``$B$ is diagonal''} \\
\eye_{ij} = 0 \mbox{~if~} i \! \neq\! j \mbox{~and~} \eye_{ii}=1\;\;\forall i
  &\Leftrightarrow& \mbox{``$\eye$ is the identity matrix''}
\end{array} \\
\eye \bx = \bx \;\;\;\;\;\; \eye B = B = B \eye \;\;\;\;\;\; \bx^\top \eye = \bx^\top
\end{array}
\]

\noindent
\marginpar{Inverses}Some square matrices have inverses:
\[
B^{-1}B= BB^{-1}=\eye \;\;\;\;\;\; {\left(B^{-1}\right)}^{-1}= B \, ,
\]
which have these properties:
\[
(BC)^{-1} = C^{-1}B^{-1} \;\;\;\;\;\;
{\left(B^{-1}\right)}^\top = {\left(B^\top\right)}^{-1}
\]

\noindent
\marginpar{Solving Linear equations}Linear simultaneous equations could be solved
(inefficiently) this way:
\[
\mbox{if~} B\bx = \by \mbox{~then~} \bx = B^{-1}\by
\]

\noindent
Some other commonly used matrix definitions include:
\marginpar{\vspace{12pt}Symmetry}
\[
B_{ij} = B_{ji} \Leftrightarrow \mbox{``$B$ is symmetric''}
\]
\vspace{-12pt}
\marginpar{\vspace{12pt}Trace}
\[
\mathrm{Trace}(B)= \mathrm{Tr}(B) = \sum_{i=1}^n B_{ii}=\mbox{``sum of diagonal elements''}
\]
\marginpar{A Trace Trick}Cyclic permutations are allowed inside trace. Trace of a
scalar is a scalar:
\[
\mathrm{Tr}(BCD)= \mathrm{Tr}(DBC) = \mathrm{Tr}(CDB) \;\;\;\;\;\;
\bx^\top B\bx = \mathrm{Tr}(\bx^\top B\bx) = \mathrm{Tr}(\bx \bx^\top B)
\]

\bigskip \noindent
\marginpar{Determinants}The determinant\footnote{This section is only intended to
give you a flavour so you understand other references and Sam's crib sheet. More
detailed history and overview is here: {\tt http://www.wikipedia.org/wiki/Determinant}}
is written Det$(B)$ or $|B|$. It is a scalar regardless of $n$.
\[
|BC|=|B||C| \, , \;\;\;\;\;\;
|x|= x \, , \;\;\;\;\;\;
|xB| %= |x\eye||B|
= x^n |B| \, , \;\;\;\;\;\;
\left|B^{-1}\right| = \frac{1}{|B|} \, .
\]
It {\em determines} if $B$ can be inverted: $|B|\!=\!0 \Rightarrow B^{-1}$ undefined.
If the vector to every point of a shape is pre-multiplied by $B$ then the shape's
area or volume is scaled by a factor of $|B|$. It also appears in the normalising
constant of a Gaussian. For a diagonal matrix the volume scaling factor is simply
the product of the diagonal elements. In general the determinant is the product of
the eigenvalues.
\marginpar{\vspace{12pt}Eigenvalues, Eigenvectors}
\[
B \be\ith = \lambda\ith \be\ith \Leftrightarrow
\mbox{``$\lambda\ith$ is an eigenvalue of $B$ with eigenvector $\be\ith$''}
\]
\[
|B| = \prod \mbox{eigenvalues} \;\;\;\;\;\;
\mbox{Trace}(B) = \sum \mbox{eigenvalues}
\]
If $B$ is real and symmetric (eg a covariance matrix) the eigenvectors are
orthogonal (perpendicular) and so form a basis (can be used as axes).
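
\bigskip \noindent
\marginpar{A worked example}A small made-up example ties several of these results
together. Take the symmetric $2\times 2$ matrix
\[
B = \left( \begin{array}{cc} 2 & 1 \\ 1 & 2 \end{array} \right), \;\;\;\;\;\;
B^{-1} = \frac{1}{3}\left( \begin{array}{cc} 2 & -1 \\ -1 & 2 \end{array} \right), \;\;\;\;\;\;
|B| = 2\times 2 - 1\times 1 = 3, \;\;\;\;\;\;
\mathrm{Tr}(B) = 2+2 = 4 .
\]
Its eigenvalues are $3$ and $1$, with eigenvectors proportional to
$\left( \begin{array}{c} 1 \\ 1 \end{array} \right)$ and
$\left( \begin{array}{c} 1 \\ -1 \end{array} \right)$.
Check: the determinant is the product of the eigenvalues ($3\times 1 \te 3$), the
trace is their sum ($3+1\te 4$), and the two eigenvectors are orthogonal, as
expected for a real symmetric matrix.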
\section{Differentiation}

{\small \em Any good A-level maths text book should cover this material and have
plenty of exercises. Undergraduate text books might cover it quickly in less than
a chapter.}

\bigskip \noindent
\marginpar{Gradient}The gradient of a straight line $y\!=\!mx\!+\!c$ is a constant
$y^\prime = \frac{y(x+\Delta x)-y(x)}{\Delta x}=m$.
%for any $x$ or $\Delta x$.

\bigskip \noindent
\marginpar{Differentiation}Many functions look like straight lines over a small
enough range. The gradient of this line, the derivative, is not constant, but a
new function:
\[
y^\prime(x) = \dd{y}{x} = \lim_{\Delta x\to 0} \frac{y(x\!+\!\Delta x)-y(x)}{\Delta x}\;,
\;\;\;\;\;
\parbox{3.2cm}{which could be\\differentiated again:}\;\;
y^{\prime\prime} = \ddd{y}{x} = \dd{y^\prime}{x}
\]

\bigskip \noindent
\marginpar{Standard derivatives}The following results are well known ($c$ is a constant):
\[
\begin{array}{c}f(x):\\f^\prime(x):\end{array}
\;\;\;\;\;\;
\begin{array}{c}c\\0\end{array}
\;\;\;\;\;\;
\begin{array}{c}cx\\c\end{array}
\;\;\;\;\;\;
\begin{array}{c}cx^n\\cnx^{n-1}\end{array}
\;\;\;\;\;\;
\begin{array}{c}\log_e(x)\\1/x\end{array}
\;\;\;\;\;\;
\begin{array}{c}\exp(x)\\\exp(x)\end{array}
% They won't use these:
%\;\;\;\;\;\;
%\begin{array}{c}\sin(x)\\\cos(x)\end{array}
%\;\;\;\;\;\;
%\begin{array}{c}\cos(x)\\-\sin(x)\end{array}
%\;\;\;\;\;\;
\;\;\; .
\]

\bigskip \noindent
\marginpar{Optimisation}At a maximum or minimum the function is rising on one side
and falling on the other. In between the gradient must be zero. Therefore
\[
\mbox{maxima and minima satisfy:~} \dd{f(x)}{x}=0
\;\;\;\;\;\; \mbox{or} \;\;\;\;\;\;
\dd{f(\bx)}{\bx}={\mathbf 0} \; \Leftrightarrow \; \dd{f(\bx)}{x_i}=0\;\;\;\forall i
\]
If we can't solve this we can evolve our variable $x$, or variables $\bx$, on a
computer using gradient information until we find a place where the gradient is zero.
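
\bigskip \noindent
For example, with the (arbitrarily chosen) function $f(x) = x^2 - 6x + 1$, setting
\[
\dd{f(x)}{x} = 2x - 6 = 0 \;\;\;\mbox{gives}\;\;\; x = 3 ,
\]
which is a minimum: the gradient is negative just below $x\te 3$ and positive just
above it (equivalently, $f^{\prime\prime}(x) = 2 > 0$).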
\bigskip \noindent
\marginpar{Approximation}A function may be approximated by a straight
line\footnote{More accurate approximations can be made. Look up Taylor series.}
about any point $a$.
\[
f(a+x) \approx f(a) + x f^\prime(a)\;, \;\;\;\;\;\;
\mbox{~eg:~}
%\;\;\;\;\;\;
\log(1+x) \approx \log(1+0) + x\frac{1}{1+0} = x
\]

\bigskip \noindent
\marginpar{Linearity}The derivative operator is linear:
\[
\dd{(f(x)+g(x))}{x} = \dd{f(x)}{x} + \dd{g(x)}{x}\;, \;\;\;\;\;\;
\mbox{~eg:~}\;
\dd{\left(x+\exp(x)\right)}{x} = 1 + \exp(x).
\]

\bigskip \noindent
\marginpar{Product Rule}Dealing with products is slightly more involved:
\[
\dd{\left(u(x)v(x)\right)}{x} = v\dd{u}{x} + u\dd{v}{x}\;, \;\;\;\;\;
\mbox{~eg:~}\;
\dd{\left(x\cdot\exp(x)\right)}{x} = \exp(x) + x\exp(x).
\]

\bigskip \noindent
\marginpar{Chain Rule}The ``chain rule''
$ \displaystyle \dd{f(u)}{x} = \dd{u}{x}\dd{f(u)}{u}, $
allows results to be combined.
\[
\begin{split}
\mbox{For example:~} \dd{\exp\left(ay^m\right)}{y}
  &= \dd{\left(ay^m\right)}{y} \cdot \dd{\exp\left(ay^m\right)}{\left(ay^m\right)}
     \;\;\; \mbox{``with~$u=ay^m$''}\\
  &= amy^{m-1} \cdot \exp\left(ay^m\right)
\end{split}
\]

\marginpar{\em Exercise}If you can't show the following you could do with some practice:
\[
\dd{}{z} \left[ \frac{1}{(b+cz)} \exp(az) + e \right]
  = \exp(az) \left(\frac{a}{b+cz}-\frac{c}{(b+cz)^2}\right)
\]
Note that $a$, $b$, $c$ and $e$ are constants, that $\frac{1}{u}=u^{-1}$, and that
this will be hard if you have not done any differentiation for a long time. Again,
get a text book.

\end{document}