\documentclass[12pt]{article}
\usepackage{geometry} % give some flexibility to format
\geometry{verbose,tmargin=0.75in,bmargin=0.75in,lmargin=1in,rmargin=1in} % needs the package geometry to work
\usepackage{amsmath}
\newcommand{\variance}{\sigma^2}
\newcommand{\pderiv}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\lmaInvCov}{{\boldsymbol \alpha}(\vec{p}, \lambda)}
\newcommand{\chisqd}[1]{\frac{\partial \chi^2}{\partial p_{#1}}} % single derivative
\newcommand{\chisqdd}[2]{\frac{\partial^2 \chi^2}{\partial p_{#1} \partial p_{#2}}} % double derivative

\begin{document}

\title{Data Reduction Notes}
\author{Tobias Marriage}
\date{Feb 2011}
\maketitle

\section{Errors}

\subsection{Errors on Raw Data}

\subsubsection{Counts}

In some experiments, the data are counts. For instance, in the classical Rutherford experiment, alpha particles were counted. When studying nuclear processes, product gamma rays may be counted. When studying galaxy populations, you may bin (count) galaxies in brightness intervals. In these cases, the data follow Poisson statistics: for a given count $N$, $\variance_N = N$.

\section{Linear Modeling}

Let $\vec{d}$ be an $n$-long vector of data that we model with a linear model $M\vec{p}$, where $\vec{p}$ is an $m$-long vector of parameters and $M$ is an $n \times m$ projection matrix which projects the parameters $\vec{p}$ into the data space. Furthermore, let the noise covariance matrix $N$ be an $n \times n$ diagonal matrix with the data's variances $\vec{\sigma^2}$ on the diagonal:
\begin{equation}
N = \left[\begin{matrix} \sigma^2_1 & & 0 \\ & \ddots & \\ 0 & & \sigma^2_n \end{matrix}\right].
\label{eq:covmatrix}
\end{equation}
The goodness of fit for this model is evaluated using the chi-squared statistic:
\begin{equation}
\chi^2(\vec{p}) = (\vec{d} - M\vec{p})^T N^{-1} (\vec{d} - M\vec{p}) = \sum_i \frac{(d_i - [M\vec{p}]_i)^2}{\sigma_i^2}
\label{eq:chi-sq-linear}
\end{equation}
The quantity $\vec{d} - M\vec{p}$ is called the ``residual''. The chi-squared statistic is the sum of the squared residuals, each weighted by the corresponding variance. As the sum in Eq \ref{eq:chi-sq-linear} suggests, for properly estimated errors $\sigma_i^2$, a good fit minimizes chi-squared and has a value close to the so-called number of ``degrees of freedom'', $n - m$ (approximately $n$ when there are many more data than parameters). In order to minimize the chi-squared with respect to the linear model parameters, we simply set the derivative of Eq \ref{eq:chi-sq-linear} with respect to these parameters to zero:
\begin{eqnarray}
\frac{d \chi^2}{d \vec{p}} & = & -2 M^T N^{-1} (\vec{d} - M\vec{p}) = 0 \nonumber \\
\vec{p} &=& (M^T N^{-1} M)^{-1} M^T N^{-1} \vec{d}
\label{eq:linearMLSolution}
\end{eqnarray}
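For concreteness, here is a short numerical sketch of Eq \ref{eq:linearMLSolution} in Python with NumPy. The straight-line model, the synthetic data, and all variable names are illustrative choices, not part of the derivation above.
\begin{verbatim}
import numpy as np

# Synthetic data: a line with offset 1 and slope 2, plus Gaussian noise
x = np.linspace(0.0, 10.0, 50)
sigma = 0.5 * np.ones_like(x)              # per-point errors sigma_i
d = 1.0 + 2.0 * x + sigma * np.random.randn(x.size)

# Projection matrix M: one column per model basis function (n x m)
M = np.column_stack([np.ones_like(x), x])

# For a diagonal noise covariance, N^{-1} is just diag(1/sigma_i^2)
Ninv = np.diag(1.0 / sigma**2)

# Eq (linearMLSolution): p = (M^T N^-1 M)^-1 M^T N^-1 d
p = np.linalg.solve(M.T @ Ninv @ M, M.T @ Ninv @ d)

resid = d - M @ p
chisq = np.sum((resid / sigma)**2)
print("best-fit parameters:", p, " chi-squared:", chisq)
\end{verbatim}
Solving the normal equations with \texttt{np.linalg.solve} is equivalent to forming the inverse explicitly, but is numerically better behaved.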
\section{Non-linear Modeling}

To interpret a dataset with measurements $\{d_i\}$, we have a model which, given $m$ model parameters $\vec{p}$, gives corresponding values $\{m_i(\vec{p})\}$. The chi-squared for this model is
\begin{equation}
\chi^2(\vec{p}) = \sum_i \frac{[d_i - m_i(\vec{p})]^2}{\sigma_i^2},
\label{eq:chi-sq}
\end{equation}
where $\sigma_i^2$ is the estimate for the variance on $d_i$. The errors on $d_i$ are assumed uncorrelated. For a linear model, the chi-squared function is exactly quadratic in the parameters, so we can write
\begin{equation}
\chi^2(\vec{p}) = \chi^2(\vec{p}_0) + \frac{d\chi^2}{d\vec{p}}(\vec{p}_0) (\vec{p}-\vec{p}_0)+ \frac{1}{2}(\vec{p}-\vec{p}_0)^T\frac{d^2\chi^2}{d^2\vec{p}}(\vec{p_0}) (\vec{p}-\vec{p}_0),
\end{equation}
where $\vec{p}_0$ is an arbitrary starting point in parameter space. The minimizing ``jump'' in parameter space, $\delta\vec{p} = (\vec{p}_{min}-\vec{p}_0)$, is given by
\begin{eqnarray}
0 & = & \frac{d\chi^2}{d\vec{p}}(\vec{p_0}) + \frac{d^2\chi^2}{d^2\vec{p}}(\vec{p_0})\delta\vec{p} \nonumber \\
\delta\vec{p} & = & -\left(\frac{d^2\chi^2}{d^2\vec{p}}(\vec{p_0})\right)^{-1} \frac{d\chi^2}{d\vec{p}}(\vec{p_0})
\label{eq:quad_decent}
\end{eqnarray}
Note that in general the first derivative of chi-squared is a vector (the gradient), and the second derivative is a matrix (the Hessian). Many times, however, the chi-squared function is not purely quadratic. This is the case for non-linear models, and the prescription above for a purely quadratic function can then fail. Instead we must follow our nose, potentially taking many steps around parameter space to find the minimum of the chi-squared. One easy prescription is to follow the gradient of chi-squared downhill. The step in parameter space is then
\begin{equation}
\delta\vec{p} = - \vec{C} \frac{d\chi^2}{d\vec{p}}(\vec{p_0}),
\label{eq:grad_decent}
\end{equation}
where $\vec{C}$ is a constant vector of length $m$ (the same length as $\vec{p}$) applied element-wise to the gradient. In general, we can use both step formulae (Eq~\ref{eq:quad_decent} and Eq~\ref{eq:grad_decent}) to explore the parameter space in search of the best fit values.

\subsection{Gradient and Hessian}

The next step towards an algorithm for non-linear model fitting is constructing the gradient vector and Hessian matrix of the chi-squared function (Eq~\ref{eq:chi-sq}). The gradient is straightforward. The $j^{th}$ element of this vector is
\begin{equation}
\chisqd{j}(\vec{p_0}) = - 2 \sum_i \frac{[d_i - m_i(\vec{p}_0)]}{\sigma_i^2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right).
\label{eq:gradient}
\end{equation}
And the $(j,k)^{th}$ element of the Hessian is
\begin{eqnarray}
\chisqdd{j}{k}(\vec{p_0}) & = & 2 \sum_{i} \left[ \sigma_i^{-2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right) \left( \frac{\partial m_i}{\partial p_k}(\vec{p}_0) \right) - \frac{[d_i - m_i(\vec{p}_0)]}{\sigma_i^2} \left( \frac{\partial^2 m_i}{\partial p_j\partial p_k}(\vec{p}_0) \right) \right] \nonumber \\
& \approx & 2 \sum_{i} \sigma_i^{-2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right) \left( \frac{\partial m_i}{\partial p_k}(\vec{p}_0) \right).
\label{eq:hessian}
\end{eqnarray}
The second term in the sum on the first line tends to cancel out for a good model near the chi-squared minimum: the quantity $d_i - m_i(\vec{p}_0)$ will be randomly positive and negative across $d_i$ such that its sum over $i$ should be small. (Why isn't this the case for Eq \ref{eq:gradient}? My hypothesis is that both of these terms are small near the minimum and it is the first term in the Hessian which drives the final steps of the search.) The second term is also supposedly a numerical nuisance, so we drop it for the effective expression of the Hessian.
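As a concrete (and purely illustrative) example, here is a Python/NumPy sketch of Eq \ref{eq:gradient} and the approximate form of Eq \ref{eq:hessian} for a toy exponential model $m_i(\vec{p}) = p_1 e^{-p_2 x_i}$. The model, its analytic partial derivatives, and the function names are assumptions made for the example only.
\begin{verbatim}
import numpy as np

def model(p, x):
    # m_i(p) = p[0] * exp(-p[1] * x_i)
    return p[0] * np.exp(-p[1] * x)

def model_partials(p, x):
    # Columns are dm_i/dp_j for this toy model; shape (n, m)
    dm_dp0 = np.exp(-p[1] * x)
    dm_dp1 = -p[0] * x * np.exp(-p[1] * x)
    return np.column_stack([dm_dp0, dm_dp1])

def chisq_grad_hess(p, x, d, sigma):
    # Gradient (Eq gradient) and approximate Hessian (Eq hessian)
    r = d - model(p, x)                  # residuals d_i - m_i(p)
    J = model_partials(p, x)
    w = 1.0 / sigma**2                   # 1/sigma_i^2
    grad = -2.0 * J.T @ (w * r)          # dchi^2/dp_j
    hess = 2.0 * J.T @ (J * w[:, None])  # 2 sum_i (dm/dp_j)(dm/dp_k)/sigma_i^2
    return grad, hess
\end{verbatim}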
\subsection{Levenberg-Marquardt}

The final step towards solving a non-linear least squares system is providing a prescription for stepping through parameter space and in particular choosing the constant in Eq \ref{eq:grad_decent}. The Levenberg-Marquardt approach is the classic way to do this \cite{levenberg:1944}. In this approach the constant for gradient descent is chosen to be proportional to the inverse of the second derivative with respect to the parameter in question. Following this prescription, Eq \ref{eq:quad_decent} and Eq \ref{eq:grad_decent} can be written
\begin{eqnarray}
-\left[\begin{matrix} \chisqdd{1}{1} & \ldots & \chisqdd{1}{m} \\ \vdots & \ddots & \vdots\\ \chisqdd{m}{1} & \ldots & \chisqdd{m}{m} \end{matrix}\right] \left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_m\end{matrix} \right] & = & \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{m}\end{matrix} \right], \label{eq:lma0}\\
-\lambda \left[\begin{matrix} \chisqdd{1}{1} & & 0 \\ & \ddots & \\ 0 & & \chisqdd{m}{m} \end{matrix}\right] \left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_m\end{matrix} \right] & = & \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{m}\end{matrix} \right], \label{eq:lma1}
\end{eqnarray}
where $\lambda$ is a constant. Eq \ref{eq:lma0} and Eq \ref{eq:lma1} specify, respectively, the quadratic (Eq \ref{eq:quad_decent}) and gradient (Eq \ref{eq:grad_decent}) contributions to $\delta\vec{p}$. These equations can be combined into a single operation on the gradient, which recovers Eq \ref{eq:lma0} as $\lambda \rightarrow 0$ and Eq \ref{eq:lma1} for large $\lambda$:
\begin{equation}
\left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_m\end{matrix} \right] = -\left( \left[\begin{matrix} \chisqdd{1}{1} (1+\lambda) & \ldots & \chisqdd{1}{m} \\ \vdots & \ddots & \vdots\\ \chisqdd{m}{1} & \ldots & \chisqdd{m}{m} (1+\lambda) \end{matrix}\right] \right)^{-1} \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{m}\end{matrix} \right].
\end{equation}
Setting the matrix on the right hand side to $\lmaInvCov$, the expression simplifies to
\begin{equation}
\delta \vec{p} = -\lmaInvCov^{-1} \bigtriangledown_p \chi^2.
\label{eq:lma3}
\end{equation}
As a prescription for choosing $\lambda$ and stepping around parameter space, here is a suggestion from the ever useful Numerical Recipes books \cite{press:1992}:
\begin{enumerate}
\item Solve Eq \ref{eq:lma3} for $\delta\vec{p}$.
\item If $\chi^2(\vec{p}+\delta\vec{p}) > \chi^2(\vec{p})$, do \emph{not} set $\vec{p} = \vec{p}+\delta\vec{p}$ (reject the step) and increase $\lambda$ by a factor of 10. Return to step 1.
\item If $\chi^2(\vec{p}+\delta\vec{p}) < \chi^2(\vec{p})$, set $\vec{p} = \vec{p}+\delta\vec{p}$ (accept the step) and decrease $\lambda$ by a factor of 10. Return to step 1.
\end{enumerate}
You stop iterating this sequence when the decrease in chi-squared on an accepted step, $\chi^2(\vec{p}) - \chi^2(\vec{p}+\delta\vec{p})$, is significantly less than one.
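Putting the pieces together, the following Python/NumPy sketch shows one way the $\lambda$ bookkeeping above might be implemented; it reuses the toy \texttt{model} and \texttt{chisq\_grad\_hess} functions from the previous sketch. The starting $\lambda$, the iteration cap, and the convergence tolerance (a stand-in for ``significantly less than one'') are arbitrary illustrative choices, and no safeguards (singular matrices, parameter bounds) are included.
\begin{verbatim}
def chisq(p, x, d, sigma):
    r = (d - model(p, x)) / sigma
    return np.sum(r * r)

def levenberg_marquardt(p0, x, d, sigma, lam=1e-3, max_iter=100, tol=1e-3):
    p = np.array(p0, dtype=float)
    current = chisq(p, x, d, sigma)
    for _ in range(max_iter):
        grad, hess = chisq_grad_hess(p, x, d, sigma)
        # Inflate the diagonal by (1 + lambda) and solve Eq (lma3)
        a = hess + lam * np.diag(np.diag(hess))
        dp = -np.linalg.solve(a, grad)
        trial = chisq(p + dp, x, d, sigma)
        if trial > current:
            lam *= 10.0          # reject the step; lean toward gradient descent
            continue
        converged = (current - trial) < tol
        p, current = p + dp, trial
        lam /= 10.0              # accept the step; lean toward the quadratic step
        if converged:
            break                # decrease in chi-squared is negligible
    return p, current

# Example usage with synthetic data drawn from the toy exponential model
x = np.linspace(0.0, 5.0, 40)
sigma = 0.05 * np.ones_like(x)
d = model([2.0, 0.7], x) + sigma * np.random.randn(x.size)
p_fit, chisq_min = levenberg_marquardt([1.0, 1.0], x, d, sigma)
print("best-fit parameters:", p_fit, " chi-squared:", chisq_min)
\end{verbatim}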
\subsection{Error Propagation}

Let the variance on a dataset $\{d_i\}$ ($n$ long) be given by $\{\variance_i\}$. The variance of a function of the data $p(\{d_i\})$ is given by
\begin{equation}
\variance_p = \sum_i \left| \pderiv{p}{d_i} \right|^2 \variance_i
\label{eq:errorprop}
\end{equation}
A corollary of this is that, for functions linear in $\{d_i\}$, the variance of the function is the quadrature sum of the coefficient-weighted variances of the data (for simple products and quotients the analogous statement holds for the fractional variance, $\variance_p/p^2$). This corollary does not hold for general non-linear functions, for which Eq \ref{eq:errorprop} is itself only a first-order approximation. Eq \ref{eq:errorprop} can be written in vector form as
\begin{equation}
\variance_p = \vec{\bigtriangledown}_d p \cdot N \cdot \vec{\bigtriangledown}_d p,
\label{eq:errorpropvec}
\end{equation}
where $N$ is given by Eq \ref{eq:covmatrix}. More generally we can define the covariance matrix of a set of $m$ dependent variables (e.g., model parameters) as
\begin{equation}
C = \vec{\bigtriangledown}_d \vec{p} \cdot N \cdot \vec{\bigtriangledown}_d \vec{p}
\end{equation}
where $\vec{\bigtriangledown}_d \vec{p}$ has dimensions of ($n \times m$). Referring to Eq \ref{eq:linearMLSolution} for an expression for $\vec{p}$, the covariance matrix takes the form
\begin{equation}
C = (M^T N^{-1} M)^{-1}.
\end{equation}
This expression can be directly related to the Hessian of the chi-squared in Eq \ref{eq:chi-sq-linear}:
\begin{equation}
\frac{d^2\chi^2}{d^2\vec{p}} = 2 M^T N^{-1} M \equiv 2 \alpha.
\label{eq:alphadef}
\end{equation}
Thus half the Hessian (defined here as $\alpha$) is the inverse of the covariance matrix. For a non-linear model which is well approximated by linear terms near the minimum of the chi-squared function, the inverse of the $\alpha$ matrix is the formal covariance. This quantity can be calculated at the end of a non-linear model fit in order to give errors -- it is often returned by algorithms based on the Levenberg-Marquardt chi-squared minimization.

Generally, near the minimum the chi-squared function can be approximated by a second order function:
\begin{equation}
\Delta \chi^2 = \delta \vec{p} \cdot \alpha \cdot \delta \vec{p}
\label{eq:chi-sq-min}
\end{equation}
where we have used the definition of the matrix $\alpha$ from Eq \ref{eq:alphadef}, $\Delta\chi^2 = \chi^2 - \chi^2_{min}$ and $\delta \vec{p} = \vec{p} - \vec{p}_{min}$. In order to find the uncertainty $\delta p_1$ in a parameter $p_1$, we allow this parameter to take an arbitrary value and minimize over the other parameters. This latter condition implies that the gradient of $\chi^2$ will be zero along all directions besides that corresponding to $\delta p_1$. Dropping an overall factor of 2 (which can be absorbed into the constant $c$),
\begin{equation}
\frac{d \Delta \chi^2}{d \vec{p}} = \alpha \cdot \delta \vec{p} = \left[ \begin{matrix} c \\ \vdots \\ 0 \end{matrix}\right]
\label{eq:minimization-condition}
\end{equation}
Furthermore, for a single degree of freedom ($\delta p_1$), the change in chi-squared (Eq \ref{eq:chi-sq-min}) corresponding to the 68\% confidence interval is $\Delta \chi^2 = \delta \vec{p} \cdot \alpha \cdot \delta \vec{p} = 1$. Inverting Eq \ref{eq:minimization-condition} gives $\delta\vec{p} = C\,[c, 0, \ldots, 0]^T$, so that $\Delta\chi^2 = c\,\delta p_1 = c^2 C_{11}$; setting this to 1 constrains the value of $c$ such that
\begin{equation}
\delta \vec{p}(68\%) = C \left[ \begin{matrix} 1/ \sqrt{C_{11}} \\ \vdots \\ 0 \end{matrix}\right],
\end{equation}
where we have used the identity $C = \alpha^{-1}$. It then follows that the standard deviation of parameter $p_1$ is $\sqrt{C_{11}}$.

More generally, one can plot the confidence contours of multiple parameters. Given the constraints of the printed page, two-parameter error ellipses are most frequently used and provide insight into degeneracies among parameters. For $\nu$ parameters, one constructs a $\nu \times \nu$ sub-covariance matrix by extracting the intersections of the rows and columns associated with the parameters of interest. Then one plots chi-squared contours corresponding to the appropriate confidence intervals. See Table \ref{tab:chi-sq-conf}.

\begin{table}
\centering
\caption{Chi-squared values corresponding to confidence levels (C.L.) for various numbers of degrees of freedom (dof).}
\begin{tabular}{|c|c|c|c|}
\hline
C.L. & 1 dof & 2 dof & 3 dof \\
\hline \hline
68\% & 1.00 & 2.30 & 3.53 \\
95.4\% & 4.00 & 6.17 & 8.02 \\
\hline
\end{tabular}
\label{tab:chi-sq-conf}
\end{table}

\begin{thebibliography}{99}
\bibitem{levenberg:1944} Kenneth Levenberg. ``A Method for the Solution of Certain Non-Linear Problems in Least Squares''. \emph{The Quarterly of Applied Mathematics} 2: 164--168 (1944).
\bibitem{press:1992} William Press et al. {\it Numerical Recipes in C}. Cambridge: Cambridge University Press, 1992.
\end{thebibliography}

\end{document}