\documentclass[12pt]{article}
\usepackage{geometry} % give some flexibility to format
\geometry{verbose,tmargin=1in,bmargin=1in,lmargin=1in,rmargin=1in} % needs the geometry package to work
\usepackage{amsmath}
\usepackage{graphicx}
\newcommand{\variance}{\sigma^2}
\newcommand{\pderiv}[2]{\frac{\partial #1}{\partial #2}}
\newcommand{\lmaInvCov}{{\boldsymbol \alpha}(\vec{p}, \lambda)}
\newcommand{\chisqd}[1]{\frac{\partial \chi^2}{\partial p_{#1}}} % single derivative
\newcommand{\chisqdd}[2]{\frac{\partial^2 \chi^2}{\partial p_{#1} \partial p_{#2}}} % double derivative
\usepackage{fancyhdr}
\pagestyle{fancy}
\lhead{\textsc{Data Reduction Notes}}
\begin{document}
\title{Data Reduction Notes}
\author{Tobias Marriage}
\date{Feb 2011}
\maketitle

\section{Probability Distributions}
Generically, a probability distribution corresponds to a function $p(x)$ which defines the likelihood for a random variable $x$. {\bf The probability of the random variable falling in a small interval $dx$ centered on the value $x_0$ is $p(x_0)\,dx$.} The distribution describes the \emph{probability density} which, when integrated, yields the probability of $x$ falling between any two values:
\begin{equation}
P( x \in [x_0, x_1] ) = \int_{x_0}^{x_1} p(x) dx. \label{eq:prob_int}
\end{equation}
Furthermore, the distribution is normalized such that
\begin{equation}
P( x \in [-\infty,\infty] ) = \int_{-\infty}^{\infty} p(x) dx = 1.
\end{equation}
The {\bf mean $\mu$} of the distribution is defined as its first moment $\langle x \rangle$,
\begin{equation}
\mu \equiv \int_{-\infty}^{\infty} x p(x) dx, \label{eq:mean_def}
\end{equation}
and the {\bf variance $\sigma^2$} of the distribution is defined as its second central moment $\langle (x-\mu)^2 \rangle$,
\begin{equation}
\sigma^2 \equiv \int_{-\infty}^{\infty} (x-\mu)^2 p(x) dx. \label{eq:variance_def}
\end{equation}
Probability distributions of course also have third moments (related to the skew) and fourth moments (related to the kurtosis), which are useful but do not enter the everyday life of the experimenter. The mean describes the center of the distribution while the variance describes the width of the distribution. Another important quantity is the {\bf standard deviation $\sigma$}, which is the square root of the variance.

\subsection{Gaussian Distribution}
The Gaussian (or normal) distribution is the most used distribution in experimental science. The Gaussian is fully described by a mean $\mu$ and a variance $\sigma^2$. The functional form of the Gaussian distribution is
\begin{equation}
p(x;\mu,\sigma) = \frac{1}{\sigma \sqrt{2\pi}} \exp \left[ -\frac{1}{2} \frac{(x-\mu)^2}{\sigma^2} \right]. \label{eq:gaussian}
\end{equation}

\subsubsection{Standard Gaussian Distribution}
By redefining variables $z = (x-\mu)/\sigma$ in Eq \ref{eq:gaussian}, we obtain the Standard Gaussian Distribution ($\mu = 0$, $\sigma=1$):
\begin{equation}
p(z) = \frac{1}{\sqrt{2\pi}} \exp \left( -\frac{z^2}{2} \right). \label{eq:standard_gaussian}
\end{equation}
The ability to obtain a Standard Gaussian by a simple linear transformation of the random variable has a significant implication for simulations (see Section \ref{sec:montecarlo}).
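As a quick numerical check of the definitions above, the following Python sketch (a minimal illustration, assuming only NumPy; the example values of $\mu$ and $\sigma$ and the grid limits are arbitrary choices) evaluates Eq \ref{eq:gaussian} on a grid and integrates it to recover the normalization, mean, and variance.
\begin{verbatim}
import numpy as np

# Evaluate the Gaussian density (Eq. gaussian) on a fine grid and check that
# its integral, mean, and variance match the defining moments numerically.
mu, sigma = 2.0, 0.5                      # example values, chosen arbitrarily
x, dx = np.linspace(mu - 10*sigma, mu + 10*sigma, 20001, retstep=True)
p = np.exp(-0.5*((x - mu)/sigma)**2) / (sigma*np.sqrt(2.0*np.pi))

norm = np.sum(p) * dx                     # normalization integral, ~1
mean = np.sum(x*p) * dx                   # first moment, ~mu
var = np.sum((x - mean)**2 * p) * dx      # second central moment, ~sigma^2
print(norm, mean, var)
\end{verbatim}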
\section{Errors}
\subsection{Errors on Raw Data}
\subsubsection{Counts}
In some experiments, the data are counts. For instance, in the classic Rutherford experiment, alpha particles were counted. When studying nuclear processes, product gamma rays may be counted. When studying galaxy populations, you may bin (count) galaxies in brightness intervals. In these cases, the data follow Poisson statistics: for a given count $N$, $\variance_N = N$.

\section{Linear Modeling}
Let $\vec{d}$ be an $n$-long vector of data that we model with a linear model $M\vec{p}$, where $\vec{p}$ is an $m$-long vector of parameters and $M$ is an $n \times m$ projection matrix which projects the parameters $\vec{p}$ into the data space. Furthermore, let the noise covariance matrix $N$ be an $n \times n$ diagonal matrix with the data's variances $\vec{\sigma^2}$ on the diagonal:
\begin{equation}
N = \left[\begin{matrix} \sigma^2_1 & & 0 \\ & \ddots & \\ 0 & & \sigma^2_n \end{matrix}\right]. \label{eq:covmatrix}
\end{equation}
The goodness of fit for this model is evaluated using the chi-squared statistic:
\begin{equation}
\chi^2(\vec{p}) = (\vec{d} - M\vec{p})^T N^{-1} (\vec{d} - M\vec{p}) = \sum_i \frac{(d_i - [M\vec{p}]_i)^2}{\sigma_i^2} \label{eq:chi-sq-linear}
\end{equation}
The quantity $\vec{d} - M\vec{p}$ is called the ``residual''. The chi-squared statistic is the sum of the squared residuals, each weighted by the inverse of its variance. As the sum in Eq \ref{eq:chi-sq-linear} suggests, for properly estimated errors $\sigma_i^2$, a good fit minimizes chi-squared with a minimum value close to $n - m$, the so-called number of ``degrees of freedom''. In order to minimize the chi-squared with respect to the linear model parameters, we simply set the derivative of Eq \ref{eq:chi-sq-linear} with respect to these parameters to zero:
\begin{eqnarray}
\frac{d \chi^2}{d \vec{p}} & = & -2 M^T N^{-1} (\vec{d} - M\vec{p}) = 0 \nonumber \\
\vec{p} &=& (M^T N^{-1} M)^{-1} M^T N^{-1} \vec{d} \label{eq:linearMLSolution}
\end{eqnarray}
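As a concrete illustration of Eq \ref{eq:linearMLSolution}, the Python sketch below fits a straight line ($m = 2$ parameters) to simulated data. The design matrix, noise level, random seed, and use of NumPy are assumptions made only for this example.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Simulated data: d = 1 + 2*x with Gaussian noise of known sigma.
n = 50
x = np.linspace(0.0, 10.0, n)
sigma = 0.5 * np.ones(n)
d = 1.0 + 2.0*x + rng.normal(0.0, sigma)

# Projection (design) matrix M (n x m) and diagonal noise covariance N.
M = np.column_stack([np.ones(n), x])     # columns: constant term, slope term
Ninv = np.diag(1.0/sigma**2)             # N^{-1}

# Eq. linearMLSolution: p = (M^T N^-1 M)^-1 M^T N^-1 d
alpha = M.T @ Ninv @ M
p = np.linalg.solve(alpha, M.T @ Ninv @ d)

chi2 = (d - M @ p) @ Ninv @ (d - M @ p)  # Eq. chi-sq-linear
print(p, chi2, n - len(p))               # best fit, chi^2, degrees of freedom
\end{verbatim}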
\section{Non-linear Modeling}
To interpret a dataset with measurements $\{d_i\}$, we have a model which, given $n$ model parameters $\vec{p}$, gives corresponding values $\{m_i(\vec{p})\}$. The chi-squared for this model is
\begin{equation}
\chi^2(\vec{p}) = \sum_i \frac{[d_i - m_i(\vec{p})]^2}{\sigma_i^2}, \label{eq:chi-sq}
\end{equation}
where $\sigma_i^2$ is the estimate for the variance on $d_i$. The errors on $d_i$ are assumed uncorrelated. For a linear model, the chi-squared function is exactly quadratic in the parameters, so we can write
\begin{equation}
\chi^2(\vec{p}) = \chi^2(\vec{p}_0) + \frac{d\chi^2}{d\vec{p}}(\vec{p}_0) (\vec{p}-\vec{p}_0)+ \frac{1}{2}(\vec{p}-\vec{p}_0)^T\frac{d^2\chi^2}{d\vec{p}^2}(\vec{p_0}) (\vec{p}-\vec{p}_0),
\end{equation}
where $\vec{p}_0$ is an arbitrary starting point in parameter space. The minimizing ``jump'' in parameter space, $\delta\vec{p} = (\vec{p_{min}}-\vec{p_0})$, is given by
\begin{eqnarray}
0 & = & \frac{d\chi^2}{d\vec{p}}(\vec{p_0}) + \frac{d^2\chi^2}{d\vec{p}^2}(\vec{p_0})\,\delta\vec{p} \nonumber \\
\delta\vec{p} & = & -\left(\frac{d^2\chi^2}{d\vec{p}^2}(\vec{p_0})\right)^{-1} \frac{d\chi^2}{d\vec{p}}(\vec{p_0}) \label{eq:quad_decent}
\end{eqnarray}
Note that in general the first derivative of chi-squared is a vector (the gradient), and the second derivative is a matrix (the Hessian). Many times, however, the chi-squared function is not purely quadratic. This is the case for non-linear models. In these cases, the above prescription for a purely quadratic function can fail, and we must follow our nose, potentially taking many steps through parameter space to find the minimum of the chi-squared. One easy prescription is to follow the gradient of chi-squared. The step in parameter space is then
\begin{equation}
\delta\vec{p} = - \vec{C} \frac{d\chi^2}{d\vec{p}}(\vec{p_0}), \label{eq:grad_decent}
\end{equation}
where $\vec{C}$ is a constant vector of length $n$ (same as $\vec{p}$) that scales each component of the gradient. In general, we can use both step formulae (Eq~\ref{eq:quad_decent} and Eq~\ref{eq:grad_decent}) to explore the parameter space in search of the best-fit values.

\subsection{Gradient and Hessian}
The next step towards an algorithm for non-linear model fitting is constructing the gradient vector and Hessian matrix of the chi-squared function (Eq~\ref{eq:chi-sq}). The gradient is straightforward. The $j^{th}$ element of this vector is
\begin{equation}
\chisqd{j}(\vec{p_0}) = - 2 \sum_i \frac{[d_i - m_i(\vec{p}_0)]}{\sigma_i^2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right). \label{eq:gradient}
\end{equation}
And the $(j,k)^{th}$ element of the Hessian is
\begin{eqnarray}
\chisqdd{j}{k}(\vec{p_0}) & = & 2 \sum_{i} \sigma_i^{-2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right) \left( \frac{\partial m_i}{\partial p_k}(\vec{p}_0) \right) - 2 \sum_{i} \frac{[d_i - m_i(\vec{p}_0)]}{\sigma_i^2} \left( \frac{\partial^2 m_i}{\partial p_j\partial p_k}(\vec{p}_0) \right) \nonumber \\
& \approx & 2 \sum_{i} \sigma_i^{-2} \left( \frac{\partial m_i}{\partial p_j}(\vec{p}_0) \right) \left( \frac{\partial m_i}{\partial p_k}(\vec{p}_0) \right). \label{eq:hessian}
\end{eqnarray}
The second term on the first line tends to cancel out for a good model near the chi-squared minimum: the quantity $d_i - m_i(\vec{p}_0)$ will be randomly positive and negative across the data such that its sum over $i$ should be small. (Why isn't this the case for Eq \ref{eq:gradient}? My hypothesis is that both of these terms are small near the minimum and it is the first term in the Hessian which drives the final steps of the search.) The second term can also be a numerical nuisance, so we drop it to obtain the effective expression for the Hessian on the second line.
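To make the bookkeeping concrete, here is a minimal Python sketch of Eqs \ref{eq:gradient} and \ref{eq:hessian} for a hypothetical two-parameter exponential-decay model; the model, its analytic partial derivatives, the function names, and the simulated data are assumptions made for illustration only.
\begin{verbatim}
import numpy as np

def model(t, p):
    """Example non-linear model m(t; p) = A * exp(-t/tau), p = (A, tau)."""
    A, tau = p
    return A * np.exp(-t/tau)

def model_partials(t, p):
    """Partial derivatives dm/dp_j, returned as an (n_data x n_param) array."""
    A, tau = p
    dm_dA = np.exp(-t/tau)
    dm_dtau = A * t / tau**2 * np.exp(-t/tau)
    return np.column_stack([dm_dA, dm_dtau])

def chi2_grad_hess(t, d, sigma, p):
    """Gradient (Eq. gradient) and approximate Hessian (Eq. hessian)."""
    r = d - model(t, p)            # residuals d_i - m_i(p)
    J = model_partials(t, p)       # dm_i/dp_j
    w = 1.0 / sigma**2
    grad = -2.0 * J.T @ (w * r)    # Eq. gradient
    hess = 2.0 * (J.T * w) @ J     # Eq. hessian, second term dropped
    return grad, hess

# Example usage with simulated decay data (hypothetical numbers):
t = np.linspace(0.0, 10.0, 200)
rng = np.random.default_rng(1)
sigma = 0.05 * np.ones_like(t)
d = model(t, (1.0, 2.2)) + rng.normal(0.0, sigma)
print(chi2_grad_hess(t, d, sigma, np.array([0.9, 2.0])))
\end{verbatim}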
\subsection{Levenberg-Marquardt}
The final step towards solving a non-linear least squares system is providing a prescription for stepping through parameter space and in particular choosing the constant in Eq \ref{eq:grad_decent}. The Levenberg-Marquardt approach is the classic way to do this \cite{levenberg:1944}. In this approach the constant for gradient descent is chosen to be proportional to the inverse of the second derivative of chi-squared with respect to the parameter in question. Following this prescription, Eq \ref{eq:quad_decent} and Eq \ref{eq:grad_decent} can be written
\begin{eqnarray}
-\left[\begin{matrix} \chisqdd{1}{1} & \ldots & \chisqdd{1}{n} \\ \vdots & \ddots & \vdots\\ \chisqdd{n}{1} & \ldots & \chisqdd{n}{n} \end{matrix}\right] \left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_n\end{matrix} \right] & = & \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{n}\end{matrix} \right], \label{eq:lma0}\\
-\lambda \left[\begin{matrix} \chisqdd{1}{1} & & 0 \\ & \ddots & \\ 0 & & \chisqdd{n}{n} \end{matrix}\right] \left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_n\end{matrix} \right] & = & \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{n}\end{matrix} \right], \label{eq:lma1}
\end{eqnarray}
where $\lambda$ is a dimensionless constant. Eq \ref{eq:lma0} and Eq \ref{eq:lma1} specify, respectively, the quadratic (Eq \ref{eq:quad_decent}) and gradient (Eq \ref{eq:grad_decent}) contributions to $\delta\vec{p}$. These equations can be combined into a single operation on the gradient, with $\lambda$ setting the relative weight of the two contributions:
\begin{equation}
\left[\begin{matrix} \delta p_1 \\ \vdots \\ \delta p_n\end{matrix} \right] = -\left( \left[\begin{matrix} \chisqdd{1}{1} (1+\lambda) & \ldots & \chisqdd{1}{n} \\ \vdots & \ddots & \vdots\\ \chisqdd{n}{1} & \ldots & \chisqdd{n}{n} (1+\lambda) \end{matrix}\right] \right)^{-1} \left[\begin{matrix} \chisqd{1} \\ \vdots \\ \chisqd{n}\end{matrix} \right].
\end{equation}
The off-diagonal elements come from the quadratic step alone, while the diagonal elements carry both contributions: for $\lambda \rightarrow 0$ we recover Eq \ref{eq:quad_decent}, and for large $\lambda$ the step reduces to gradient descent. Defining the matrix in parentheses as $\lmaInvCov$, the expression simplifies to
\begin{equation}
\delta \vec{p} = -\lmaInvCov^{-1} \bigtriangledown_p \chi^2. \label{eq:lma3}
\end{equation}
As a prescription for choosing $\lambda$ and stepping around parameter space, here is a suggestion from the ever useful Numerical Recipes books \cite{press:1992}:
\begin{enumerate}
\item Solve Eq \ref{eq:lma3} for $\delta\vec{p}$.
\item If $\chi^2(\vec{p}+\delta\vec{p}) > \chi^2(\vec{p})$, do \emph{not} set $\vec{p} = \vec{p}+\delta\vec{p}$ (reject the step) and increase $\lambda$ by a factor of 10. Return to start.
\item If $\chi^2(\vec{p}+\delta\vec{p}) < \chi^2(\vec{p})$, set $\vec{p} = \vec{p}+\delta\vec{p}$ and decrease $\lambda$ by a factor of 10. Return to start.
\end{enumerate}
You stop iterating this sequence when the decrease in chi-squared on an accepted step, $\chi^2(\vec{p}) - \chi^2(\vec{p}+\delta\vec{p})$, is significantly less than one.
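The prescription above maps onto a few lines of code. The following Python sketch of the $\lambda$ loop is a minimal illustration, not a production implementation: it assumes user-supplied \texttt{chi2} and \texttt{grad\_hess} callables (for example, built as in the previous sketch), and the starting $\lambda$, the factors of 10, and the stopping tolerance of $10^{-3}$ are choices following the prescription above.
\begin{verbatim}
import numpy as np

def levenberg_marquardt(p0, chi2, grad_hess, lam=1e-3, tol=1e-3, max_iter=100):
    """Minimal sketch of the lambda-adjustment loop (Eq. lma3).
    chi2(p) returns chi-squared; grad_hess(p) returns (gradient, Hessian)."""
    p = np.asarray(p0, dtype=float)
    c2 = chi2(p)
    for _ in range(max_iter):
        grad, hess = grad_hess(p)
        # alpha(p, lambda): Hessian with its diagonal scaled by (1 + lambda)
        a = hess + lam * np.diag(np.diag(hess))
        dp = -np.linalg.solve(a, grad)      # Eq. lma3
        c2_new = chi2(p + dp)
        if c2_new > c2:
            lam *= 10.0                     # reject step, more gradient-like
        else:
            drop = c2 - c2_new
            p, c2 = p + dp, c2_new
            lam /= 10.0                     # accept step, more Newton-like
            if drop < tol:                  # chi-squared barely improved
                break
    return p, c2, lam

# Example (hypothetical callables):
#   p_best, chi2_min, lam = levenberg_marquardt([0.9, 2.0], my_chi2, my_grad_hess)
\end{verbatim}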
\section{Error Propagation}
Let the variance on a dataset $\{d_i\}$ ($n$ long) be given by $\{\variance_i\}$. The variance of a function of the data $p(\{d_i\})$ is given by
\begin{equation}
\variance_p = \sum_i \left| \pderiv{p}{d_i} \right|^2 \variance_i \label{eq:errorprop}
\end{equation}
A corollary of this is that, for a function which is a sum or difference of the $d_i$, the variance of the function is the sum of the variances of the data, while for a function which is a product or quotient of the $d_i$, the fractional variance ($\variance_p/p^2$) of the function is the sum of the fractional variances of the data. Neither shortcut holds for a general non-linear function, for which Eq \ref{eq:errorprop} must be applied directly. Eq \ref{eq:errorprop} can be written in vector form as
\begin{equation}
\variance_p = \vec{\bigtriangledown}_d p \cdot N \cdot \vec{\bigtriangledown}_d p, \label{eq:errorpropvec}
\end{equation}
where $N$ is given by Eq \ref{eq:covmatrix}. More generally we can define the covariance matrix of a set of $m$ dependent variables (e.g., model parameters) as
\begin{equation}
C = (\vec{\bigtriangledown}_d \vec{p})^T N \, (\vec{\bigtriangledown}_d \vec{p}),
\end{equation}
where $\vec{\bigtriangledown}_d \vec{p}$ has dimensions of ($n \times m$). Referring to Eq \ref{eq:linearMLSolution} for an expression for $\vec{p}$, the covariance matrix takes the form
\begin{equation}
C = (M^T N^{-1} M)^{-1}.
\end{equation}
This expression can be directly related to the Hessian of the chi-squared in Eq \ref{eq:chi-sq-linear}:
\begin{equation}
\frac{d^2\chi^2}{d\vec{p}^2} = 2 M^T N^{-1} M \equiv 2 \alpha. \label{eq:alphadef}
\end{equation}
Thus half the Hessian (defined here as $\alpha$) is the inverse of the covariance matrix. For a non-linear model which is well approximated by linear terms near the minimum of the chi-squared function, the inverse of the $\alpha$ matrix is the formal covariance. This quantity can be calculated at the end of a non-linear model fit in order to give errors -- it is often returned by algorithms based on the Levenberg-Marquardt chi-squared minimization.

Generally, near the minimum the chi-squared function can be approximated by a second-order function:
\begin{equation}
\Delta \chi^2 = \delta \vec{p} \cdot \alpha \cdot \delta \vec{p} \label{eq:chi-sq-min}
\end{equation}
where we have used the definition of the matrix $\alpha$ from Eq \ref{eq:alphadef}, $\Delta\chi^2 = \chi^2 - \chi^2_{min}$, and $\delta \vec{p} = \vec{p} - \vec{p}_{min}$. In order to find the uncertainty in a parameter $p_1$, we allow $\delta p_1$ to take an arbitrary value and minimize over the other parameters. This latter condition implies that the gradient of $\chi^2$ will be zero along all directions besides that corresponding to $\delta p_1$:
\begin{equation}
\frac{1}{2}\frac{d \Delta \chi^2}{d \vec{p}} = \alpha \cdot \delta \vec{p} = \left[ \begin{matrix} c \\ \vdots \\ 0 \end{matrix}\right] \label{eq:minimization-condition}
\end{equation}
Furthermore, for a single degree of freedom ($\delta p_1$), the change in chi-squared (Eq \ref{eq:chi-sq-min}) corresponding to the 68\% confidence interval is $\Delta \chi^2 = \delta \vec{p} \cdot \alpha \cdot \delta \vec{p} = 1$. This condition constrains the value of $c$ in Eq \ref{eq:minimization-condition} such that
\begin{equation}
\delta \vec{p}(68\%) = C \left[ \begin{matrix} 1/ \sqrt{C_{11}} \\ \vdots \\ 0 \end{matrix}\right],
\end{equation}
where we have used the identity $C = \alpha^{-1}$. It then follows that the standard deviation of parameter $p_1$ is $\sqrt{C_{11}}$.

More generally, one can plot the confidence contours of multiple parameters. Given the constraints of the printed page, two-parameter error ellipses are most frequently used and provide insight into degeneracies among parameters. For $\nu$ parameters, one constructs a $\nu \times \nu$ sub-covariance matrix by extracting the intersections of the rows and columns associated with the parameters of interest. Then one plots chi-squared contours corresponding to the appropriate confidence intervals. See Table \ref{tab:chi-sq-conf}.
\begin{table}
\centering
\caption{$\Delta\chi^2$ values corresponding to confidence levels (C.L.) for various numbers of degrees of freedom (dof).}
\begin{tabular}{|c|c|c|c|}
\hline
C.L. & 1 dof & 2 dof & 3 dof \\ \hline \hline
68\% & 1.00 & 2.30 & 3.53 \\
95.4\% & 4.00 & 6.17 & 8.02 \\ \hline
\end{tabular}
\label{tab:chi-sq-conf}
\end{table}
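As a numerical illustration, the Python sketch below converts an $\alpha$ matrix into a covariance matrix, reads off the $1\sigma$ errors, and generates points on the two-parameter 68\% error ellipse ($\Delta\chi^2 = 2.30$ from Table \ref{tab:chi-sq-conf}). The particular $\alpha$ used here is made up for the example; in practice it would come from the fit (e.g., $M^T N^{-1} M$ in the linear case).
\begin{verbatim}
import numpy as np

# Made-up example alpha matrix (Eq. alphadef); normally taken from the fit.
alpha = np.array([[40.0, 10.0],
                  [10.0, 20.0]])

C = np.linalg.inv(alpha)              # covariance matrix, C = alpha^-1
errors = np.sqrt(np.diag(C))          # 1-sigma (68%) errors, sqrt(C_jj)

# Two-parameter 68% error ellipse: points with dp . alpha . dp = 2.30
theta = np.linspace(0.0, 2.0*np.pi, 200)
circle = np.vstack([np.cos(theta), np.sin(theta)])   # unit circle
L = np.linalg.cholesky(C)                            # C = L L^T
ellipse = np.sqrt(2.30) * (L @ circle)               # dp on the contour
print(errors)
\end{verbatim}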
\section{Monte Carlo Simulations}
\label{sec:montecarlo}
Monte Carlo simulations are simulations of experiments. Monte Carlo is a famous gambling venue, and the name must have been chosen because these simulations probe the probabilities associated with experiments.
\begin{itemize}
\item Random (or pseudorandom) data are generated based on either predictions/models or statistical properties (i.e., errors) derived from existing samples.
\item These simulated data are then reduced to derive model parameters.
\item This prescription is repeated many times to explore the space of model parameters allowed by the input errors.
\end{itemize}
Monte Carlos are useful for planning an experiment. For a given experimental configuration (e.g., integration time, distribution of measurements, etc.), we can use Monte Carlos to predict the resulting constraints on models. For instance, in the Muon Lifetime experiment one can use Monte Carlos to estimate how many decay events are needed to constrain the muon lifetime to the percent level. Or, for Rutherford scattering, one could use a Monte Carlo to decide how long to integrate at each angle to get acceptable errors on the differential cross section. In a similar way, Monte Carlos are also useful for propagating errors to models once the dataset is taken.

\subsection{Uniform Random Deviates}
\label{sec:urd}
It's impossible for computer programs to generate truly random sequences of numbers (random deviates) for a Monte Carlo. Instead, there are algorithms to generate {\bf pseudorandom numbers} given an arbitrary seed. The seed can be provided by the user or pulled from the lower-order digits of the computer clock. A standard method for creating a pseudorandom sequence of integers is the multiplicative congruential method, for which the $n^{th}$ random integer in the sequence is
\begin{equation}
x_n = (a x_{n-1}) \bmod{M},
\end{equation}
where $a$ and $M$ are carefully chosen integers and $x_0$ is the seed. The $\bmod{M}$ operation is the modulo with respect to $M$. This generates a sequence of at most $M-1$ numbers before it repeats. \emph{Numerical Recipes} suggests $a = 16807$ and $M = 2^{31} - 1$ \cite{press:1992}. These pseudorandom numbers can be normalized such that they fall in the unit interval simply by dividing by $M$:
\begin{equation}
x_n' = \frac{(a x_{n-1}) \bmod{M}}{M}. \label{eq:urd}
\end{equation}
The numbers then approximate (for large $M$) random variables drawn from a uniform distribution on the unit interval:
\begin{equation}
p(x) = \begin{cases} 1, & \text{if $x \in [0,1)$} \\ 0, & \text{if $x \notin [0,1)$} \end{cases}. \label{eq:uc}
\end{equation}
This distribution has mean $\mu = 0.5$ and variance $\sigma^2 = 1/12$ (see Eq \ref{eq:variance_def}). Our approximation is characterized by $\mu = 0.5$ and $\sigma^2 = 1/12 - 1/(6M)$.

\subsection{Gaussian Random Deviates}
\label{sec:grd}
\begin{figure}[ht]
\centering
\includegraphics[width=6in]{gaussian_deviate.png}
\caption{Gaussian deviates: the normalized histogram of 100,000 deviates generated as in Eq \ref{eq:gaussian_deviate} with $N=12$.}
\end{figure}
Our experimental errors are generally approximated by the Gaussian distribution (Eq \ref{eq:gaussian}). We therefore would like an algorithm that generates random deviates from this distribution. In particular it would be nice to have an algorithm which generates deviates from a Standard Gaussian distribution (Eq \ref{eq:standard_gaussian}). A deviate $z$ drawn from the standard distribution can be scaled to any Gaussian distribution by the transformation $x = \sigma z + \mu$. We are helped here by the {\bf central limit theorem}, which states that the sum of $N$ random samples from any distribution (with finite variance) tends to a Gaussian distribution for large $N$. Consider the uniform random deviates generated by the algorithm in Eq \ref{eq:urd} and drawn from the distribution in Eq \ref{eq:uc}. For $N$ uniform deviates thus defined, we can approximate a Gaussian deviate by
\begin{equation}
x_G = \sum_{i=1}^N x_i - N/2. \label{eq:gaussian_deviate}
\end{equation}
The distribution describing $x_G$ has mean $\mu = 0$ and variance $\sigma^2 = N/12$ (recall the uniform distribution has $\sigma^2 = 1/12$). A convenient choice to approximate a Standard Gaussian distribution is then $N = 12$. Of course there are packaged routines for computing deviates. For instance, the numerical Python package NumPy provides \texttt{numpy.random.uniform()} and \texttt{numpy.random.normal()}.
%\subsection{Bootstrap Monte Carlo}
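For reference, here is a minimal Python sketch of the two algorithms above (Eqs \ref{eq:urd} and \ref{eq:gaussian_deviate}); the seed, sample size, and function names are arbitrary choices for illustration, and in practice the packaged routines are preferable.
\begin{verbatim}
import numpy as np

# Multiplicative congruential generator (Eq. urd) with the constants
# quoted above from Numerical Recipes.
A, M = 16807, 2**31 - 1

def uniform_deviates(n, seed=12345):
    """Return n pseudorandom deviates on [0, 1) via x_n = (a*x_{n-1}) mod M."""
    out = np.empty(n)
    x = seed
    for i in range(n):
        x = (A * x) % M
        out[i] = x / M
    return out

def gaussian_deviates(n, seed=12345):
    """Approximate Standard Gaussian deviates: sum of N=12 uniforms minus
    N/2 = 6 (Eq. gaussian_deviate), giving mean 0 and variance 12/12 = 1."""
    u = uniform_deviates(12 * n, seed).reshape(n, 12)
    return u.sum(axis=1) - 6.0

z = gaussian_deviates(100000)
print(z.mean(), z.std())                     # should be close to 0 and 1
print(np.random.normal(size=100000).std())   # packaged routine for comparison
\end{verbatim}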
\begin{thebibliography}{99}
\bibitem{levenberg:1944} Kenneth Levenberg. ``A Method for the Solution of Certain Non-Linear Problems in Least Squares''. {\it The Quarterly of Applied Mathematics} 2: 164--168 (1944).
\bibitem{press:1992} William Press et al. {\it Numerical Recipes in C}. Cambridge: Cambridge University Press, 1992.
\end{thebibliography}
\end{document}