% !TeX program = pdflatex
\documentclass[11pt]{article}
% ---------- Encoding & Fonts ----------
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
% ---------- Layout & Typesetting ----------
\usepackage[a4paper,margin=1in]{geometry}
\usepackage{microtype}
\usepackage{setspace}
\setstretch{1.05}
\usepackage{enumitem}
\setlist{nosep}
% ---------- Math & Symbols ----------
\usepackage{amsmath,amssymb,amsthm,bm,mathtools}
% ---------- Graphics ----------
\usepackage{graphicx}
\usepackage{xcolor}
% ---------- Tables ----------
\usepackage{booktabs}
\usepackage{array}
\usepackage{tabularx}
% ---------- Clever references ----------
\usepackage{hyperref}
\hypersetup{
colorlinks=true,
linkcolor=blue!60!black,
urlcolor=blue!60!black,
citecolor=blue!60!black
}
\usepackage{cleveref}
% ---------- Callouts ----------
\usepackage{tcolorbox}
% ---------- Bibliography ----------
\usepackage[numbers,sort&compress]{natbib}
% ---------- TikZ for schematic figures ----------
\usepackage{tikz}
\usetikzlibrary{arrows.meta,positioning,calc,fit,shapes.geometric,shapes.misc,backgrounds}
% ---------- Theorem Environments ----------
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}
% ---------- Helpers ----------
\newcommand{\KL}{\mathrm{KL}}
\newcommand{\E}{\mathbb{E}}
% ---------- Title ----------
\title{RCM/RGE as Lens: Proof-Backed Domain Maps, Rates,\\
and Falsifiable Predictions via Stochastic Mirror Descent}
\author{C.\,L. Vaillant}
\date{Aug 2025}
\begin{document}
\maketitle
\begin{abstract}
\textbf{Scope.} \emph{Recursive Collapse / Generative Emergence} (RCM/RGE) is a \emph{lens}, not new mathematics: it re-expresses adaptive systems as stochastic mirror descent (SMD) on a free-energy \(F\) in a geometry \(\psi\).
\textbf{Contribution.} We supply a rigorous SMD backbone---almost-sure convergence, PL/K\L{} relaxations with rates, high-probability finite-time bounds, and Polyak--Juditsky asymptotics---and we \emph{demonstrate} two full mappings: (i) \textbf{predictive coding} as SMD in exponential-family geometry, including a cortical implementation sketch and a martingale-difference (MDS) noise proof; (ii) \textbf{Wright--Fisher} population genetics as SMD in entropy geometry, including conditions where the gradient structure fails. We derive a \textbf{cross-domain rate law} explaining, in terms of curvature and noise scales, why brains converge faster than evolution; we quantify \textbf{emergence} and catalogue \textbf{failure modes}.
\textbf{Positioning.} We compare to the Free Energy Principle (FEP): RCM/RGE adds explicit SA conditions, finite-time/rate guarantees, falsifiable predictions, and a cross-domain transfer recipe.
\textbf{Honest scope.} We intentionally \emph{do not} claim proofs for thermodynamics or AI alignment here; they follow similar logic and are reserved for future work.
\end{abstract}
% =========================================================
\section{SMD Backbone and Guarantees}
% =========================================================
Let \(\Delta=\{p\in\mathbb{R}^d_{\ge 0}:\sum_i p_i=1\}\).
Let \(\psi:\Delta\to\mathbb{R}\) be \(1\)-strongly convex w.r.t.\ a norm \(\|\cdot\|\), with Bregman divergence \(D_\psi(p\|q)=\psi(p)-\psi(q)-\langle\nabla\psi(q),p-q\rangle\). A protocol produces signals \(\xi_t\) and stochastic estimates \(g_t=g(p_t,\xi_t)\) with mean field \(G(p)=\E[g(p,\xi)]\). Let \(F:\Delta\to\mathbb{R}\) be \(C^1\) with \(G(p)=\nabla F(p)\).
\paragraph{SMD/RCM Update.}
\begin{equation}
\label{eq:update}
\nabla\psi(p_{t+1})=\nabla\psi(p_t)-\eta_t\, g_t \quad\Longleftrightarrow\quad p_{t+1}=\arg\min_{p\in\Delta}\{\langle g_t,p\rangle+\tfrac{1}{\eta_t}D_\psi(p\|p_t)\}.
\end{equation}
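For concreteness, here is a minimal numerical sketch of \eqref{eq:update} with the negative-entropy mirror map \(\psi(p)=\sum_i p_i\log p_i\), for which the update reduces to an exponentiated-gradient step; the cost vector \texttt{c}, the noise level, and the step schedule are illustrative assumptions, not quantities from the analysis above.
\begin{verbatim}
import numpy as np

def smd_step_entropy(p, g, eta):
    """One SMD step with psi(p) = sum_i p_i log p_i on the simplex.
    The mirror update reduces to exponentiated gradient:
    p_{t+1,i} proportional to p_{t,i} * exp(-eta * g_i)."""
    w = p * np.exp(-eta * g)
    return w / w.sum()

# Hypothetical usage: minimise F(p) = <c, p> + KL(p || uniform)
# with martingale-difference gradient noise and eta_t = 1/t.
rng = np.random.default_rng(0)
d = 5
c = rng.normal(size=d)
p = np.full(d, 1.0 / d)
for t in range(1, 501):
    grad = c + np.log(d * p) + 1.0 + 0.1 * rng.normal(size=d)
    p = smd_step_entropy(p, grad, eta=1.0 / t)
\end{verbatim}
Swapping in another strongly convex \(\psi\) changes only the two lines inside \texttt{smd\_step\_entropy}.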
\begin{assumption}[SA Conditions]\label{ass:sa}
(a) \(\eta_t>0\), \(\sum_t\eta_t=\infty\), \(\sum_t\eta_t^2<\infty\);\;
(b) \(m_t:=g_t-\nabla F(p_t)\) is a martingale difference sequence (MDS): \(\E[m_t|\mathcal F_t]=0\), \(\E[\|m_t\|_*^2|\mathcal F_t]\le\sigma^2\);\;
(c) \(\psi\) \(1\)-strongly convex; \(F\) \(\mu\)-strongly convex w.r.t.\ \(\psi\); \(G\) Lipschitz; \(\Delta\) compact.
\end{assumption}
\begin{theorem}[Attractor Convergence]\label{thm:rcmrge-main}
Under \Cref{ass:sa}, \(p^\star=\arg\min F\) is unique and \(p_t\to p^\star\) a.s.; moreover \(\sum_t\eta_t(F(p_t)-F^\star)<\infty\) almost surely.
\end{theorem}
\begin{definition}[PL]\label{def:pl}
\(F\) satisfies the Polyak--Łojasiewicz (PL) inequality with constant \(\mu>0\) if \(F(p)-F^\star\le\frac{1}{2\mu}\|\nabla F(p)\|_*^2\) for all \(p\in\Delta\).
\end{definition}
\begin{theorem}[Rates]\label{thm:pl}
Under \Cref{ass:sa}(a,b) and PL, with \(\eta_t=\eta_0/t\) and Polyak--Ruppert averaging \(\bar p_T\): \(\E[F(\bar p_T)-F^\star]=O(1/T)\) and \(\sqrt{T}(\bar p_T-p^\star)\Rightarrow \mathcal N(0,\Sigma)\).
\end{theorem}
\begin{theorem}[HP Bound]\label{thm:hp}
For convex \(F\) and \(\|m_t\|_*\le M\) a.s., with \(S_T=\sum_{t\le T}\eta_t\) and \(\bar p_T=\frac{1}{S_T}\sum_{t\le T}\eta_t p_t\),
\[
\Pr\!\Big(F(\bar p_T)-F^\star \le \tfrac{D_\psi(p^\star\|p_0)}{S_T}+C_1\tfrac{\sum\eta_t^2}{S_T}+C_2\sqrt{\tfrac{\sum\eta_t^2}{S_T^2}\log\tfrac{1}{\delta}}\Big)\ge 1-\delta.
\]
\end{theorem}
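For example, with the common schedule \(\eta_t=\eta_0/\sqrt{t}\) one has \(S_T=\Theta(\sqrt{T})\) and \(\sum_{t\le T}\eta_t^2=O(\log T)\), so \Cref{thm:hp} gives, with probability at least \(1-\delta\),
\[
F(\bar p_T)-F^\star \;=\; O\!\Big(\tfrac{D_\psi(p^\star\|p_0)}{\sqrt{T}}\Big)
+O\!\Big(\tfrac{\log T}{\sqrt{T}}\Big)
+O\!\Big(\sqrt{\tfrac{\log T\,\log(1/\delta)}{T}}\Big)
\;=\;\tilde O(1/\sqrt{T}),
\]
which is the \(\tilde O(1/\sqrt{T})\) ergodic rate invoked in the case studies below.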
% =========================================================
\section{Case I: Predictive Coding as SMD (Exponential Families)}
% =========================================================
\label{sec:pc-derivation}
Let \(q_\theta(x)\) be a minimal exponential family with natural parameter \(\theta\), sufficient statistics \(T(x)\), log-partition (cumulant) function \(A(\theta)\), and mean parameter \(\mu=\nabla A(\theta)\) \citep{Amari1998}. Observations \(y_t\sim p(y|x)\) arrive online.
\paragraph{Free energy and geometry.} Define
\begin{equation}\label{eq:F-pc}
F(\theta)=\E_{q_\theta}[-\log p(y_t|x)]+\KL(q_\theta\|\pi_0).
\end{equation}
Set \(\psi(\theta)=A(\theta)\); then \(D_\psi(\theta\|\theta')=D_A(\theta\|\theta')=\KL(q_{\theta'}\|q_{\theta})\) on the family. Thus SMD in \(\theta\) is natural-gradient descent in \(\mu\).
\paragraph{Update and noise.} Let \(\ell_t(\theta)\) be the instantaneous loss. The stochastic gradient \(g_t=\nabla_\theta \ell_t(\theta_t)\) yields \(\nabla A(\theta_{t+1})=\nabla A(\theta_t)-\eta_t g_t\). With \(\mathcal F_t=\sigma(\theta_0,y_{1:t-1})\), the centered term \(m_t=g_t-\E[g_t|\mathcal F_t]\) satisfies \(\E[m_t|\mathcal F_t]=0\) by construction; since \(y_t\) is drawn independently of \(\mathcal F_t\), \(\E[g_t|\mathcal F_t]\) is the gradient of the \(y\)-averaged free energy, and \(m_t\) has bounded second moment for sub-Gaussian (or clipped) observations.
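As a toy illustration of this dual update (a minimal sketch for a Bernoulli family, not the cortical model), one can implement \(\nabla A(\theta_{t+1})=\nabla A(\theta_t)-\eta_t g_t\) directly in the mean coordinate, using \(A(\theta)=\log(1+e^{\theta})\), \(\nabla A=\mathrm{sigmoid}\), and \((\nabla A)^{-1}=\mathrm{logit}\):
\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def logit(m):
    return np.log(m) - np.log1p(-m)

def mirror_step_bernoulli(theta, g, eta):
    """Mirror step with psi = A for a Bernoulli family:
    update the mean parameter mu = grad A(theta) = sigmoid(theta),
    then map back to the natural parameter via logit."""
    mu = sigmoid(theta) - eta * g
    mu = np.clip(mu, 1e-6, 1.0 - 1e-6)   # stay in the mean domain (0, 1)
    return logit(mu)
\end{verbatim}
The same pattern applies to any minimal exponential family once \(\nabla A\) and its inverse are available.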
\paragraph{Cortical implementation (hierarchical predictive coding).} Following \citep{RaoBallard1999,Friston2005,Bastos2012}:
\begin{itemize}[leftmargin=2em]
\item \emph{Representations:} Superficial pyramidal cells (L2/3) encode prediction errors \(\varepsilon=y-h(\mu)\); deep pyramidal cells (L5/6) encode predictions \(\mu=\nabla A(\theta)\).
\item \emph{Message passing:} Feedforward errors drive updates of \(\mu\); feedback predictions modulate sensory units.
\item \emph{Dual-timescale dynamics.} In mean parameters \(\mu\),
\[
\tau_{\mathrm{fast}}\dot{\mu} = -\,\partial_{\mu}\,\ell(y,\mu),\qquad
\tau_{\mathrm{slow}}\dot{\theta} = -\,\eta\,\nabla_{\theta}F(\theta),
\]
with \(\partial_{\mu}F=(\nabla^2 A(\theta))^{-1}\nabla_\theta F\); a minimal numerical sketch follows this list.
\item \emph{Biophysical step-size:} \(\eta_t\sim \eta_0/t\) (metaplastic decay) or \(\eta_t\sim \eta_0/\sqrt{t}\) (homeostatic downregulation).
\end{itemize}
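The sketch below illustrates the dual-timescale scheme on a hypothetical scalar model (\(x\sim\mathcal N(\theta,1)\), \(y=x+\mathcal N(0,\sigma^2)\), free energy \(F(\mu,\theta;y)=\tfrac{(y-\mu)^2}{2\sigma^2}+\tfrac{(\mu-\theta)^2}{2}\)); it is a numerical toy, not a biophysical simulation, and all constants are illustrative.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
sigma2, x_true, theta = 0.5, 2.0, 0.0
dt, tau_fast, tau_slow = 0.1, 1.0, 100.0
for step in range(5000):
    y = x_true + np.sqrt(sigma2) * rng.normal()
    mu = theta                                    # start fast state at the prior
    for _ in range(50):                           # fast settling of mu given y
        mu += (dt / tau_fast) * ((y - mu) / sigma2 - (mu - theta))
    theta += (dt / tau_slow) * (mu - theta)       # slow parameter update, -dF/dtheta
# theta drifts toward the stimulus mean x_true as prediction errors vanish.
\end{verbatim}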
\begin{proposition}[Predictive Coding as SMD]\label{prop:pc-smd}
Under \eqref{eq:F-pc}, \(\psi=A\), and SA steps \(\eta_t\), predictive coding implements SMD with MDS noise. Therefore \Cref{thm:rcmrge-main,thm:hp,thm:pl} apply: a.s.\ convergence, \(\tilde O(1/\sqrt{T})\) HP ergodic bounds, and \(O(1/T)\) with averaging under PL.
\end{proposition}
\paragraph{Falsifiable prediction (finite-time rate).} Let \(e_t=\|y_t-\E_{q_{\theta_t}}[h(x)]\|^2\). Then \(F(\bar\theta_T)-F^\star=\tilde O(1/\sqrt{T})\) w.h.p., and \(O(1/T)\) under PL with averaging. \emph{Test:} repeated-grating paradigms in V1 should show \(e_T\approx a/\sqrt{T}+b\) (or \(a'/T+b'\)) before reaching the noise floor.
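A minimal fitting sketch for such a test; the error trace \texttt{e} is synthesised here purely for illustration and would be replaced by the measured \(e_T\).
\begin{verbatim}
import numpy as np
from scipy.optimize import curve_fit

T = np.arange(1, 501)
e = 2.0 / np.sqrt(T) + 0.1 + 0.02 * np.random.default_rng(2).normal(size=T.size)

def sqrt_law(t, a, b):   # e_T ~ a / sqrt(T) + b
    return a / np.sqrt(t) + b

def lin_law(t, a, b):    # e_T ~ a / T + b  (PL + averaging)
    return a / t + b

(pars_s, _), (pars_l, _) = curve_fit(sqrt_law, T, e), curve_fit(lin_law, T, e)
rss_s = np.sum((e - sqrt_law(T, *pars_s)) ** 2)
rss_l = np.sum((e - lin_law(T, *pars_l)) ** 2)
# The law with the smaller residual sum of squares is favoured by the data.
\end{verbatim}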
% =========================================================
\section{Case II: Wright--Fisher \texorpdfstring{$\Rightarrow$}{=>} Replicator SMD}
% =========================================================
\label{sec:wf-derivation}
For a Wright--Fisher population of size \(N\) with selection coefficients \(f\) and mutation prior \(\pi_0\), the allele frequencies \(p_t\in\Delta\) evolve by multinomial resampling.
\paragraph{Diffusion limit and drift.} For large \(N\) (diffusion approximation), the frequencies approximately follow the SDE
\[
dp_t = p_t\odot\big(u(p_t)-\langle u(p_t),p_t\rangle\mathbf 1\big)\,dt \;+\; \Sigma(p_t)^{1/2}\,dW_t,\qquad
u(p)=f-\tau\log(p/\pi_0),\quad
\Sigma_{ij}(p)=\tfrac{1}{N}\big(p_i\delta_{ij}-p_ip_j\big),
\]
with \(\log(p/\pi_0)\) taken componentwise.
\paragraph{Entropy geometry and free energy.} Let \(\psi(p)=\sum_i p_i\log p_i\). Define \(F(p)=-\langle f,p\rangle+\tau\KL(p\|\pi_0)\). The drift above is the mirror-descent flow of \(F\) under \(\psi\) (equivalently, the Fisher--Rao natural gradient of \(-F\) on the simplex); the mean ODE is the entropically regularized replicator dynamic \citep{Harper2009}.
\paragraph{SA form and noise.} A discrete-generation update yields \(\nabla\psi(p_{t+1})=\nabla\psi(p_t)-\eta_t(\nabla F(p_t)+m_t)\) with \(\E[m_t|\mathcal F_t]=0\), \(\E\|m_t\|_*^2=O(1/N)\).
\begin{proposition}[Wright--Fisher $\to$ SMD]\label{prop:wf-smd}
Under the diffusion approximation and entropy geometry, Wright--Fisher implements SMD on \(F\) with MDS noise of variance \(O(1/N)\). Then \Cref{thm:rcmrge-main,thm:hp,thm:pl} apply.
\end{proposition}
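A minimal generation-by-generation sketch of this scheme (resampling weights as in Appendix~B; population size, fitnesses, and \(\tau\) are illustrative assumptions):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)

def wf_generation(p, f, pi0, tau, N):
    """One Wright-Fisher generation: reweight by selection and the entropic
    mutation prior (tilde_p_i propto p_i^{1-tau} * exp(f_i) * pi0_i^tau),
    then resample N offspring multinomially (demographic noise ~ 1/N)."""
    w = p ** (1.0 - tau) * np.exp(f) * pi0 ** tau
    tilde_p = w / w.sum()
    return rng.multinomial(N, tilde_p) / N

d, N, tau = 4, 10_000, 0.01                      # illustrative values only
f = np.array([0.02, 0.01, 0.0, -0.01])           # weak selection
pi0 = np.full(d, 1.0 / d)
p = np.full(d, 1.0 / d)
for gen in range(2000):
    p = wf_generation(np.clip(p, 1e-12, None), f, pi0, tau, N)
# p concentrates near the minimiser of F(p) = -<f, p> + tau * KL(p || pi0).
\end{verbatim}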
\begin{remark}[Where mapping breaks]
Strong frequency-dependence can make \(G\neq\nabla F\) (nonconservative drift); small \(N\) induces large variance beyond SA; epistasis may produce nonconvex \(F\), requiring K\L{} analysis.
\end{remark}
% =========================================================
\section{Quantifying Emergence}
% =========================================================
We quantify \emph{emergent structure} at the attractor \(p^\star\) by its concentration relative to the prior \(p_0\). Let \(H(p)=-\sum_i p_i\log p_i\) and define the effective dimension (perplexity) \(d_{\mathrm{eff}}(p)=\exp(H(p))\).
\begin{definition}[Emergence index]
\[
\mathcal{E}(p^\star;p_0)\;=\;\frac{d_{\mathrm{eff}}(p_0)}{d_{\mathrm{eff}}(p^\star)}\;\;\ge 1.
\]
\end{definition}
\paragraph{Worked example: Bayesian posterior concentration.} With the uniform prior \(p_0=(1/d,\dots,1/d)\) over \(d\) states, \(d_{\mathrm{eff}}(p_0)=d\). Under identifiability and sufficient data, the posterior concentrates near the true parameter; in the limit it is a point mass, so \(d_{\mathrm{eff}}(p^\star)\to 1\). Hence \(\mathcal{E}(p^\star;p_0)\to d\), quantifying maximal emergence from a flat prior.
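A short computation of the index, with a hypothetical concentrated attractor standing in for \(p^\star\):
\begin{verbatim}
import numpy as np

def d_eff(p):
    """Effective dimension exp(H(p)), Shannon entropy in nats."""
    p = p[p > 0]
    return float(np.exp(-np.sum(p * np.log(p))))

p0 = np.full(10, 0.1)                      # flat prior over d = 10 states
p_star = np.array([0.91] + [0.01] * 9)     # hypothetical concentrated attractor
emergence = d_eff(p0) / d_eff(p_star)      # roughly 10 / 1.65, i.e. about 6
\end{verbatim}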
% =========================================================
\section{A Cross-Domain Rate Law: Brains \texorpdfstring{$\gg$}{>>} Evolution}
% =========================================================
\label{sec:rate-law}
Under PL and \(\eta_t=\eta_0/t\) with averaging, SGD/SMD yields \( \E[F(\bar p_T)-F^\star]\approx C\cdot \sigma^2/(\mu T) \) (ignoring lower-order terms), where \(\mu\) is the PL curvature and \(\sigma^2\) the noise scale. Hence the time to reach \(\epsilon\)-accuracy scales as
\begin{equation}\label{eq:epsilon-time}
T(\epsilon)\;\approx\; C\,\frac{\sigma^2}{\mu\,\epsilon}.
\end{equation}
\begin{proposition}[Rate Ratio Law (with ranges)]\label{prop:rate-ratio}
Let \((\mu_{\mathrm{b}},\sigma^2_{\mathrm{b}})\) be brain curvature/noise and \((\mu_{\mathrm{e}},\sigma^2_{\mathrm{e}})\) evolutionary. Then
\[
\frac{T_{\mathrm{brain}}(\epsilon)}{T_{\mathrm{evo}}(\epsilon)} \;\approx\; \frac{\sigma^2_{\mathrm{b}}/\mu_{\mathrm{b}}}{\sigma^2_{\mathrm{e}}/\mu_{\mathrm{e}}}.
\]
Typical orders: \(\mu_{\mathrm{b}}\in[10,10^3]\) (early sensory; \(\mu_{\mathrm{b}}\sim \mathrm{SNR}^2\) \citep{GeislerKersten2002}) and \(\mu_{\mathrm{e}}\in[10^{-3},10^{-1}]\) (selection coefficient \(s\) \citep{Lenski1991,Kimura1983}). With comparable noise scales, \(T_{\mathrm{brain}}/T_{\mathrm{evo}}\in[10^{-6},10^{-2}]\): evolution needs \(10^{2}\)--\(10^{6}\) times more steps to reach the same accuracy.
\emph{Falsification criterion:} ratios far outside this band (e.g., by \(>10\times\)) indicate mis-estimated curvatures/noise or violation of PL (necessitating K\L{} analysis).
\end{proposition}
\paragraph{Falsifiable prediction.} Estimate \((\mu,\sigma)\) from early-time slopes of neural discrimination learning vs.\ microbial selection trajectories; compare via \eqref{eq:epsilon-time}.
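A back-of-envelope check of \eqref{eq:epsilon-time} using the mid-range values quoted above (illustrative constants, not measurements):
\begin{verbatim}
def T_eps(sigma2, mu, eps, C=1.0):
    """epsilon-accuracy time: T(eps) ~ C * sigma^2 / (mu * eps)."""
    return C * sigma2 / (mu * eps)

mu_brain, mu_evo, sigma2, eps = 1e2, 1e-2, 1.0, 1e-2   # assumed mid-range values
ratio = T_eps(sigma2, mu_evo, eps) / T_eps(sigma2, mu_brain, eps)
# ratio = mu_brain / mu_evo = 1e4: evolution needs ~10^4 times more steps,
# inside the predicted 10^2 - 10^6 band.
\end{verbatim}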
% =========================================================
\section{From Claims to Evidence: Simulation \& Data Plan}
% =========================================================
\paragraph{Figure~\ref{fig:pc-sim}: Predictive coding rate validation (schematic).} A compile-safe TikZ \emph{schematic} of the expected slopes; simulation output can replace this figure once the protocol below is run.
\begin{figure}[t]
\centering
\begin{tikzpicture}[scale=1]
\draw[->] (0,0) -- (8,0) node[below] {$t$};
\draw[->] (0,0) -- (0,4) node[left] {$F(\theta_t)-F^\star$};
% 1/sqrt(t) curve
\draw[thick,domain=0.5:7.5,smooth,variable=\t] plot ({\t},{2.5/sqrt(\t)});
\node at (5,1.0) {$\sim c/\sqrt{t}$};
% 1/t curve
\draw[thick,dashed,domain=0.8:7.5,smooth,variable=\t] plot ({\t},{1.8/(\t)});
\node at (5,0.6) {$\sim c'/t$ (PL + averaging)};
% legend
\draw[thick] (1,3.5)--(1.8,3.5); \node[right] at (2,3.5) {$\eta_t=1/\sqrt{t}$};
\draw[thick,dashed] (1,3.0)--(1.8,3.0); \node[right] at (2,3.0) {$\eta_t=1/t$ (PL)};
\end{tikzpicture}
\caption{Predictive coding rates (schematic). Solid: \(\tilde O(1/\sqrt{t})\) high-probability bound. Dashed: \(O(1/t)\) under PL with Polyak--Ruppert averaging.}
\label{fig:pc-sim}
\end{figure}
\paragraph{Simulation protocol (to be implemented).} Linear--Gaussian model: \(x^\star \sim \mathcal N(0,I)\), \(y_t = Hx^\star + \varepsilon_t\), \(\varepsilon_t \sim \mathcal N(0,\sigma^2I)\). Variational family \(q_\theta(x)=\mathcal N(\mu,\Sigma)\). Initialize \(\theta_0\). For \(t=1{:}T\): sample \(y_t\), compute \(g_t=\nabla_\theta F(\theta_t,y_t)\), update via \eqref{eq:update}; record \(F(\theta_t)-F^\star\); fit \(a/\sqrt{t}+b\) and \(a'/t+b'\).
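A self-contained sketch of this protocol, assuming for simplicity a fixed identity posterior covariance (so the exponential-family mirror map is quadratic and the SMD step coincides with SGD on the posterior mean \(m\)); dimensions and noise level are illustrative.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
k, n, sigma2, T = 3, 5, 1.0, 5000          # latent dim, obs dim, noise, horizon
H = rng.normal(size=(n, k)) / np.sqrt(n)
x_star = rng.normal(size=k)

def F(m):  # population free energy up to additive constants
    r = H @ (x_star - m)
    return 0.5 * (r @ r) / sigma2 + 0.5 * (m @ m)

A = H.T @ H / sigma2 + np.eye(k)           # curvature; minimiser solves A m = H'H x*/sigma2
m_opt = np.linalg.solve(A, H.T @ H @ x_star / sigma2)

gap, m = [], np.zeros(k)
for t in range(1, T + 1):
    y = H @ x_star + np.sqrt(sigma2) * rng.normal(size=n)
    g = -H.T @ (y - H @ m) / sigma2 + m    # stochastic gradient of F at m
    m = m - (1.0 / t) * g                  # eta_t = eta0 / t with eta0 = 1
    gap.append(F(m) - F(m_opt))
# Fit gap against a/sqrt(t)+b and a'/t+b' as in the protocol above.
\end{verbatim}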
% =========================================================
\section{Relationship to Free Energy Principle (FEP)}
% =========================================================
Friston's FEP proposes that brains minimize variational free energy \citep{Friston2005,Friston2010}. RCM/RGE \emph{formalizes} this as SMD: (i) explicit SA conditions (Assumption~\ref{ass:sa}); (ii) finite-time high-probability bounds (Thm.~\ref{thm:hp}); (iii) PL rates and asymptotic normality (Thm.~\ref{thm:pl}); (iv) falsifiable predictions; (v) cross-domain transfer (\S\ref{sec:rate-law}).
% =========================================================
\section{Failure Modes and Boundary Conditions}
% =========================================================
\textbf{Nonconservative drifts:} if \(G\neq\nabla F\), SMD fails; use monotone-operator SA or extragradient \citep{Borkar2009}. \textbf{Heavy tails / non-PL:} only ergodic bounds apply; adopt robust geometry or reparameterize. \textbf{Nonstationarity:} if \(F_t\) drifts faster than SA, tracking error persists; two-time-scale SA adds an \(O(\text{drift})\) bias \citep{KushnerYin}. \textbf{Finite populations:} large demographic noise violates small-variance SA; require variance reduction or batch updates.
% =========================================================
\section{Plain-Language Summary}
% =========================================================
\begin{tcolorbox}[colback=blue!3!white,colframe=blue!50!black,title=Plain-Language Summary]
\textbf{What is RCM/RGE?} A lens: many systems---brains, evolution, algorithms---repeatedly \emph{collapse} toward stable states while \emph{generating} new structure.
\textbf{What is new here?} Not the math of SMD itself, but the \emph{proof-backed bridge}: we derive how predictive coding and Wright--Fisher \emph{instantiate} SMD, state rate laws tied to measurable curvature/noise, and list tests and failure modes.
\end{tcolorbox}
% =========================================================
\section*{Notes on Scope (Honesty)}
% =========================================================
We \emph{demonstrate} two domains (neuroscience, evolution). Thermodynamics (e.g., Langevin as Wasserstein gradient flow \citep{JordanKinderlehrerOtto1998}) and AI alignment (e.g., policy mirror descent) follow similar logic but are beyond the present paper's proofs.
% =========================================================
\appendix
\section*{Appendix A: Predictive Coding---Dual Coordinates and Natural Gradient}
\phantomsection
\addcontentsline{toc}{section}{Appendix A: Predictive Coding---Dual Coordinates and Natural Gradient}
For an exponential family with natural parameter $\theta$, mean parameter $\mu = \nabla A(\theta)$, and log-partition $A$, the Fisher metric in natural coordinates is $\nabla^2 A(\theta)$. Mirror descent in $\theta$ with mirror map $\psi = A$ is natural-gradient descent: in continuous time,
\[
\dot{\mu} \;=\; \frac{d}{dt}\nabla A(\theta) \;=\; -\,\nabla_\theta F(\theta)
\quad\Longleftrightarrow\quad
\nabla^2 A(\theta)\,\dot{\theta} \;=\; -\,\nabla_\theta F(\theta),
\]
and since $\nabla_\mu F=(\nabla^2 A(\theta))^{-1}\nabla_\theta F$, the flow in $\mu$ is the natural-gradient flow $\dot\mu=-[\nabla^2 A^*(\mu)]^{-1}\nabla_\mu F(\mu)$ with dual metric $\nabla^2 A^*(\mu)=(\nabla^2 A(\theta))^{-1}$. Thus the discrete SMD step $\nabla A(\theta_{t+1})=\nabla A(\theta_t) - \eta_t\, g_t$ matches the natural gradient update in $\mu$ under Legendre duality.
\section*{Appendix B: Wright--Fisher \texorpdfstring{$\Rightarrow$}{=>} Replicator SMD---Diffusion Limit Details}
\phantomsection
\addcontentsline{toc}{section}{Appendix B: Wright--Fisher \texorpdfstring{$\Rightarrow$}{=>} Replicator SMD---Diffusion Limit Details}
Consider a population of size $N$ with allele frequencies $p\in\Delta$. In each generation,
\[
X' \sim \mathrm{Multinomial}\!\left(N,\ \tilde p\right), \qquad \tilde p_i \propto p_i^{\,1-\tau}\,\exp\{f_i\}\;\pi_{0,i}^{\tau},
\]
where $f_i$ is the Malthusian fitness (or payoff) and $\pi_0$ encodes mutation as an entropic prior with intensity $\tau\ge 0$. Under weak selection and mutation (so that the per-generation deterministic change is $O(1/N)$), write $p' = X'/N$ and expand $\tilde p = p + \frac{1}{N}\,v(p) + o(N^{-1})$. A standard central-limit scaling (see, e.g., \cite{Kimura1983}) yields
\[
dp_t \;=\; v(p_t)\,dt \;+\; \Sigma(p_t)^{1/2}\,dW_t, \quad \Sigma_{ij}(p)=\frac{1}{N}\left(p_i\delta_{ij}-p_ip_j\right).
\]
Choosing the entropic geometry $\psi(p)=\sum_i p_i\log p_i$ and
\[
F(p) = -\langle f, p\rangle + \tau \,\KL(p\|\pi_0),
\]
the mean drift equals the \emph{mirror gradient} $-\nabla F$ in the KL geometry:
\[
v_i(p) \;=\; p_i\!\left[\Big(f_i - \sum_j p_j f_j\Big) - \tau \Big(\log\frac{p_i}{\pi_{0,i}} - \sum_j p_j \log\frac{p_j}{\pi_{0,j}}\Big)\right],
\]
so the mean ODE is the entropically regularized replicator dynamic. The discrete-generation WF process is an SMD scheme with MDS noise of variance $O(1/N)$.
\section*{Appendix C: High-Probability Bound Constant Tracking}
\addcontentsline{toc}{section}{Appendix C: High-Probability Bound Constant Tracking}
In mirror-descent analysis, the one-step inequality
\[
D_\psi(p^\star\|p_{t+1}) - D_\psi(p^\star\|p_t) \le -\eta_t \langle \nabla F(p_t), p_t-p^\star\rangle + \eta_t\langle m_t, p^\star-p_t\rangle + \frac{\eta_t^2}{2}\|g_t\|_*^2
\]
together with the $1$-strong convexity of $\psi$ and Azuma--Hoeffding on the martingale term yields the high-probability bound in Theorem~\ref{thm:hp}.
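Concretely, summing the one-step inequality over \(t\le T\), dividing by \(S_T\), and using convexity of \(F\) together with Jensen's inequality gives
\[
F(\bar p_T)-F^\star \;\le\; \frac{D_\psi(p^\star\|p_0)}{S_T}
+\frac{1}{S_T}\sum_{t\le T}\eta_t\langle m_t,p^\star-p_t\rangle
+\frac{1}{2S_T}\sum_{t\le T}\eta_t^2\|g_t\|_*^2 .
\]
With \(\|m_t\|_*\le M\), \(L:=\sup_{p\in\Delta}\|\nabla F(p)\|_*\) (finite since \(F\in C^1\) and \(\Delta\) is compact), and diameter \(D_\Delta:=\sup_{p\in\Delta}\|p-p^\star\|\), each martingale increment is bounded by \(\eta_t M D_\Delta\), so Azuma--Hoeffding yields, with probability at least \(1-\delta\),
\(\sum_{t\le T}\eta_t\langle m_t,p^\star-p_t\rangle\le M D_\Delta\sqrt{2\sum_{t\le T}\eta_t^2\log(1/\delta)}\).
Hence \Cref{thm:hp} holds with \(C_1=\tfrac12(L+M)^2\) and \(C_2=\sqrt{2}\,M D_\Delta\).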
% ---------- References ----------
\bibliographystyle{plainnat}
\begin{thebibliography}{99}
\bibitem{Amari1998}
Amari, S. (1998). Natural gradient works efficiently in learning. \emph{Neural Computation}, 10(2), 251--276. doi:10.1162/089976698300017746
\bibitem{RaoBallard1999}
Rao, R. P. N., \& Ballard, D. H. (1999). Predictive coding in the visual cortex. \emph{Nature Neuroscience}, 2, 79--87. doi:10.1038/4580
\bibitem{Friston2005}
Friston, K. (2005). A theory of cortical responses. \emph{Phil. Trans. R. Soc. B}, 360(1456), 815--836. doi:10.1098/rstb.2005.1622
\bibitem{Friston2010}
Friston, K. (2010). The free-energy principle. \emph{Nature Reviews Neuroscience}, 11, 127--138. doi:10.1038/nrn2787
\bibitem{Bastos2012}
Bastos, A. M., et al. (2012). Canonical microcircuits for predictive coding. \emph{Neuron}, 76(4), 695--711. doi:10.1016/j.neuron.2012.10.038
\bibitem{GeislerKersten2002}
Geisler, W. S., \& Kersten, D. (2002). Illusions, perception, and Bayesian inference. \emph{Annual Review of Psychology}, 53, 1--25.
\bibitem{Lenski1991}
Lenski, R. E., Rose, M. R., Simpson, S. C., \& Tadler, S. C. (1991). Long-term experimental evolution in \emph{E. coli}. \emph{Am. Nat.}, 138(6), 1315--1341.
\bibitem{Kimura1983}
Kimura, M. (1983). \emph{The Neutral Theory of Molecular Evolution}. Cambridge University Press.
\bibitem{Harper2009}
Harper, M. (2009). Information geometry and evolutionary game theory. \emph{arXiv:0911.1383}.
\bibitem{JordanKinderlehrerOtto1998}
Jordan, R., Kinderlehrer, D., \& Otto, F. (1998). The variational formulation of the Fokker--Planck equation. \emph{SIAM J. Math. Anal.}, 29(1), 1--17.
\bibitem{Robbins1951}
Robbins, H., \& Monro, S. (1951). A stochastic approximation method. \emph{Ann. Math. Stat.}, 22(3), 400--407.
\bibitem{KushnerYin}
Kushner, H. J., \& Yin, G. G. (2003). \emph{Stochastic Approximation and Recursive Algorithms and Applications} (2nd ed.). Springer.
\bibitem{Borkar2009}
Borkar, V. S. (2009). \emph{Stochastic Approximation: A Dynamical Systems Viewpoint}. Cambridge University Press.
\end{thebibliography}
\end{document}