%~Mouliné par MaN_auto v.0.27.3 2023-11-02 10:07:30
\documentclass[CRMATH,Unicode,XML]{cedram}

\TopicEN{Functional analysis}
\TopicFR{Analyse fonctionnelle}


\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{cleveref}

\DeclareMathOperator\id{id}

\DeclareMathOperator{\var}{Var}
\DeclareMathOperator{\cov}{Cov}
\DeclareMathOperator*{\minimize}{\mathrm{minimize}}
\DeclareMathOperator*{\maximize}{\mathrm{maximize}}
\DeclarePairedDelimiter\norm{\lVert}{\rVert}



\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Rd}{\mathbb{R}^d}
\newcommand{\eps}{\varepsilon}
\renewcommand{\phi}{\varphi}

\newcommand{\KL}{\mathrm{KL}}
\newcommand{\clN}{\mathcal{N}}
\newcommand{\opp}{\mathrm{op}}

\newcommand\mc[1]{\mathcal{#1}}
\newcommand\mss[1]{\mathsf{#1}}
\newcommand\msf[1]{\mathsf{#1}}
\newcommand\on[1]{\operatorname{#1}}



\newcommand*{\op}[1]{\|#1\|_{\mathrm{op}}}
\newcommand*{\frob}[1]{\|#1\|_{\mathrm{F}}}
\newcommand*{\two}[1]{\|#1\|_{2}}

% NOTE(review): \ifthenelse/\isempty need the xifthen package, which is not loaded here --- confirm the class provides it
\newcommand*{\kl}[3][]{%
\ifthenelse{\isempty{#1}}{\operatorname{D}(#2\,\|\,#3)}{\operatorname{D}(#2\,\|\,#3\mid#1)}}
\newcommand*{\tv}[2]{\mathrm{d_{TV}}(#1, #2)}
\newcommand*{\chis}[2]{\chi^2(#1, #2)}

\newcommand*{\triplenorm}[1]{{\left\vert\kern-0.25ex\left\vert\kern-0.25ex\left\vert #1
\right\vert\kern-0.25ex\right\vert\kern-0.25ex\right\vert}}


\newcommand*{\bern}[1]{\mathrm{Bern}(#1)}
\newcommand*{\pois}[1]{\mathrm{Pois}(#1)}
\newcommand*{\bin}[1]{\mathrm{Bin}(#1)}



\newcommand*{\p}[1]{\mathbb P\left\{#1\right\}}
\newcommand*{\pp}[2]{\mathbb P_{#1}\left\{#2\right\}}



\newcommand*{\indic}[1]{\1_{#1}} % NOTE(review): \1 is not defined in this preamble --- confirm the cedram class provides it
\newcommand*{\ep}{\varepsilon}
\newcommand*{\defeq}{\coloneqq}
\newcommand*{\rd}{\mathrm{d}}
\newcommand*{\dd}{\,\rd}






\newcommand\deq{\coloneqq}
\newcommand\mmid{\mathbin{\|}}

%\newcommand*{\aram}[1]{{\textcolor{red}{[\textbf{AAP:} #1]}}}
%\newcommand*{\sinho}[1]{{\textcolor{blue}{[\textbf{SC:} #1]}}}
%\newcommand*{\edit}[1]{{\textcolor{blue}{#1}}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\graphicspath{{./figures/}}

\newcommand*{\mk}{\mkern -1mu}
\newcommand*{\Mk}{\mkern -2mu}
\newcommand*{\mK}{\mkern 1mu}
\newcommand*{\MK}{\mkern 2mu}

%\hypersetup{urlcolor=purple, linkcolor=blue, citecolor=red}

\newcommand*{\relabel}{\renewcommand{\labelenumi}{(\theenumi)}}
\newcommand*{\romanenumi}{\renewcommand*{\theenumi}{\roman{enumi}}\relabel}
\newcommand*{\Romanenumi}{\renewcommand*{\theenumi}{\Roman{enumi}}\relabel}
\newcommand*{\alphenumi}{\renewcommand*{\theenumi}{\alph{enumi}}\relabel}
\newcommand*{\Alphenumi}{\renewcommand*{\theenumi}{\Alph{enumi}}\relabel}
\let\oldtilde\tilde
\renewcommand*{\tilde}[1]{\mathchoice{\widetilde{#1}}{\widetilde{#1}}{\oldtilde{#1}}{\oldtilde{#1}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{An entropic generalization of Caffarelli's contraction theorem via covariance inequalities}

\alttitle{Une généralisation entropique du théorème de contraction de Caffarelli à l'aide d'inégalités de covariance}

\author{\firstname{Sinho} \lastname{Chewi}}
\address{School of Mathematics, Institute for Advanced Study, Princeton, USA}
\email[S. Chewi]{ schewi@ias.edu}

\author{\firstname{Aram-Alexandre} \lastname{Pooladian}\IsCorresp}
\address{Center for Data Science, New York University, New York, USA}
\email[A-A. Pooladian]{aram-alexandre.pooladian@nyu.edu}

\thanks{This work was completed while SC was visiting NYU. SC was supported by the Department of Defense (DoD) through the National Defense Science \& Engineering Graduate Fellowship (NDSEG) Program. AAP was partially supported by the Natural Sciences and Engineering Research Council of Canada.}




\begin{abstract}
The optimal transport map between the standard Gaussian measure and an $\alpha$-strongly log-concave probability measure is $\alpha^{-1/2}$-Lipschitz, as first observed in a celebrated theorem of Caffarelli. In this paper, we apply two classical covariance inequalities (the Brascamp--Lieb and Cramér--Rao inequalities) to prove a sharp bound on the Lipschitz constant of the map that arises from \emph{entropically regularized} optimal transport. In the limit as the regularization tends to zero, we obtain an elegant and short proof of Caffarelli's original result. We also extend Caffarelli's theorem to the setting in which the Hessians of the log-densities of the measures are bounded by arbitrary positive definite commuting matrices.
\end{abstract}

\begin{altabstract}
La fonction de transport optimale entre la mesure gaussienne standard et une mesure de probabilité $\alpha$-fortement log-concave est $\alpha^{-1/2}$-Lipschitz, comme l'a noté Caffarelli dans le célèbre théorème qui porte désormais son nom. Dans ce travail, nous utilisons deux inégalités de covariance classiques (l'inégalité de Brascamp--Lieb ainsi que celle de Cramér--Rao) pour établir une borne optimale sur la constante de Lipschitz de la fonction de transport associée au transport optimal avec \emph{régularisation entropique}. En étudiant le cas limite où l'effet de la régularisation disparaît, nous obtenons une démonstration courte et élégante du théorème de Caffarelli. De surcroît, cette approche nous permet d'étendre la validité du théorème de Caffarelli au cas de log-densités dont les hessiens sont contrôlés par des matrices positives définies qui peuvent être choisies arbitrairement tant qu'elles commutent entre elles.
\end{altabstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}

\maketitle

\section{Introduction}\label{sec:intro}

In~\cite{caffarelli2000monotonicity}, Caffarelli proved the following seminal result.
\begin{theo}[Caffarelli's contraction theorem]\label{thm: caf_con}
Let $P = \exp(-V)$ and $Q = \exp(-W)$ have smooth densities on $\R^d$, with $\nabla^2 V \preceq \beta_V I $ and $\nabla^2 W \succeq \alpha_W I \succ 0$. Then, the optimal transport map $\nabla \phi_0$ from $P$ to $Q$ is $\sqrt{\beta_V/\alpha_W}$-Lipschitz.
\end{theo}

Here, $\phi_0 : \R^d\to\R$ is a convex function, known as a \emph{Brenier potential}. The optimal transport map $\nabla \phi_0 : \R^d\to\R^d$ pushes forward $P$ to $Q$, in the sense that if $X$ is a random variable with law $P$, then $\nabla \phi_0(X)$ is a random variable with law $Q$. See Section~\ref{sec: background_ot} and the textbook~\cite{villani2003topics} for background on optimal transport.

Caffarelli's contraction theorem can be used to transfer functional inequalities, such as a Poincaré inequality, from the standard Gaussian measure on $\R^d$ to other probability measures~\cite{bakrygentilledoux2014}. Towards this end, recent works have also constructed and studied alternative Lipschitz transport maps (e.g.~\cite{kim2012generalization,mikulincer2021brownian, mikulincer2022lipschitz, neeman2022lipschitz}), but still the properties of the original optimal transport map remain of fundamental interest, with many questions unresolved~\cite{valdimarsson2007otpotential,colombo2015lipschitz}.

Indeed, besides the application to functional inequalities, the structural properties of optimal transport maps play a fundamental role in theoretical and methodological advances in optimal transport, such as the control of the curvature of the Wasserstein space through the notion of extendible geodesics~\cite{legouicetal2019fast, ahidarlegouicparis2020barycenters}, the stability of Wasserstein barycenters~\cite{chewietal2020buresgd}, and the statistical estimation of optimal transport maps~\cite{hutter2021minimax}.

In applied domains, however, the inauspicious computational and statistical burden of solving the original optimal transport problem has instead led practitioners to consider \emph{entropically regularized} optimal transport, as pioneered by Cuturi in~\cite{cuturi2013sinkhorn}. In addition to its practical merits, entropic optimal transport enjoys a rich mathematical theory, rooted in its connection to the classical Schr\"odinger bridge problem~\cite{leonard2014schrodinger}, which has led to powerful applications to high-dimensional probability~\cite{ledoux2018remarks, fathi2020proof, gentiletal2020entropichwi}. As such, it is natural to study the properties of the entropic analogue of the optimal transport map.

In this paper, we prove a generalization of Caffarelli's contraction theorem to the setting of entropic optimal transport. Namely, we study the Hessian of the \emph{entropic Brenier potential} (see Section~\ref{sec: background_eot}), which admits a representation as a covariance matrix (Lemma~\ref{lem: hessians_pi}). By applying two well-known inequalities for covariance matrices (the Brascamp--Lieb inequality and the Cramér--Rao inequality), we quickly deduce a sharp upper bound on the operator norm of the Hessian which holds for any value $\eps > 0$ of the regularization parameter.

As a byproduct of our analysis, by sending $\eps \searrow 0$ and appealing to recent convergence results for the entropic Brenier potentials~\cite{berntonetal2022entropic}, we obtain the shortest proof of Caffarelli's contraction theorem to date. Notably, our argument allows us to sidestep the regularity of the optimal transport map, which is a key obstacle in Caffarelli's original proof and many others in the literature (see, e.g.,~\cite{kolesnikov2011mass}).

Recently, in~\cite{fathi2020proof} (see also~\cite{prodhommethesis}), Fathi, Gozlan, and Prod'homme gave a proof of Caffarelli's theorem using a surprising equivalence between Theorem~\ref{thm: caf_con} and a statement about Wasserstein projections, which was discovered through the theory of weak optimal transport~\cite{gozlanjuillet2020weakot}. In order to verify the latter, their proof also used ideas from entropic optimal transport.\footnote{In particular, with some effort, a bound on the Hessian of the entropic Brenier potential can also be read off from their proof.} In comparison, we note that our argument is much more direct.

To further demonstrate the applicability of our technique, in Section~\ref{scn:commuting_matrices} we prove a generalization of Caffarelli's result which reveals a remarkable extremal property of optimal transport maps between Gaussians. Namely, if $\nabla^2 V \preceq A^{-1}$ and $\nabla^2 W \succeq B^{-1}$, where $A$ and $B$ are arbitrary commuting positive definite matrices, then the Hessian of the Brenier potential from $P$ to $Q$ is pointwise upper bounded (in the PSD ordering) by $A^{-1/2} B^{1/2}$, the Hessian of the Brenier potential from $\mc N(0,A)$ to $\mc N(0,B)$. To the best of our knowledge, this result is new.

\section{Background}\label{sec:background}

\subsection{Assumptions}

We study probability measures $P$, $Q$ on $\R^d$ satisfying the following mild regularity assumptions.

\begin{enonce}{Assumption}[Regularity conditions]
We henceforth refer to the \emph{source measure} as $P$ and the \emph{target measure} as $Q$. We say that $(P, Q)$ satisfies our regularity conditions if:
\begin{enumerate}
\item\label{assump2.1} $P$ has full support on $\R^d$ and $Q$ is supported on a convex subset of $\R^d$. Let $\Omega_Q$ denote the interior of the support of $Q$, so that $\Omega_Q$ is a convex open set.
\item\label{assump2.2} $P$ and $Q$ admit positive Lebesgue densities on $\R^d$ and $\Omega_Q$, which can therefore be written as $\exp(-V)$ and $\exp(-W)$ respectively for functions $V, W : \R^d\to\R \cup \{\infty\}$. We abuse notation and identify the measures with their densities, thus writing $P = \exp(-V)$ and $Q = \exp(-W)$.
\item\label{assump2.3} We assume that $V$ and $W$ are twice continuously differentiable on $\R^d$ and $\Omega_Q$ respectively.
\end{enumerate}
\end{enonce}

Some of these assumptions can eventually be relaxed, but they suffice for the purposes of this work. Throughout the rest of the paper, and for the sake of simplicity, these regularity assumptions are assumed to hold for the probability measures under consideration.

\subsection{Optimal transport without regularization}\label{sec: background_ot}

Let $P$ and $Q$ be probability measures with finite second moment. The \emph{optimal transport problem} is the following optimization problem:
\begin{align}\label{eq: kant_p}
\minimize_{\pi\,\in\,\Pi(P,Q)}\quad\int \tfrac{1}{2}\,\norm{x-y}^2 \, \dd \pi(x,y)
\end{align}
where $\Pi(P,Q)$ is the set of joint probability measures with marginals $P$ and $Q$. The following fundamental result characterizes the optimal solution to~\eqref{eq: kant_p}.

\begin{theo}[{Brenier's theorem}]\label{thm: brenier_thm}
Suppose that $P$ admits a density with respect to Lebesgue measure. Then, there exists a proper, convex, lower semicontinuous function $\phi_0 : \R^d\to\R \cup \{\infty\}$ such that the optimal transport plan in~\eqref{eq: kant_p} can be written $\pi_0 = {(\id, \nabla \phi_0)}_\sharp P$. The function $\phi_0$ is called the \emph{Brenier potential}, and the mapping $\nabla \phi_0$ is called the \emph{optimal transport map} from $P$ to $Q$. Moreover, the optimal transport map $\nabla \phi_0$ is unique up to $P$-almost everywhere equality.

The Brenier potential $\phi_0$ is obtained as the solution to the dual problem
\begin{align}\label{eq: ot_dual}
\maximize_{\phi\,\in\,\Gamma_0}\quad\int \left(\frac{\norm\cdot^2}{2} -\phi\right) \dd P + \int \left(\frac{\norm\cdot^2}{2} -\phi^*\right) \dd Q\,,
\end{align}
where $\phi^*$ is the convex conjugate to $\phi$, and $\Gamma_0$ is the set of proper, convex, lower semicontinuous functions on $\R^d$.
\end{theo}


We refer to~\cite{villani2003topics} for further background.
\subsection{Optimal transport with entropic regularization}\label{sec: background_eot}

We recall that \emph{entropic optimal transport} is the problem that arises when we add the Kullback--Leibler (KL) divergence, $D_{\KL}(\cdot \mmid \cdot)$, as a regularizer to~\eqref{eq: kant_p}:
\begin{align}\label{eq: eot_p}
\minimize_{\pi\,\in\,\Pi(P,Q)}\quad\int\tfrac{1}{2}\,\norm{x-y}^2 \dd \pi(x,y) + \eps \, D_{\KL}\left(\pi \mmid P\otimes Q\right)\,.
\end{align}
The following statement characterizes the solution to~\eqref{eq: eot_p}~\cite{Csi75, PeyCut19, berntonetal2022entropic}.
\begin{theo}[Entropic optimal transport]
Let $P$ and $Q$ be probability measures on $\R^d$ and fix $\eps > 0$. Then there exists a unique solution $\pi_\eps \in \Pi(P, Q)$ to~\eqref{eq: eot_p}. Moreover, $\pi_\eps$ has the form
\begin{align}\label{eq:entropic_plan}
\pi_\eps\left(\dd x,\dd y\right) = \exp\left(\frac{f_\eps(x) + g_\eps(y) - \frac{1}{2} \, \norm{x-y}^2}{\eps}\right) \, P(\dd x) \, Q(\dd y)\,,
\end{align}
where $(f_\eps,g_\eps)$ are maximizers for the dual problem
\begin{align}
\maximize_{(f,g)\,\in\,L^1(P)\,\times\,L^1(Q)}\quad\int f \dd P + \int g \dd Q &-\eps \iint e^{\left(f(x)+g(y)-\frac{1}{2} \, \norm{x-y}^2\right)/\eps}\dd P(x)\dd Q(y) + \eps\,.
\end{align}
\end{theo}

The constraint that $\pi_\eps$ has marginals $P$ and $Q$ implies the following dual optimality conditions for $(f_\eps,g_\eps)$ (see~\cite{mena2019statistical, berntonetal2022entropic} for more details):
\begin{align}
&f_\eps(x) = -\eps\log\int e^{\left(g_\eps(y) - \frac{1}{2} \, \norm{x-y}^2\right)/\eps}\dd Q(y) \qquad \left(x \in \R^d\right)\,, \label{eq: opt_cond1}
\\
&g_\eps(y) = -\eps\log\int e^{\left(f_\eps(x) - \frac{1}{2} \, \norm{x-y}^2\right)/\eps}\dd P(x) \qquad \left(y \in \R^d\right)\,. \label{eq: opt_cond2}
\end{align}
In particular, $f_\eps$ and $g_\eps$ are smooth. In this work, it is more convenient to work with the \emph{entropic Brenier potentials}, defined as
\begin{align}\label{eq: ent_brenier}
\left(\phi_\eps,\psi_\eps\right) \deq \left(\frac12\,\|\cdot\|^2 - f_\eps,\;\frac12\,\|\cdot\|^2 - g_\eps\right)\,.
\end{align}
Since $(f_\varepsilon, g_\varepsilon)$ are only unique up to adding a constant to $f_\varepsilon$ and subtracting the same constant from $g_\varepsilon$, we fix the normalization convention $\int f_\varepsilon \, \dd P = \int g_\varepsilon \, \dd Q$. Under this condition, it was shown by Nutz and Wiesel in~\cite{berntonetal2022entropic} that we have convergence to the Brenier potential $\varphi_\eps \to \varphi_0$ as $\eps \searrow 0$; we recall an abbreviated version of the statement for the convenience of the reader:

\begin{theo}\label{thm: nutzweiselthm}
For any choice of regularization parameter $\varepsilon > 0$, let $(\phi_\eps,\psi_\eps)$ be the unique entropic Brenier potentials with the normalization condition
\[
\int \left(\frac12\, \|\cdot\|^2 - \phi_\eps\right) \dd P = \int \left(\frac12\, \|\cdot\|^2 - \psi_\eps\right) \dd Q\,.
\]
If $(\phi_0, \phi_0^*)$ are unique, it holds that $\lim_{\eps\,\searrow\,0} \phi_\eps = \phi_0$ in $L^1(P)$ and $\lim_{\eps\,\searrow\,0} \psi_\eps = \phi_0^*$ in $L^1(Q)$.
\end{theo}


Adopting this new notation, with $P = \exp(-V)$ and $Q = \exp(-W)$, we can rewrite the entropic optimal plan as
\begin{align*}
\pi_\eps\left(\dd x,\dd y\right) = \exp\left(- \frac{\varphi_\eps(x) + \psi_\eps(y) - \langle x, y\rangle }{\eps} - V(x) - W(y)\right) \, \dd x \, \dd y\,.
\end{align*}

The entropic Brenier potentials were first introduced to develop a computationally tractable estimator of the optimal transport map $\nabla\phi_0$~\cite{seguy2017large,pooladian2021entropic,pooladian2022debiaser}. Indeed, this is motivated by the following observation, which acts as an entropic version of Brenier's theorem. Write $\pi_\eps^{Y\mid X=x}$ for the conditional distribution of $Y$ given $X=x$ for $(X,Y) \sim \pi_\eps$, and similarly define $\pi_\eps^{X\mid Y=y}$. Then, by~\cite[Proposition~1]{pooladian2021entropic}, $\nabla \phi_\eps$ is the barycentric projection
\begin{equation}\label{eq: t_eps_expectation}
\nabla \phi_\eps(x) = \int y \, \dd \pi_\eps^{Y\mid X=x}(y)\,.
\end{equation}
For clarity, we abuse notation and abbreviate $\pi_\eps^{Y\mid X=x}$ by $\pi_\eps^x$ and $\pi_\eps^{X\mid Y=y}$ by $\pi_\eps^y$ when there is no danger of confusion.

The following lemma is a straightforward computation using~\eqref{eq:entropic_plan}, \eqref{eq: opt_cond1}, and~\eqref{eq: opt_cond2}.

\begin{lemm}\label{lem: hessians_pi}
It holds that
\begin{align*}
& \nabla^2\phi_\eps(x) = \eps^{-1}\cov_{Y\sim\pi_\eps^x}(Y)\,, \qquad \text{ and } \qquad \nabla^2\psi_\eps(y) = \eps^{-1}\cov_{X\sim\pi_\eps^y}(X)\,.
\end{align*}
In particular, both $\varphi_\eps$ and $\psi_\eps$ are convex. Moreover, under our regularity conditions,
\begin{align*}
& \nabla_y^2\log\left(1/\pi_\eps^x\right)(y) = \eps^{-1}\, \nabla^2\psi_\eps(y) + \nabla^2W(y)\,,\\
& \nabla_x^2\log\left(1/\pi_\eps^y\right)(x) = \eps^{-1}\, \nabla^2\phi_\eps(x) + \nabla^2V(x)\,.
\end{align*}
\end{lemm}

\subsection{Covariance inequalities}

In our proofs, we make use of the following key inequalities.

\begin{lemm}\label{lem:key}
Let $P = \exp(-V)$ be a probability measure on $\R^d$ and assume that $V$ is twice continuously differentiable on the interior of its domain. Then, the following hold.
\begin{enumerate}
\item\label{Lemm7.1} (Brascamp--Lieb inequality) If in addition we assume that $P$ is strictly log-concave, then it holds that
\[
\cov_{X\sim P}(X) \preceq \E_{X\sim P}\left[{\left(\nabla^2 V(X)\right)}^{-1}\right]\,.
\]
\item\label{Lemm7.2} (Cramér--Rao inequality)
\[
\cov_{X\sim P}(X) \succeq {\left(\E_{X\sim P}\left[\nabla^2 V(X)\right]\right)}^{-1}\,.
\]
\end{enumerate}
\end{lemm}

The Brascamp--Lieb inequality is classical, and we refer readers to~\cite{bobkovledoux2000brunnmintobrascamplieblsi, bakrygentilledoux2014, cordero2017transport} for several proofs. To make our exposition more self-contained, we provide a proof of the Cramér--Rao inequality in the appendix.

\section{Main theorem}

We now state and prove our main theorem.

\begin{theo}\label{thm:main}
Let $P = \exp(-V)$ and $Q = \exp(-W)$.
\begin{enumerate}
\item\label{theo8.1} Suppose that $(P, Q)$ satisfy our regularity assumptions, as well as
\begin{align*}
\nabla^2 V \preceq \beta_V I\,, \qquad\text{and}\qquad \nabla^2 W \succeq \alpha_W I \succ 0\,.
\end{align*}
Then, for every $\varepsilon > 0$ and all $x\in \R^d$, the Hessian of the entropic Brenier potential satisfies
\begin{align*}
\nabla^2 \varphi_\eps(x)
\preceq \frac{1}{2} \, \left(\sqrt{4\beta_V/\alpha_W + \varepsilon^2 \beta_V^2} - \varepsilon \beta_V\right)\, I\,.
\end{align*}
\item\label{theo8.2} Suppose that $(Q, P)$ satisfy our regularity assumptions, as well as
\begin{align*}
\nabla^2 V \succeq \alpha_V I \succ 0\,, \qquad\text{and}\qquad \nabla^2 W \preceq \beta_W I\,.
\end{align*}
Then, for every $\varepsilon > 0$ and all $x\in \Omega_P \deq \on{int}(\on{supp}(P))$, the Hessian of the entropic Brenier potential satisfies
\begin{align*}
\nabla^2 \varphi_\eps(x) &\succeq \frac{1}{2} \, \left(\sqrt{4\alpha_V/\beta_W + \varepsilon^2 \alpha_V^2} - \varepsilon \alpha_V\right) \, I\,.
\end{align*}
\end{enumerate}
\end{theo}

Observe that as $\varepsilon \searrow 0$, we formally expect the following bounds on the Brenier potential:
\begin{align*}
\sqrt{\alpha_V/\beta_W}\, I \preceq \nabla^2 \varphi_0(x) \preceq \sqrt{\beta_V/\alpha_W} \, I\,.
\end{align*}
In particular, this recovers Caffarelli's contraction theorem (Theorem~\ref{thm: caf_con}). We make this intuition rigorous below by appealing to convergence results for the entropic potentials as the regularization parameter $\varepsilon$ tends to zero.

\begin{proof}[Proof of Theorem~\ref{thm:main} --- Upper bound] Fix $x \in \R^d$. Recall from Lemma~\ref{lem: hessians_pi} that
\begin{align*}
\nabla^2\varphi_\eps(x) = \eps^{-1} \cov_{Y \sim \pi_\eps^x}(Y)\,.
\end{align*}
By an application of the Brascamp--Lieb inequality, this results in the upper bound
\begin{equation}\label{eq:main_thm_upper_bd}
\begin{split}
\nabla^2\varphi_\eps(x) &= \eps^{-1}\cov_{Y\sim \pi_\eps^x}(Y) \\
&\preceq \eps^{-1}\E_{Y \sim \pi_\eps^x}\left[\left(\eps^{-1}\,\nabla^2\psi_\eps(Y) + \nabla^2 W(Y)\right)^{-1} \right] \\
&\preceq \E_{Y \sim \pi_\eps^x}\left[\left(\nabla^2\psi_\eps(Y) + \eps \alpha_W I\right)^{-1}\right]\,, 
\end{split}
\end{equation}
where in the last inequality we also used the lower bound on the spectrum of $\nabla^2W$. Next, using Lemma~\ref{lem: hessians_pi} and the Cramér--Rao inequality (Lemma~\ref{lem:key}), we obtain the lower bound
\begin{align*}
\nabla^2 \psi_\eps(Y) &= \eps^{-1}\cov_{X\sim \pi_\eps^Y}(X) \\
&\succeq \eps^{-1} \, \left(\E_{X \sim \pi_\eps^Y}\left[\eps^{-1}\,\nabla^2\varphi_\eps(X) + \nabla^2V(X) \right] \right)^{-1} \\
&\succeq \left(\E_{X \sim \pi_\eps^Y}\left[\nabla^2\varphi_\eps(X) + \eps \beta_V I \right]\right)^{-1}\,,
\end{align*}
where we used the upper bound on the spectrum of $\nabla^2 V$. Combining these inequalities,
\begin{align*}
\nabla^2 \varphi_\eps(x) &\preceq \E_{Y \sim \pi_\eps^x}\left[\left(\left(\E_{X \sim \pi_\eps^Y}\left[\nabla^2\varphi_\eps(X) + \eps\beta_V I \right]\right)^{-1} + \eps\alpha_WI \right)^{-1}\right]\,.
\end{align*}

Now, define the quantity
\begin{align*}
L_\eps \deq \sup_{x\,\in\,\R^d} \lambda_{\max}\left(\nabla^2\varphi_\varepsilon(x)\right)\,.
\end{align*}
From~\eqref{eq:main_thm_upper_bd} and the fact that $\psi_\eps$ is convex (Lemma~\ref{lem: hessians_pi}), it follows that $L_\eps$ is finite: $L_\eps \le {(\eps \alpha_W)}^{-1}$. Then, we have shown
\begin{align*}
\lambda_{\max}\left(\nabla^2 \varphi_\varepsilon(x)\right) &\le \left(\left(L_\eps + \varepsilon \beta_V \right)^{-1} + \varepsilon \alpha_W\right)^{-1}\,.
\end{align*}
Taking the supremum over $x \in \R^d$,
\begin{align*}
L_\eps &\le \left(\left(L_\eps + \varepsilon \beta_V\right)^{-1} + \varepsilon \alpha_W\right)^{-1}\,.
\end{align*}
Solving the inequality yields
\begin{align}\label{eq:Leps_bd}
L_\eps \leq \frac{1}{2} \,\left(\sqrt{4\beta_V/\alpha_W + \eps^2\beta_V^2} - \eps\beta_V\right)\,.
\end{align}

\textbf{Lower bound.} The lower bound argument is symmetric, but we give the details for completeness. Using Lemma~\ref{lem: hessians_pi} and the Cramér--Rao inequality (Lemma~\ref{lem:key}),
\begin{align*}
\nabla^2 \varphi_\varepsilon(x) &= \eps^{-1} \cov_{Y\sim \pi_\eps^x}(Y) \\
&\succeq \eps^{-1} \, \left(\E_{Y\sim\pi_\eps^x}\left[\eps^{-1} \, \nabla^2 \psi_\eps(Y) + \nabla^2 W(Y) \right]\right)^{-1} \\
&\succeq \left(\E_{Y\sim\pi_\eps^x}\left[\nabla^2 \psi_\eps(Y) + \eps \beta_W I\right]\right)^{-1}\,.
\end{align*}
Applying Lemma~\ref{lem: hessians_pi} and the Brascamp--Lieb inequality (Lemma~\ref{lem:key}),
\begin{align*}
\nabla^2 \psi_\eps(Y) &= \eps^{-1} \cov_{X\sim \pi_\eps^Y}(X) \\
&\preceq \eps^{-1} \E_{X\sim\pi_\eps^Y}\left[\left(\eps^{-1} \, \nabla^2 \varphi_\eps(X) + \nabla^2 V(X) \right)^{-1} \right] \\
&\preceq \E_{X\sim\pi_\eps^Y}\left[\left(\nabla^2 \varphi_\eps(X) + \eps \alpha_V I \right)^{-1} \right]\,.
\end{align*}
Combining the two inequalities and setting
\begin{align*}
\ell_\eps & \deq \inf_{x\,\in\,\Omega_P} \lambda_{\min}\left(\nabla^2 \varphi_\eps(x)\right)\,,
\end{align*}
we deduce that
\begin{align*}
\ell_\eps &\ge \left(\left(\ell_\eps + \eps \alpha_V\right)^{-1} + \eps \beta_W\right)^{-1}\,.
\end{align*}
On the other hand, from Lemma~\ref{lem: hessians_pi}, we know that $\ell_\eps \ge 0$. Solving the inequality then yields
\begin{align*}
\ell_\eps &\ge \frac{1}{2} \, \left(\sqrt{4\alpha_V/\beta_W +\eps^2 \alpha_V^2} - \eps \alpha_V\right)\,. \qedhere
\end{align*}
\end{proof}

Next, we rigorously deduce Caffarelli's contraction theorem from Theorem~\ref{thm:main}.

\begin{proof}[Proof of Caffarelli's contraction (Theorem~\ref{thm: caf_con})]
For every $\varepsilon > 0$, by Theorem~\ref{thm:main}, we have shown that $\nabla^2 \varphi_\varepsilon \preceq L_\varepsilon I$, with $L_\eps$ as in~\eqref{eq:Leps_bd}. Equivalently, this can be reformulated as saying that $\frac{L_\varepsilon \, \norm \cdot^2}{2} - \varphi_\varepsilon$ is convex. Fix some $\delta > 0$; in particular, for $\varepsilon$ sufficiently small, $\frac{(\sqrt{\beta_V/\alpha_W} + \delta) \, \norm \cdot^2}{2} - \varphi_\varepsilon$ is convex.

Upon passing to a sequence $\varepsilon_k \searrow 0$, existing results on the convergence of entropic optimal transport potentials show that $\varphi_{\varepsilon_k} \to \varphi_0$ in $L^1(P)$ (see Theorem~\ref{thm: nutzweiselthm}). Passing to a further subsequence, we obtain $\varphi_{\varepsilon_k} \to \varphi_0$ ($P$-almost surely). It follows that $\frac{(\sqrt{\beta_V/\alpha_W} + \delta) \, \norm \cdot^2}{2} - \varphi_0$ is convex for every $\delta > 0$ (see the remark after~\cite[Theorem~25.7]{rockafellar1997convexanalysis}), and thus for $\delta = 0$.
\end{proof}

\begin{rema}
Our main theorem provides both upper and lower bounds for $\nabla^2 \varphi_\eps$. In the case when $\eps = 0$, the lower bound follows from the upper bound. Indeed, if $\varphi_0$ is the Brenier potential for the optimal transport from $P$ to $Q$, then the convex conjugate $\varphi_0^*$ is the Brenier potential for the optimal transport from $Q$ to $P$. By applying Caffarelli's contraction theorem to $\varphi_0^*$ and appealing to convex duality, it yields a lower bound on $\nabla^2 \varphi_0$. However, we are not aware of a method of deducing the lower bound from the upper bound for positive values of $\eps$.
\end{rema}

\begin{rema}
In Appendix~\ref{sec: gaussian_case}, by inspecting the Gaussian case, we show that Theorem~\ref{thm:main} is sharp for every $\eps > 0$.
\end{rema}

\begin{rema}
In the proof of Theorem~\ref{thm:main}, we do not use the full force of the Brascamp--Lieb inequality. Rather, we use the covariance inequality in Lemma~\ref{lem:key} which is a corollary of the usual Brascamp--Lieb inequality obtained by applying it to linear test functions.
\end{rema}

An inspection of the proof of the upper bound in Theorem~\ref{thm:main} reveals the following more general pair of inequalities.

\begin{prop}\label{prop:main}
Let $(P, Q)$ be probability measures satisfying our regularity conditions. Then, for all $x \in \R^d$, $y \in \Omega_Q$,
\begin{align*}
\nabla^2 \varphi_\eps(x) &\preceq \E_{Y\sim \pi_\eps^x}\left[\left(\nabla^2 \psi_\eps(Y) + \eps \, \nabla^2 W(Y)\right)^{-1}\right]\,, \\
\nabla^2 \psi_\eps(y) &\succeq \left(\E_{X\sim\pi_\eps^y}\left[\nabla^2 \varphi_\eps(X) + \eps \, \nabla^2 V(X)\right] \right)^{-1}\,.
\end{align*}
\end{prop}

In the next section, we use these inequalities to prove a generalization of Caffarelli's theorem.

\section{A generalization to commuting positive definite matrices}\label{scn:commuting_matrices}
In the next result, we replace the main assumptions of Caffarelli's theorem, namely $\nabla^2 V \preceq \beta_V I$ and $\nabla^2 W \succeq \alpha_W I$, by the conditions
\begin{align}\label{eq:general_matrices}
\nabla^2 V \preceq A^{-1} \quad\text{and}\quad \nabla^2 W \succeq B^{-1}\,,
\end{align}
where $A$ and $B$ are commuting positive definite matrices. Recall that the Hessian of the Brenier potential between the Gaussian distributions $\mc N(0, A)$ and $\mc N(0,B)$ is the matrix $A^{-1/2} B^{1/2}$~\cite{gelbrich1990formula}. In light of this observation, the following theorem is sharp for every pair of commuting positive definite $(A,B)$, and shows that the Brenier potential between Gaussians achieves the largest possible Hessian among all source and target measures obeying the constraint~\eqref{eq:general_matrices}.

\begin{theo}\label{thm:commuting_matrices}
Let $(P,Q)$ satisfy our regularity conditions as well as the condition~\eqref{eq:general_matrices}. Then, the Hessian of the Brenier potential satisfies the uniform bound: for all $x\in \R^d$, it holds that
\begin{align*}
\nabla^2 \varphi_0(x)
\preceq A^{-1/2} B^{1/2}\,.
\end{align*}
\end{theo}

As in Theorem~\ref{thm:main}, the proof technique also yields a lower bound on $\nabla^2 \varphi_0$ under appropriate assumptions. We omit this result because it is straightforward.

\begin{proof}
Let $C_\varepsilon$ be the smallest constant $C \ge 0$ such that $\nabla^2 \varphi_\eps(x) \preceq A^{-1/2} B^{1/2} + C I$ for all $x\in \R^d$. In light of Theorem~\ref{thm:main}, $C_\eps$ is well-defined and finite. Equivalently,
\begin{align*}
C_\eps &= \sup_{x\,\in\,\R^d} \sup_{e\,\in\,\R^d, \; \norm e = 1}{\left\langle e, \left[\nabla^2 \varphi_\eps(x) - A^{-1/2} B^{1/2}\right] \, e\right\rangle}\,.
\end{align*}
Let $(x, e)$ achieve the above supremum. Using our assumptions and Proposition~\ref{prop:main}, we obtain
\begin{align*}
C_\eps &= \left\langle e, \left[\nabla^2 \varphi_\eps(x) - A^{-1/2} B^{1/2}\right] \, e \right\rangle \\
&\le \left\langle e, \left[\left(\E_{Y\sim \pi_\eps^x} \nabla^2 \psi_\eps(Y) + \eps B^{-1}\right)^{-1} - A^{-1/2} B^{1/2} \right] \, e \right\rangle \\
&\le \left\langle e, \left[\left(\left(A^{-1/2} B^{1/2} + C_\eps I + \eps A^{-1}\right)^{-1} + \eps B^{-1}\right)^{-1} - A^{-1/2} B^{1/2} \right] \, e \right\rangle\,.
\end{align*}
From our assumptions and Theorem~\ref{thm:main}, we know that the spectrum of $M_\eps \deq A^{-1/2} B^{1/2} + C_\eps I$ is bounded away from zero and infinity as $\eps \searrow 0$, which justifies the Taylor expansion
\begin{align*}
\left(\left(M_\eps + \eps A^{-1}\right)^{-1} + \eps B^{-1}\right)^{-1} &= \left(M_\eps^{-1} - \eps M_\eps^{-1} A^{-1} M_\eps^{-1} + \eps B^{-1} + O\left(\eps^2\right)\right)^{-1} \\
&= M_\eps + \eps A^{-1} - \eps M_\eps B^{-1} M_\eps + O\left(\eps^2\right)I\,.
\end{align*}
Hence,
\begin{align*}
C_\eps &\le \left\langle e, \left[M_\eps + \eps A^{-1} - \eps M_\eps B^{-1} M_\eps + O\left(\eps^2\right)I - A^{-1/2} B^{1/2}\right] \, e \right\rangle \\
&\le C_\eps + \eps \, \left\langle e, \left[A^{-1} - M_\eps B^{-1} M_\eps\right] \, e \right\rangle + O\left(\eps^2\right) \\
&= C_\eps - \eps \, \left\langle e, \left[2 C_\eps A^{-1/2} B^{-1/2} + C_\eps^2 B^{-1}\right] \, e \right\rangle + O\left(\eps^2\right)\,.
\end{align*}
This shows that $\lim_{\eps\searrow0} C_\eps = 0$ (otherwise ${(C_\eps)}_{\eps\,>\,0}$ would have a strictly positive cluster point which would contradict the above inequality for small enough $\eps > 0$).

By combining this fact with convergence of the entropic Brenier potentials as in the proof of Theorem~\ref{thm: caf_con}, we deduce the result.
\end{proof}

Next, we show how our theorem recovers and extends a result of Valdimarsson~\cite{valdimarsson2007otpotential}, which was used to derive new forms of the Brascamp--Lieb inequality.\footnote{This is a different Brascamp--Lieb inequality than the one in Lemma~\ref{lem:key}.}

\begin{theo}\label{thm: ext_valdimarsson}
Suppose that
\begin{itemize}
\item $\bar A$, $\bar B$, and $G$ are positive definite matrices;
\item $\bar A \preceq G$ and $\bar B$ commutes with $G$;
\item $P = \exp(-\tilde{V}) * \mu$, where $\nabla^2 \tilde{V} \preceq \bar{B}^{-1}G$, $*$ denotes convolution, and $\mu$ is an arbitrary probability measure on $\Rd$;
\item $Q = \exp(-W)$ with $\nabla^2 W \succeq \bar B^{-1/2} \bar A^{-1}\bar B^{-1/2}$.
\end{itemize}
Then, the Brenier potential satisfies $\nabla^2 \varphi_0 \preceq G$.
\end{theo}
\begin{rema}
Valdimarsson's original result required that $P = \mc N(0, \bar B G^{-1}) * \mu$.
\end{rema}

To prove this result, we check that convolution with any probability measure only makes the density more log-smooth.

\begin{lemm}\label{lem: smoothness_lemma}
Let $\widetilde P \propto \exp(-\widetilde V)$ be a probability measure, where $\widetilde V : \R^d\to\R$ is twice continuously differentiable. Let $P \deq \widetilde P * \mu = \exp(-V)$ where $\mu$ is any probability measure on $\R^d$. Suppose that for some positive definite matrix $A^{-1}$, we have $\nabla^2 \widetilde V \preceq A^{-1}$. Then, $\nabla^2 V \preceq A^{-1}$ as well.
\end{lemm}
\begin{proof}
An elementary computation shows that if we define the probability measure
\begin{align*}
\nu_y(\dd x) & \deq \frac{\exp\left(-\widetilde V(y-x)\right) \, \mu(\dd x)}{\int \exp\left(-\widetilde V(y-x')\right) \, \mu(\dd x')}
\end{align*}
then
\begin{align*}
\nabla^2 V(y) &= \E_{X \sim \nu_y}\left[\nabla^2 \widetilde V(y-X)\right] - \cov_{X\sim \nu_y}\left(\nabla \widetilde V(y-X)\right)\,,
\end{align*}
from which the result follows.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{thm: ext_valdimarsson}]
Under Lemma~\ref{lem: smoothness_lemma} and the third assumption, it holds that $P \propto \exp(-V)$ with $\nabla^2 V \preceq \bar{B}^{-1}G$. The other assumptions imply that $Q \propto \exp(-W)$ with
\begin{align*}
\nabla^2 W \succeq \bar{B}^{-1/2}\bar{A}^{-1}\bar{B}^{-1/2} \succeq \bar{B}^{-1/2}G^{-1}\bar{B}^{-1/2} = \bar{B}^{-1}G^{-1}\,.
\end{align*}
By Theorem~\ref{thm:commuting_matrices}, it holds that $\nabla^2 \phi_0 \preceq G$.
\end{proof}


\begin{rema}
It is natural to ask whether Theorem~\ref{thm:commuting_matrices} can be obtained by first applying Caffarelli's contraction theorem to show that the optimal transport map $\widetilde T_0$ between the measures $(A^{-1/2})_\sharp P$ and $(B^{-1/2})_\sharp Q$ is $1$-Lipschitz, and then considering the mapping $T_0(x) \deq B^{1/2} \widetilde T_0(A^{-1/2} x)$. Although $T_0$ is indeed a valid transport mapping from $P$ to $Q$, under our assumptions $\nabla T_0$ is not guaranteed to be symmetric, so it does not make sense to ask that $\nabla T_0 \preceq A^{-1/2} B^{1/2}$.

In Valdimarsson's application to Brascamp--Lieb inequalities, it is crucial that the transport map $T_0$ is chosen so that $\nabla T_0$ is a symmetric positive definite matrix. Symmetry of $\nabla T_0$ implies that $T_0$ is the gradient $\nabla \phi_0$ of a function $\phi_0 : \R^d\to\R$, and positive definiteness implies that $\phi_0$ is convex. By Brenier's theorem, the unique gradient of a convex function that pushes forward $P$ to $Q$ is the optimal transport map. Thus, it is crucial that we consider the \emph{optimal} transport map here; in particular, alternative maps such as the ones in~\cite{kim2012generalization, mikulincer2021brownian} cannot be applied.
\end{rema}

\section{Discussion}

We have proven a generalization of Caffarelli's celebrated theorem on the Lipschitz properties of the optimal transport map to the setting of entropic optimal transport using two complementary covariance inequalities (the Brascamp--Lieb inequality and the Cramér--Rao inequality).

We conjecture that our proof technique can also be used to recover the bounds on the moment measure mapping in~\cite{klartag2014momentmeasure}, provided that the existence of an ``entropic moment measure'' can be established (with convergence towards the true moment measure as the regularization tends to zero). As this is outside the scope of this work, we do not pursue this question here.

\section*{Acknowledgements}
The authors thank Ramon van Handel, Jonathan Niles-Weed, Philippe Rigollet, and anonymous reviewers for helpful comments.


\begin{appendix}
\section{Proof of the Cramér--Rao lower bound}\label{sec:cr_proof}

\begin{proof}[Proof of Lemma~\ref{lem:key}, Cramér--Rao inequality]
For any smooth and compactly supported test function $h : \R^d\to\R$, integration by parts yields
\begin{align*}
\E_P \nabla h &= \int \nabla h \, \dd P = -\int (h \, \nabla \ln P) \, \dd P = \int \left(h - \E_P h\right) \, \nabla V \, \dd P\,,
\end{align*}
where we used the fact that $\E_P \nabla \ln P = 0$. Therefore,
\begin{align}\label{eq:cr1}
\left\langle \E_P \nabla h, \left(\E_P \nabla^2 V\right)^{-1} \, \E_P \nabla h\right\rangle &= \int \left(h - \E_P h\right) \, \left\langle \nabla V, \left(\E_P \nabla^2 V\right)^{-1} \, \E_P \nabla h \right\rangle \, \dd P\,.
\end{align}
Applying the Cauchy--Schwarz inequality,
\begin{align*}
\eqref{eq:cr1} &\le \sqrt{\left(\var_P h\right) \int \left\langle \E_P \nabla h, {\left(\E_P \nabla^2 V\right)}^{-1} \, {\left(\nabla V\right)}^{\otimes 2} \, {\left(\E_P \nabla^2 V\right)}^{-1}\, \E_P \nabla h \right\rangle \, \dd P}\,.
\end{align*}
Integration by parts shows that $\int {\left(\nabla V\right)}^{\otimes 2} \, \dd P = \int \nabla^2 V \, \dd P$, and upon rearranging we deduce that
\begin{align}\label{eq:cr2}
\var_P h &\ge \left\langle \E_P \nabla h, \left(\E_P \nabla^2 V\right)^{-1} \, \E_P \nabla h\right\rangle\,.
\end{align}
By approximation, this continues to hold for any locally Lipschitz $h : \R^d\to\R$ with $\E_P\norm{\nabla h} < \infty$.

Specializing the inequality~\eqref{eq:cr2} to $h \deq \langle e, \cdot\rangle$ for a unit vector $e\in\R^d$ then recovers the Cramér--Rao inequality of Lemma~\ref{lem:key}.
\end{proof}

\section{Gaussian case}\label{sec: gaussian_case}

Suppose $P = \clN(0,A)$ and $Q = \clN(0,B)$ are Gaussians. Then, it is known that the Hessian of the Brenier potential is given by~\cite{gelbrich1990formula}
\begin{align*}
\nabla^2\varphi_0(x) = A^{-1/2} \, {\left(A^{1/2}BA^{1/2}\right)}^{1/2} \,A^{-1/2}\,.
\end{align*}
If we have
\begin{align*}
A^{-1} \preceq \beta I \qquad\text{and}\qquad B^{-1} \succeq \alpha I \succ 0\,,
\end{align*}
then Caffarelli's contraction theorem (Theorem~\ref{thm: caf_con}) implies
\begin{align*}
\norm{\nabla^2\phi_0}_{\opp} \leq \sqrt{\beta/\alpha}\,.
\end{align*}
This matches the bound of~\cite[Lemma~2]{altschuler2021averaging}.

For $\eps > 0$, the upper bound from Theorem~\ref{thm:main} implies
\begin{align}\label{eq:gaussian_eps}
\norm{\nabla^2 \phi_\eps}_{\opp} \leq \frac{1}{2} \, \left(\sqrt{4\beta/\alpha + \eps^2 \beta^2} - \eps \beta \right)\,.
\end{align}
On the other hand, from~\cite{janati2020entropic, mallasto2021entropy}, it is known that
\begin{align*}
\nabla^2 \phi_\eps(x) &= A^{-1/2} \, \left(A^{1/2} B A^{1/2} + \frac{\eps^2}{4} \, I\right)^{1/2} \, A^{-1/2} - \frac{\eps}{2} \, A^{-1}\,.
\end{align*}
In particular, if we take $A = \beta^{-1} I$ and $B = \alpha^{-1} I$, then~\eqref{eq:gaussian_eps} is an equality. Hence, Theorem~\ref{thm:main} is sharp for every $\eps > 0$.
\end{appendix}



\bibliographystyle{crplain}
\bibliography{crmath20221110}
\end{document}
