%~Mouliné par MaN_auto v.0.28.0 2023-07-31 12:20:10
\documentclass[CRMATH,Unicode,XML]{cedram}

\TopicFR{Analyse numérique}
\TopicEN{Numerical analysis}

% Math operators whose limits are set below/above in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\Span}{Span}

% Editorial marker used to flag unreachable URLs in red.
\newcommand\deadlink{{\color{red}DEAD LINK!}}
\graphicspath{{./figures/}}

% Bold one, used as the indicator-function symbol (\un_j in Section 3).
\newcommand\un{\mathbf{1}}

%Widebar

\makeatletter
%% \widebar: an \overline variant whose bar is horizontally centered over
%% slanted symbols, built on the amsmath accent machinery (\macc@... macros).
\let\save@mathaccent\mathaccent
%Detect whether the argument is a single symbol: compare the accent height of
%the bare argument (#1) with that of the argument preceded by \kern0pt; they
%match only for a single symbol. Then expand #3 (single) or #2 (multiple).
\newcommand*\if@single[3]{%
  \setbox0\hbox{${\mathaccent"0362{#1}}^H$}%
  \setbox2\hbox{${\mathaccent"0362{\kern0pt#1}}^H$}%
  \ifdim\ht0=\ht2 #3\else #2\fi
  }
%The bar will be moved to the right by a half of \macc@kerna, which is computed by amsmath:
\newcommand*\rel@kern[1]{\kern#1\dimexpr\macc@kerna}
%If there's a superscript following the bar, then no negative kern may follow the bar;
%an additional {} makes sure that the superscript is high enough in this case:
\newcommand*\widebar[1]{\@ifnextchar^{{\wide@bar{#1}{0}}}{\wide@bar{#1}{1}}}
%Use a separate algorithm for single symbols:
\newcommand*\wide@bar[2]{\if@single{#1}{\wide@bar@{#1}{#2}{1}}{\wide@bar@{#1}{#2}{2}}}
%Internal worker: #2 = 1 unless a superscript follows, #3 = 1 for a single
%symbol and 2 for a multi-symbol argument.
\newcommand*\wide@bar@[3]{%
  \begingroup
  \def\mathaccent##1##2{%
%Enable nesting of accents:
    \let\mathaccent\save@mathaccent
%If there's more than a single symbol, use the first character instead (see below):
    \if#32 \let\macc@nucleus\first@char \fi
%Determine the italic correction:
    \setbox\z@\hbox{$\macc@style{\macc@nucleus}_{}$}%
    \setbox\tw@\hbox{$\macc@style{\macc@nucleus}{}_{}$}%
    \dimen@\wd\tw@
    \advance\dimen@-\wd\z@
%Now \dimen@ is the italic correction of the symbol.
    \divide\dimen@ 3
    \@tempdima\wd\tw@
    \advance\@tempdima-\scriptspace
%Now \@tempdima is the width of the symbol.
    \divide\@tempdima 10
    \advance\dimen@-\@tempdima
%Now \dimen@ = (italic correction / 3) - (width / 10)
    \ifdim\dimen@>\z@ \dimen@0pt\fi
%The bar will be shortened in the case \dimen@<0 !
    \rel@kern{0.6}\kern-\dimen@
    \if#31
      \overline{\rel@kern{-0.6}\kern\dimen@\macc@nucleus\rel@kern{0.4}\kern\dimen@}%
      \advance\dimen@0.4\dimexpr\macc@kerna
%Place the combined final kern (-\dimen@) if it is >0 or if a superscript follows:
      \let\final@kern#2%
      \ifdim\dimen@<\z@ \let\final@kern1\fi
      \if\final@kern1 \kern-\dimen@\fi
    \else
      \overline{\rel@kern{-0.6}\kern\dimen@#1}%
    \fi
  }%
  \macc@depth\@ne
  \let\math@bgroup\@empty \let\math@egroup\macc@set@skewchar
  \mathsurround\z@ \frozen@everymath{\mathgroup\macc@group\relax}%
  \macc@set@skewchar\relax
  \let\mathaccentV\macc@nested@a
%The following initialises \macc@kerna and calls \mathaccent:
  \if#31
    \macc@nested@a\relax111{#1}%
  \else
%If the argument consists of more than one symbol, and if the first token is
%a letter, use that letter for the computations:
    \def\gobble@till@marker##1\endmarker{}%
    \futurelet\first@char\gobble@till@marker#1\endmarker
    \ifcat\noexpand\first@char A\else
      \def\first@char{}%
    \fi
    \macc@nested@a\relax111{\first@char}%
  \fi
  \endgroup
}
\makeatother

% The four \mathchoice branches are (display, text, script, scriptscript).
% \bar: use the wide bar everywhere except in scriptscript style.
\let\oldbar\bar
\renewcommand*{\bar}[1]{\mathchoice{\widebar{#1}}{\widebar{#1}}{\widebar{#1}}{\oldbar{#1}}}

% Shorthand manual math kerns: \mk/\Mk negative 1mu/2mu, \mK/\MK positive.
\newcommand*{\mk}{\mkern -1mu}
\newcommand*{\Mk}{\mkern -2mu}
\newcommand*{\mK}{\mkern 1mu}
\newcommand*{\MK}{\mkern 2mu}

% Hyperlink colors: external URLs purple, internal links blue, citations red.
\hypersetup{urlcolor=purple, linkcolor=blue, citecolor=red}

% \tilde and \hat: wide accents in display and text styles only; keep the
% narrow accents in both script sizes.
\let\oldtilde\tilde
\renewcommand*{\tilde}[1]{\mathchoice{\widetilde{#1}}{\widetilde{#1}}{\oldtilde{#1}}{\oldtilde{#1}}}
\let\oldhat\hat
\renewcommand*{\hat}[1]{\mathchoice{\widehat{#1}}{\widehat{#1}}{\oldhat{#1}}{\oldhat{#1}}}

% Arrows: long variants in display style, ordinary ones otherwise.
\let\oldmapsto\mapsto
\renewcommand*{\mapsto}{\mathchoice{\longmapsto}{\oldmapsto}{\oldmapsto}{\oldmapsto}}
\renewcommand*{\to}{\mathchoice{\longrightarrow}{\rightarrow}{\rightarrow}{\rightarrow}}

\makeatletter
% Override of the class hook that typesets the material placed after the
% author block (presumably the abstract paragraph in the cedram class --
% confirm against cedram.cls). It is set raggedright at \Small size in a
% 12cm-plus-margin measure, indented by \abstractmarginL, with a small
% amount of vertical glue before and after.
\def\@setafterauthor{%
  \vglue3mm%
%  \hspace*{0pt}%
\begingroup\hsize=12cm\advance\hsize\abstractmarginL\raggedright
\noindent
%\hspace*{\abstractmarginL}\begin{minipage}[t]{10cm}
   \leftskip\abstractmarginL
  \normalfont\Small
  \@afterauthor\par
\endgroup
\vskip2pt plus 3pt minus 1pt
}
\makeatother


\title{A convergent Deep Learning algorithm for approximation of polynomials}

\author{\firstname{Bruno} \lastname{Despr\'es}}
\address{Laboratoire Jacques-Louis Lions, Sorbonne Universit\'e, 4 place Jussieu, 75005 Paris, France}
\email[B. Despr\'es]{bruno.despres@sorbonne-universite.fr}

%~\keywords{Deep Learning, polynomials, Neural Networks, convergence}
\subjclass{65Q20, 65Y99, 78M32}

\begin{abstract}
We start from the contractive functional equation proposed in~\cite{desp:anc}, where it was shown that the polynomial solution of functional equation can be used to initialize a Neural Network structure, with a controlled accuracy. We propose a novel algorithm, where the functional equation is solved with a converging iterative algorithm which can be realized as a Machine Learning training method iteratively with respect to the number of layers. The proof of convergence is performed with respect to the $L^\infty$ norm. Numerical tests illustrate the theory and show that stochastic gradient descent methods can be used with good accuracy for this problem.
\end{abstract}

\begin{document}

\maketitle

\section{Introduction}

Neural Networks representations of real monovariate polynomials defined on the closed segment $x\in I=[0,1]$ play a central role in the numerical analysis of Neural Networks (one can refer to~\cite{devore, desp:anc, spici, lecun:quand, bobo, schwabi}). A central result is the Yarotsky Theorem~\cite{yaya} which provides an approximation result of general functions, by means of a specific Neural Network approximation of the polynomial $x\mapsto x^2$ where the activation function is ReLU $R(x)=\max(0,x)$. This specific Neural Network can have an arbitrarily large number of hidden layers, so it provides a simple example of a Deep Neural Network with perfectly known coefficients.

However, as pointed out by Ronald DeVore in 2019~\cite{vovo}, the stability of the approximation of polynomial functions by Deep Neural Networks (i.e. with many hidden layers) is not addressed in the current theory~\cite{goodfellow}. In particular it is already not the case for the Deep Neural Network which approximates the polynomial $x\mapsto x^2$. To the best of the understanding of the author of this Note, it is because all recent developments are devoted to abstract approximation theory and are scarcely related to constructive algorithms.

The present Note provides a positive answer to the Ronald DeVore's remark, by showing that there exists an algorithm with the following properties:
\begin{itemize}
\item the algorithm constructs a series of functions which intend to approximate given real polynomials in the reference segment $I$.
\item under conditions, the algorithm is proved to be convergent in the $L^\infty$ norm.
\item it can be implemented in a Neural Networks/Machine Learning platform. The iterations of the algorithm directly correspond to the number of hidden layers. The number of neurons which are trained per iteration of the algorithm is constant.
\item numerical tests implemented in Tensorflow/Keras/Python~\cite{chollet} for very simple polynomials confirm the theoretical properties in terms of convergence and stability.
\end{itemize}

Even if restricted to extremely simple academic functions, the Deep Learning algorithm presented in this Note seems the first one with a proof of convergence in $L^\infty$ norm, where the iteration parameter is identical to the layer index. The reason is that the proposed formulation is not doomed with the curse of non convex optimization problems. Instead it solves a series of convex minimization problems. The convergence is ensured by the underlying contractive structure explained in~\cite{desp:anc}. In the language of Neural Networks/Machine Learning, this Deep Network is trained by means of a series of Shallow Networks (for which a comprehensive theory with Barron's functions emerges in~\cite{weinan}).

The Note is organized as follows. The setting of the problem is presented in Section~\ref{sec:2}. Discussion of a sharper contraction constant is the matter of Section~\ref{sec:3}. Section~\ref{sec:4} is the core of the Note where we define a specific Machine Learning algorithm as a series of convex minimization problems. In Section~\ref{sec:5}, we show with an example that both contraction constants of Sections~\ref{sec:2} and~\ref{sec:3} are non optimal. Final Section~\ref{sec:6} is devoted to numerical tests which illustrate the theoretical properties. A discussion of the complexity of the algorithm and the presentation of some open problems are proposed at the end.


\section{A functional equation} \label{sec:2}

The notations and results are borrowed from a previous work~\cite{desp:anc}. They form the foundations on which the algorithm will be developed and justified.

The set of real polynomials is $ P^n =\left\{ p \text{ of degree }\leq n
\right\}$. The set of continuous functions $C^0(I)$ over $I$ is equipped with the maximum norm $\|f\|_{L^\infty(I)}=\max_{x\in I}|f(x)|$. We consider a subdivision in $m\geq 1$ subintervals $[x_j, x_{j+1}]$ where $ 0=x_0<x_1<\dots < x_j < \dots < x_m=1$ with $ x_j=j\Delta x$ and $ \Delta x=1/m$. The set of continuous piecewise linear functions is
\[
V_h=\left\{ u\in C^0(I), \ u_{\mid (x_j, x_{j+1})}\in P^1\text{ for all }0\leq j \leq m-1
\right\}.
\]
As in~\cite{desp:anc}, we define the subset $E_h\subset V_h$
\[
E_h=\left\{ u\in V_h\ : \ u(I)\subset I, \ u \text{ is non constant on exactly one subinterval}\right\}.
\]
Functions in this set are called basis functions because of their central role, even if they are not classical Finite element basis functions. Finite Elements in $V_h$ or in $E_h$ are easy to implement with ReLU ($R(x)=\max(0,x)$) and with TReLU (ReLU with threshold) activation functions. The function TReLU is described in the Keras online documentation \url{https://keras.io/api/layers/activations/}.

Once a real polynomial function $H\in P^n$ is given, the following problem is considered~\cite{desp:anc}.

\begin{enonce}{Problem}\label{prob:1}
Find $\left(e_0, e_1, \dots, e_r, \beta_1, \dots, \beta_r\right)\in V_h\times (E_h)^r\times \mathbb R^r$ such that the identity below holds
\begin{equation}\label{eq:1}
H(x)=e_0(x)+ \sum_{i=1}^r \beta_i H\circ e_i(x),\quad x\in I,
\end{equation}
with the contraction condition
\begin{equation}\label{eq:1_r}
K<1, \qquad K=\sum_{i=1}^r |\beta_i|.
\end{equation}
\end{enonce}

If $n=1$ the problem becomes trivial. That is why we only consider $ n\geq 2$. The classical example~\cite{devore, hama, yaya} is $H(x)=x(1-x)$ which satisfies $H(x)=\frac14 g(x)+\frac14 H(g(x)) $ where $g$ is the normalized finite element function: $g(x)=2x$ for $0\leq x \leq \frac12$ and $g(x)=2(1-x)$ for $\frac12 \leq x \leq 1$. Set $e_1(x)=\min (2x,1)$ and $e_2(x)=\min (2(1-x), 1)$ with $e_1,e_2\in E_{h}$ for $h=1/2$. One obtains $H(x)=e_0(x)+\frac14 H(e_1(x))+ \frac14 H(e_2(x))$ where $ e_0(x)=\frac14 g(x) $. The contraction property~\eqref{eq:1_r} is satisfied with a constant $\sum |\beta_i|=\frac14+\frac14=\frac12$.

Contrary to~\cite{desp:anc}, we now completely specify the basis functions in order to optimize some approximation properties and to make the subsequent implementation explicit. We use a double index notation where $e_s=e_{j,k}$ for $1\leq s \leq r$, $1\leq j \leq m$ and $1\leq k\leq n-1$. The basis functions are translated one from the other (this property was already stated in~\cite{desp:anc})
\[
e_{j+1,k}(x)=e_{j,k}(x-\Delta x),
\]
so the basis functions are completely determined by basis functions $e_{1,k}$. For $1\leq k \leq n-1$ we take in this work
\begin{equation}\label{eq:new1}
\begin{cases}
e_{1,k}(x)=\frac{k-1}{2(n-1)}, & x\leq 0, \\
e_{1,k}(x)=\frac{nm}{2(n-1)} x +\frac{k-1}{2(n-1)}, & 0\leq x \leq 1/m\\
e_{1,k}(x)=\frac{n+k-1}{2(n-1)}, & 1/m \leq x.
\end{cases}
\end{equation}
It makes a total of $r$ basis functions with
\begin{equation}\label{eq:moi}
r=m(n-1).
\end{equation}

\begin{lemm}
For all possible indices $1\leq j \leq m$ and $ 1\leq k \leq n-1$, one has $e_{j,k}\in E_h$.
\end{lemm}

\begin{proof}
Once the claim is proved for $e_{1,k}$, it will be generalized by translation to the other basis functions. By construction $e_{1,k}$ is continuous and piecewise affine. The left value is non negative $e_{1,k}(0)=\frac{k-1}{2(n-1)}\geq 0$. The right value is $ e_{1,k}(1/m)=
\frac{nm}{2(n-1)} \times \frac1m +\frac{k-1}{2(n-1)}= \frac{n+k-1}{2(n-1)}\leq \frac{n+n-2}{2(n-1)} \leq 1 $. Therefore $x\mapsto e_{1,k}(x)$ takes its values between 0 and 1, which shows the claim.
\end{proof}

We remind the fundamental result proved recently in~\cite{desp:anc}.

\begin{theo}\label{theor:1}
Let $H\in P^n$ with more precisely $\deg(H)=n$. There exists a threshold value $ m_*$ such that the functional equation~\eqref{eq:1} has a solution with the contraction property~\eqref{eq:1_r} for all $m\geq m_*$.
\end{theo}

\begin{proof}
For sake of completeness, we provide a short proof adapted to the new notations.

%$\bullet$
Consider the difference $q=H-\sum_{j=1}^m \sum_{k=1}^{n-1}\beta_{j,k}H\circ e_{j,k}$ where the coefficients are still unknowns at this stage. The second derivative of $H$ is noted $p=H''$. In all intervals, one can calculate the second derivative $q''\in P^{n-2}$. In the interval $I_j=[j/m, (j+1)/m]$ one has
\[
q''=p-\sum_{k=1}^{n-1}\beta_{j,k} \left(\frac{nm}{2(n-1)} \right)^2 p\circ e_{j,k}
\]
The objective is to find coefficients $\beta_{j,k} $ such that $q''=0$ in $I_j$.

%$\bullet$
The equality $q''=0$ in $I_j$ is equivalent to $\frac{d^r}{dx^r} q''(x_j)=0$ for $r=0, \dots, n-2$, because $q''$ is a polynomial of degree $n-2$ in $I_j$. It yields a square linear system
\[
\sum_{k=1}^{n-1}\left[\left(\frac{nm}{2(n-1)}\right)^{2+r} p^{(r)}\left(\frac{k-1}{2(n-1)} \right) \right] \beta_{j,k} =p^{(r)}(x_j), \qquad 0\leq r\leq n-2.
\]
The system is reorganized as
\begin{equation}\label{eq:new3}
\sum_{k=1}^{n-1} p^{(r)}\left(\frac{k-1}{2(n-1)} \right)
\times
\beta_{j,k} = \left(\frac{2(n-1)} {nm}\right)^{2+r} p^{(r)}(x_j), \qquad 0\leq r\leq n-2.
\end{equation}

%$\bullet$
The linear system~\eqref{eq:new3} can be recast under the form $M\beta=b$ where the unknown $\beta$ is the collection of the $\beta_{j,k}$ for $1\leq k \leq n-1$, the matrix $M\in \mathcal M_{n-1}(\mathbb R)$ is of Vandermonde type and the source $b$ is provided by the right hand side of~\eqref{eq:new3}. Since the polynomial $p\in P^{n-2}$ is non zero, the derivatives run from $r=0$ to $r=n-2$, and the $n-1$ evaluation points $\frac{k-1}{2(n-1)}$ are pairwise distinct (between $0$ and $\frac12$), then it is a classical matter to show that $\det(M)\neq 0$. The coefficients of the matrix on the left hand side of the linear system do not depend on the interval index $j$, so the matrix $M$ depends neither on $j$ nor on $m$, but only on $n$. Therefore $M^{-1}$ is bounded uniformly with respect to $m$. It is visible by inspection of~\eqref{eq:new3} that the right hand side satisfies $\| b\|=O(1/m^2)$. Therefore one obtains the bound
\begin{equation}\label{eq:new50}
\max_{k} |\beta_{j, k}|\leq C/m^2 \text{ uniformly with respect to }j.
\end{equation}
Therefore
\begin{equation}\label{eq:new4}
K=\sum_{j=1}^m \sum_{k=1}^{n-1} |\beta_{j, k}|\leq C(n-1)/m,
\end{equation}
which yields the contraction property for $m\geq m_*$ high enough.

%$\bullet$
With these coefficients, the difference $q$ is a continuous function with vanishing second derivatives in all $I_j$. Therefore one can write $q=-e_0\in V_h$ and the claim is proved.
\end{proof}

What distinguishes the family~\eqref{eq:new1} from the more general family considered in~\cite[Theorem~1]{desp:anc} is that the slope of the basis functions is always $>1$. Indeed the slope $\frac{nm}{2(n-1)} $ is by construction the same for all basis functions~\eqref{eq:new1}, so since $m\geq 2$ the property is evident. Considering~\eqref{eq:new3}, the greater the slope the smaller the right hand side of the linear system. This is a reason why it is possible to think that the family~\eqref{eq:new1} provides better control of the coefficients $\beta_{j,k}$, so ultimately is optimal to obtain a smaller contraction constant $K$, see~\eqref{eq:new4}.


\section{A sharper contraction constant}\label{sec:3}



The right hand side operator~\eqref{eq:1} written for the new basis functions is
\begin{equation}\label{eq:new10}
\left\{
\begin{aligned}
\mathcal H:  L^\infty(I) & \to L^\infty(I)\\
G & \mapsto \sum_{j=1}^m \sum_{k=1}^{n-1} \beta_{j,k} G\circ e_{j,k}.
\end{aligned}
\right.
\end{equation}
It is endowed with the contraction property~\eqref{eq:1_r}
\begin{equation}\label{eq:new12}
\| \mathcal H G\|_{L^\infty(I)} \leq K \|G\|_{L^\infty(I)}, \qquad K=\sum_{j=1}^m \sum_{k=1}^{n-1} |\beta_{j,k} |<1.
\end{equation}
Such property is central for the justification of the Deep Learning algorithm of this Note.

The purpose in this Section is to present a sharper bound. Let us define
\[
\widetilde K =2
\left(\max\nolimits_{j=1}^m \sum_{k=1}^{n-1} |\beta_{j,k}| \right).
\]

\begin{lemm}\label{lem:6}
One has the general bound: \ $
\inf_{e\in V_h} \| \mathcal H(G)-e\|_{L^\infty(I)} \leq \widetilde K \| G \|_{{L^\infty(I)} }$.
\end{lemm}

\begin{remark}\label{rem:sca}
The new contraction constant is sharper asymptotically for large $m$, because the sum with respect to $m$ in~\eqref{eq:new12} is removed. Using~\eqref{eq:new50}, one gets the bound $\widetilde K\leq C(n-1)/m^2$ which is better than~\eqref{eq:new4}.
\end{remark}

\begin{proof}
We denote by $\un_j$ the indicator function of the interval $I_j$, that is $\un_j(x)=1$ for $(j-1)/m<x<j/m$ and $\un_j(x)=0$ for $x<(j-1)/m$ or $j/m<x$.

Let $x_{j+\frac12}=(j+\frac12)/m$ be the central point in the interval $I_j$. One can write
\[
\mathcal H(G)=\sum_j \sum_k \beta_{j,k} G\circ e_{j,k} \times \un_j +
\sum_j C_j \un_j
\]
where
\begin{equation}\label{eq:new14}
C_j=\sum_{i<j}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr)+
\sum_{j<i}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr).
\end{equation}
Let $e\in V_h$. A triangular inequality yields that
\begin{equation}\label{eq:new20}
\| \mathcal H(G)-e\|_{L^\infty(I)}\leq \left\| \sum_j \sum_k \beta_{j,k} G\circ e_{j,k} \times \un_j \right\|_{L^\infty(I)} + \left\| \sum_j C_j \un_j -e \right\|_{L^\infty(I)}.
\end{equation}
The first term is bounded as
\begin{equation}\label{eq:new21}
\left\| \sum_j \sum_k \beta_{j,k} G\circ e_{j,k} \times \un_j \right\|_{L^\infty(I)}
\leq \left(\max\nolimits_{j=1}^m \sum_{k=1}^{n-1} |\beta_{j,k}| \right) \| G \|_{{L^\infty(I)} }.
\end{equation}
To bound the second term, we design a piecewise affine function $e\in V_h$ as follows
\[
e(x_0)=C_1, \quad e(x_j)=\frac{C_j+C_{j+1}}2 \text{ for }1\leq j \leq m-1, \quad e(x_m)=C_m.
\]
A proof by drawing shows that
\begin{equation}\label{eq:new22}
\left\| \sum C_j \un_j -e \right\|_{L^\infty(I)} \leq \max\nolimits_{j=1}^{m-1} \frac{|C_{j+1}-C_j|}2.
\end{equation}
The definition of the constants $C_j$ and $C_{j+1}$ yields that
\begin{align*}
C_{j+1}-C_j&=\sum_{i<j}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac32}\bigr)+\sum_k \beta_{j,k} G\circ e_{j,k} \bigl(x_{j+\frac32}\bigr) +\sum_{j+1<i}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac32}\bigr) \\
&\qquad -\sum_{i<j}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr) -\sum_k \beta_{j+1,k} G\circ e_{j+1,k} \bigl(x_{j+\frac12}\bigr) -\sum_{j+1<i}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr).
\end{align*}
By definition~\eqref{eq:new1}, the basis functions are constant on the left and on the right of the interval where they vary linearly. Therefore
\[
\sum_{i<j}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac32}\bigr) -
\sum_{i<j}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr)=0
\]
and
\[
\sum_{j+1<i}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac32}\bigr) -\sum_{j+1<i}\sum_k \beta_{i,k} G\circ e_{i,k} \bigl(x_{j+\frac12}\bigr) =0.
\]
\]
So one deduces that
\begin{equation}\label{eq:new40}
\begin{aligned}
|C_{j+1}-C_j| &= \left|\sum_k \beta_{j,k} G\circ e_{j,k} \bigl(x_{j+\frac32}\bigr) - \sum_k \beta_{j+1,k} G\circ e_{j+1,k} \bigl(x_{j+\frac12}\bigr)\right| \\
& \leq \sum_k | \beta_{j,k} | \| G\|_{L^\infty(I)} + \sum_k | \beta_{j+1,k}|\| G\|_{L^\infty(I)}
\end{aligned}
\end{equation}
which turns into
\begin{equation}\label{eq:new23}
|C_{j+1}-C_j| \leq 2 \left(\max\nolimits_{j=1}^m \sum_{k=1}^{n-1} |\beta_{j,k} |\right) \| G \|_{{L^\infty(I)} }.
\end{equation}
The claim is a consequence of \eqref{eq:new20}{\rm--}\eqref{eq:new23}.
\end{proof}


\section{A Deep Machine Learning algorithm} \label{sec:4}

This Section comes to the core of this Note by presenting an abstract Deep Machine Learning algorithm endowed with a proof of convergence with respect to the number of layers (so the name Deep Learning). This Deep Learning algorithm can be implemented in one of the Neural Networks/Machine Learning softwares freely available such as Tensorflow/Keras~\cite{chollet}, Scikit-Learn (Inria\footnote{\url{https://scikit-learn.org/stable/}}), Pytorch (Facebook\footnote{\url{https://pytorch.org}}), Julia (MIT licence\footnote{\url{https://julialang.org}}). This list is not exhaustive.

It is sufficient for this presentation to consider that Neural Networks/Machine Learning softwares have two features: a) they manipulate functions assembled with ReLU like activation functions and composition of functions, b) they perform numerical optimisation to fit the coefficients, in particular with stochastic gradient descent methods~\cite{des:boo, goodfellow}.


We assume that a certain number of basis functions $e_1, \dots, e_r$ are decided. The basis functions are assembled with the ReLU activation functions and alike. In the context of this work, a natural choice is to take the basis functions defined by~\eqref{eq:new1}. We will write $\beta=(\beta_1, \dots, \beta_r)\in \mathbb R^r$. For $(e_0,\beta,f)\in V_h\times \mathbb R^r\times L^\infty(I)$, we will write $g(e_0,\beta,f) \in L^\infty(I)$ the function defined by
\[
g(e_0,\beta,f)=e_0+\sum_{i=1}^r \beta_i f \circ e_i.
\]
Let us decide of a given polynomial $H\in P^n$. One constructs a cost function where the main variables to optimize are $e_0$ and $\beta$ and the parameters are the functions $f$ and $H$
\[
J(e_0,\beta:f,H)=\left\| g(e_0,\beta,f)- H\right\|_{L^\infty(I)}
\]
The algorithm below defines a sequence of functions $(f_k)_{k\in \mathbb N}$ as the solution of minimization problems.
\begin{itemize}
\item \emph{Initialization:} The seed is the null function
\begin{equation}\label{eq:new31}
f_0(x)=0 \text{ for all }x\in I.
\end{equation}
\item \emph{Iterations on $k$:} The next function is $ f_{k+1}=g(e_0^k,\beta^k,f_k) $ where $ (e_0^k,\beta^k)$ is any solution of the minimization problem
\begin{equation}\label{eq:new30}
(e_0^k,\beta^k)=\argmin_{(d_0,{\alpha})\in V_h\times \mathbb R^r} \| g(d_0,\alpha,f_k)- H\|_{L^\infty(I)}.
\end{equation}
\end{itemize}

After comments on the Neural Network implementation of \eqref{eq:new31}{\rm--}\eqref{eq:new30}, we will show the convergence $f_k\to H$ in the $L^\infty$ norm.

\begin{lemm}
The number of hidden layers in a Neural Network implementation of $f_k$ is $k-1$.
\end{lemm}

\begin{proof}
Since $f_0=0$ then $f_1=g(e_0^0,\beta^0,0)\in V_h$. Then $f_2=g(e_0^1,\beta^1,f_1)$ can be implemented in a Neural Network with one hidden layer. Then $f_3=g(e_0^2,\beta^2,f_2)$ can be implemented in a Neural Network with two hidden layers, and so on and so forth.
\end{proof}

The algorithm is correctly defined as shown by the next result.

\begin{lemm}
The minimization problem is convex (but not strictly convex), and has always at least one solution. For any $(f,H)\in L^\infty(I)\times L^\infty(I)$, there exists a minimizer $(e_0,\beta)\in V_h \times \mathbb R^r$ such that
\[
J(e_0,\beta:f,H)\leq J(d_0,\alpha:f,H) \qquad \text{for all } (d_0,\alpha) \in V_h \times \mathbb R^r.
\]
The minimizer $(e_0,\beta)$ is a priori non unique.
\end{lemm}

\begin{proof}
Such a proof is standard in convex analysis in finite dimension~\cite{hiriart}.

The dimension of the linear space $V_h$ is equal to $m+1$, so any function $h_0\in V_h$ admits the linear expansion $h_0=\sum_{p=1}^{m+1} \delta_p \varphi_p$ where the coefficients of the linear expansion are $\delta_p\in \mathbb R$ and $V_h=\Span(\varphi_p)_{1\leq p \leq m+1}$. Then one can write
\begin{equation}\label{eq:free}
g(h_0,\gamma,f)=\sum_{p=1}^{m+1} \delta_p \varphi_p + \sum_{i=1}^r \gamma_i f \circ e_i
\end{equation}
which shows that $g(h_0,\gamma,f)$ belongs to the linear subspace of $L^\infty(I)$ generated by the $(\varphi_p)_p$ and the $(f \circ e_i)_i$. Let us take a linear basis of this vector space; its dimension $q$ satisfies $m+1\leq q\leq m+1+r$, so the sum can be expressed as a function of $q$ coefficients only. It is a classical exercise (not reproduced here) to show that the cost function $J(h_0,\gamma:f,H)$ is coercive with respect to these $q$ coefficients: that is, the cost function tends to $+\infty$ as a norm of these $q$ coefficients tends to $+\infty$. Since the cost function is continuous and bounded from below, it has a minimum. Finally it is a standard matter that the $L^\infty$ norm does not bring strict convexity, so uniqueness is not guaranteed.
\end{proof}


As a consequence the sequence defined by the sequence of minimization problems~\eqref{eq:new30} is a priori non unique. Nevertheless it is convergent.

\begin{theo}\label{theor:main}
Assume \eqref{eq:1}{\rm--}\eqref{eq:1_r}. Then any sequence produced by \eqref{eq:new31}{\rm--}\eqref{eq:new30} converges with the bound
\[
\| f_k-H\|_{L^\infty(I)} \leq
\widehat K^k \| H\|_{L^\infty(I)}, \qquad \widehat K=\min(K,\widetilde K).
\]
\end{theo}

\begin{proof}
By~\eqref{eq:new30} one has the bound $
\| f_{k+1}-H\|_{L^\infty(I)} \leq \left\| d_0+\sum_{i=1}^r \alpha_i f_k\circ e_i- H
\right\|_{L^\infty(I)} $ for all $d_0\in V_h$ and all $\alpha=(\alpha_1, \dots, \alpha_r)\in \mathbb R^r$.

%$\bullet$
Let us take $(d_0,\alpha)=(e_0,\beta)$ in accordance with the representation \eqref{eq:1}{\rm--}\eqref{eq:1_r}. One deduces the inequality
\begin{align*}
\| f_{k+1}-H\|_{L^\infty(I)} & \leq \left\| e_0+\sum_{i=1}^r \beta_i f_k\circ e_i- e_0- \sum_{i=1}^r \beta_i H\circ e_i
\right\|_{L^\infty(I)} \\
& \leq \left\|\sum_{i=1}^r \beta_i (f_k-H)\circ e_i
\right\|_{L^\infty(I)} \\
& \leq \sum_{i=1}^r |\beta_i| \| f_{k}-H\|_{L^\infty(I)} \\
& \leq K \| f_{k}-H\|_{L^\infty(I)}.
\end{align*}

%$\bullet$
One can also take $(d_0,\alpha)=(e_0-e,\beta)$ where $e\in V_h$ is arbitrary. One obtains
\begin{align*}
\| f_{k+1}-H\|_{L^\infty(I)} & \leq \left\| e_0-e+\sum_{i=1}^r \beta_i f_k\circ e_i- e_0- \sum_{i=1}^r \beta_i H\circ e_i
\right\|_{L^\infty(I)} \\
& \leq \left\|
\mathcal H (f_k-H) -e
\right\|_{L^\infty(I)} \\
& \leq \widetilde K \| f_{k}-H\|_{L^\infty(I)}
\end{align*}
by virtue of~\eqref{eq:new12} and Lemma~\ref{lem:6}.

%$\bullet$
So $ \| f_{k+1}-H\|_{L^\infty(I)} \leq \widehat K \| f_{k}-H\|_{L^\infty(I)} $. Since $f_0=0$, the claim is obtained by iteration on $k$.
\end{proof}


\section{Non optimality of \texorpdfstring{$\widehat K$}{K}}\label{sec:5}

An elementary example that will be used for the numerical tests is the following. For this example one can check that $\widehat K>1$. However we will see at the end of the Section that a sharper constant exists.

\begin{lemm}\label{lem:tchi}
Take the third order Tchebycheff polynomial rescaled in $[0,1]$, that is $H(x)=\linebreak T_3(2x-1)=32x^3-48x^2+18x-1$. Take $m=3$ and $n=3$.

Then there exists $e_0\in V_h$ such that $ H=e_0+\sum_{j=1}^3 \sum_{k=1}^2 \beta_{j,k} H\circ e_{j,k} $ with
\[
\beta_{1,1} =\beta_{3,2}=\frac{2^5}{3^4}- \frac{2^6}{3^6}, \quad \beta_{1,2}= \beta_{3,1}= \frac{2^7}{3^6}-\frac{2^5}{3^4}, \quad \beta_{2,1}= \beta_{2,2}=\frac{2^5}{3^6}.
\]
\end{lemm}

\begin{proof}
The difference $q=H-\sum_{j=1}^3 \sum_{k=1}^2 \beta_{j,k} H\circ e_{j,k}$ is a continuous function. It is a polynomial of degree at most 3 in the three intervals $I_1=[0,1/3]$, $I_2=[1/3,2/3]$ and $I_3=[2/3,1]$.

%$\bullet$
In $I_1$ one has
\[
e_{1,1}(x)=\frac94 x, \quad e_{1,2}(x)=\frac94 x+\frac14, \quad e_{2,1}, \ e_{2,2}, \ e_{3,1} \mbox{ and } e_{3,2} \mbox{ are constant.}
\]
Then $H(e_{1,1}(x))=32\left(\frac94 x \right)^3- 48 \left(\frac94 x \right)^2+h_1x+h_2$ and $H(e_{1,2}(x))=32\left(\frac94 x \right)^3- 24 \left(\frac94 x \right)^2+h_3x+h_4$. Making vanish in $I_1$ the coefficients of the third order and second order monomials in $q$ yields the system
\[
\left\{
\begin{aligned}
\beta_{1,1}+\beta_{1,2}&=\left(\frac49\right)^3=\frac{2^6}{3^6}, \\
2\beta_{1,1}+\beta_{1,2}&=2\left(\frac49\right)^2=\frac{2^5}{3^4}.
\end{aligned}
\right.
\]
The solution is $\beta_{1,1} =\frac{2^5}{3^4}- \frac{2^6}{3^6}$ and $ \beta_{1,2}= \frac{2^7}{3^6}-\frac{2^5}{3^4}$. 

%$\bullet$
Concerning $I_3$ one notices that the third order Tchebycheff is antisymmetric with respect to the center $x=1/2$. It explains why $\beta_{1,1} =\beta_{3,2}$ and $\beta_{1,2}= \beta_{3,1}$. These values can also be obtained after lengthy but elementary calculations.

%$\bullet$
In $I_2$ one has
\[
e_{2,1}(x)=\frac94 x-\frac34, \quad e_{2,2}(x)=\frac94 x-\frac12, \quad e_{1,1}, \ e_{1,2}, \ e_{3,1} \mbox{ and } e_{3,2} \mbox{ are constant.}
\]
One checks that $H(e_{2,1}(x))=32\left(\frac94 x \right)^3- 120 \left(\frac94 x \right)^2+h_5x+h_6$ and $H(e_{2,2}(x))=32\left(\frac94 x \right)^3- 96 \left(\frac94 x \right)^2+h_7x+h_8$. The linear system which corresponds to the vanishing in $q$ of the monomials $x^3$ and $x^2$ can be written
\[
\left\{
\begin{aligned}
\beta_{2,1}+\beta_{2,2}&=\left(\frac49\right)^3=\frac{2^6}{3^6}, \\
120\beta_{2,1}+96\beta_{2,2}&=48\left(\frac49\right)^2.
\end{aligned}
\right.
\]
The solution is $\beta_{2,1}=\beta_{2,2}=\frac{2^5}{3^6}$.

%$\bullet$
Therefore $q$ is continuous and piecewise affine. One can write $q=-e_0\in V_h$.
\end{proof}

\begin{remark}\label{rem:tchi}
A numerical application shows that $
\beta_{1,1}=\beta_{3,2}=0.3072\dots$, $ \beta_{1,2}=\beta_{3,1}=-0.2194\dots$ and $ \beta_{2,1}=\beta_{2,2}=0.0438\dots$ With these values $ K>\widetilde K>1$. However the passage in the proof from estimate~\eqref{eq:new40} to estimate~\eqref{eq:new23} is non optimal because it loses the fact that~\eqref{eq:new40} concerns two consecutive indices. For $m=3$, two consecutive indices can be either $(j,j+1)=(1,2)$ or $(j,j+1)=(2,3)$. Then, for the third order rescaled Tchebycheff polynomial, one gets the sharper bound
\begin{equation}\label{eq:new60}
\bar K= |\beta_{1,1}|+|\beta_{1,2}| +\frac12\left(|\beta_{1,1}|+|\beta_{1,2}|+ |\beta_{2,1}|+|\beta_{2,2}| \right)=
\frac32 \left(|\beta_{1,1}|+|\beta_{1,2}| \right)+| \beta_{2,1}|=0.8340\dots
\end{equation}
Now $\bar K<1$. This bound will be invoked to justify the numerical convergence in test \#2.
\end{remark}


\section{Numerical tests and discussion}\label{sec:6}

Algorithm \eqref{eq:new31}{\rm--}\eqref{eq:new30} has been implemented in Tensorflow/Keras~\cite{chollet} for the purpose of numerical illustrations. The programs \verb+Branch_data.py+ (training) and \verb+Branch_gene.py+ (data generation) written for the tests are available at the Git repository \url{https://github.com/despresbr/NNNA}. The minimisation problems~\eqref{eq:new30} are performed with the ADAM algorithm which is a stochastic gradient descent method with batches. A dataset is created with oversampling and the cost function in the $L^\infty(I)$ norm is defined by creating a custom loss function. The function $g(e_0,\beta,f_k)$ is assembled by using the concatenation-of-layers technique~\cite[p.~243]{chollet}. The CPU cost of the learning stage is the same for all iterations $k$ since the number of free parameters (i.e. the number of neurons in this case) is the same at each stage of the algorithm because we ask for the same number of epochs and the same size of the batches. We use a trick which is classical for algorithms which have inner loops (new training) inside a global exterior loop (iteration on $k$). That is the starting point for a new training is the end point of the previous training. It helps to save computational efforts. We also follow the prescription~\cite{des:boo, turinici} where a decrease of the gradient length (i.e. the learning rate in Machine Learning language) is advocated.

The minimization in the $L^\infty$ norm is performed approximately because it is not the objective of stochastic gradient descent algorithms to calculate global minima with sharp accuracy. In particular we do not know precisely the influence of the batches on the accuracy of the minimization procedure. Despite this fact, we observe strong convergence in the tests below where the error $\epsilon_k = \| f_k -H\|_{L^\infty(I)}$ is reported as a function of the iteration index $k$. The tests below have been calculated on a GPU node at SCAI-Sorbonne university\footnote{\url{https://scai.sorbonne-universite.fr}}. They show convergence of algorithm \eqref{eq:new31}{\rm--}\eqref{eq:new30} in accordance with the main Theorem of convergence~\ref{theor:main}.


\subsection{Test \#1}

The polynomial is $H(x)=x-x^2$. We take $m=2$ and $n=2$, that is $r=2$ basis functions~\eqref{eq:moi}. The training dataset is made with 4000 pairs $(x_s,y_s=H(x_s))_{1\leq s \leq 4000}$ where $x_s$ is sampled uniformly between 0 and 1.
Around 20\% of the pairs are set aside for validation. The batches are made with 128 pairs.
The error at iteration $k$ is $\epsilon_0=0.0377$, $\epsilon_1=0.00806$, $\epsilon_2=0.00238$, $\epsilon_3=0.000521$, $\epsilon_4=0.000165$, $\epsilon_5=3.66\times 10^{-5}$ then $\epsilon_6=9.97\times 10^{-6}$. The numerical rate of convergence, that is the numerical contraction constant, is $K_{\mathrm{num}}\approx \frac14 $ which is better than the theoretical one.

\begin{figure}[h]
\begin{tabular}{cc}
\includegraphics[height=6.5cm,angle=-90]{trin1_r.pdf}&
\includegraphics[height=6.5cm,angle=-90]{trin2_r.pdf} \\
\includegraphics[height=6.5cm,angle=-90]{trin3_r.pdf}&
\includegraphics[height=6.5cm,angle=-90]{trin4_r.pdf}
\end{tabular}
\caption{Display of the function calculated by the algorithm (broken line) versus the objective $H$ (smooth line). The four plots correspond to $0\leq k \leq 3$. One observes numerical convergence.}\label{fig:1}
\end{figure}


\subsection{Test \#2}

The polynomial is the rescaled Tchebycheff polynomial of Lemma~\ref{lem:tchi}: $H(x)=32x^3-\linebreak 48x^2+18x-1$. Theoretical convergence is guaranteed by means of the estimate of Remark~\ref{rem:tchi}. We take the same parameters as in the previous test except that $m=3$ and $n=3$, that is a total of $r=6$ basis functions~\eqref{eq:moi}. The error at iteration $k$ is $\epsilon_0=0.459$, $\epsilon_1=0.179$, $\epsilon_2=0.0834$, $\epsilon_3=0.0409$, $\epsilon_4=0.0109$, then $\epsilon_5=0.00471$. The numerical convergence is observed. It is compatible with the fact that $\bar K<1$ (see~\eqref{eq:new60}). However $\bar K$ is quite close to 1, so it cannot explain the observed fast convergence with a factor $\approx 1/2$.

\subsection{Test \#3}

We perform the same test as in the previous one, except that now $m=5$ and $n=3$. We report only the accuracy of the first four iterations of the algorithm, because the computational cost related to the manipulation of functions increases for $k=5$. The accuracy is $\epsilon_0=0.201$, $\epsilon_1=0.027$, $\epsilon_2=0.0057$ and $\epsilon_3=0.00103$. One observes a more pronounced rate of convergence, in accordance with bound~\eqref{eq:new4} and Lemma~\ref{lem:6}.

\subsection{Test \#4}

Finally we consider the fifth order rescaled Tchebycheff polynomial: $H(x)=16(2x-1)^5- \linebreak 20(2x-1)^3+5(2x-1)$, and we take $m=9$ and $n=5$ (that is $r=36$ basis functions). The errors are $\epsilon_0=0.379$, $\epsilon_1=0.0625$, $\epsilon_2=0.0318$ and $\epsilon_3=0.00947$. A comparison of the exact solution and the numerical solution at step $k=3$ is proposed in Figure~\ref{fig:2}.

\begin{figure}[h]
\includegraphics[height=8cm,angle=-90]{crin5_r.pdf}
\caption{Fifth order rescaled Tchebycheff polynomial: numerical solution (cross) versus exact function (solid line).}\label{fig:2}
\end{figure}

\subsection{Discussion}

We discuss the complexity of the method and present three open problems.

\subsubsection{Complexity}

The complexity of the algorithm can be defined as the relation between the accuracy and the cost in the asymptotic range $k\gg 1$. We present hereafter simple bounds.

%$\bullet$
As a consequence of Theorem~\ref{theor:main}, the accuracy in $L^\infty$ norm naturally scales like
\[
\varepsilon =O(K^k)
\]
where we remind that $k$ is the number of layers and $K<1$ is the contraction constant.

%$\bullet$
The cost can be estimated in two ways. Either the cost is estimated as the number of neurons to train, equal to the number of free parameters in~\eqref{eq:free} times the number of layers. Since $r=m(n-1)$, one obtains $ C=O(mnk)$. It yields the complexity scaling
\begin{equation}\label{eq:free2}
C=O\left(\lvert\log \varepsilon\rvert \right)
\end{equation}
which is similar to the one of the Yarotsky Theorem~\cite{yaya}.

%$\bullet$
Or the cost is estimated as the number of calculations to perform to calculate one function $f_k$. Due to the hierarchical structure of the whole method, it scales like $ C=O((mn)^k)$. It yields the complexity scaling
\begin{equation}\label{eq:free3}
C=O\left(\frac1{\varepsilon^{\frac{\log mn}{ \lvert\log K\rvert}} }\right)
\end{equation}
which is of course much worse than~\eqref{eq:free2}. For a practical calculation, the relation cost/accuracy is a compromise between these two bounds, depending on the parts which are the most costly. However the numerical experiments clearly show that the scaling~\eqref{eq:free3} is the issue.


%$\bullet$
A related problem is that the scaling $(mn)^k$ may induce an important CPU cost, independently of the level of accuracy. This is related to the technology for hierarchical declaration of functions used in Machine Learning softwares. It is possible that this computational cost is lessened by using a different implementation, but it is not clear and so far this is a huge concern. We have observed that a way to control it and to obtain reasonable accuracy at reasonable cost is to take $m$ large and $k\leq 3$ (as in test \#4).

\subsubsection{Three open problems}

An open mathematical problem is to understand how to recover an optimal contraction/approximation constant in maximal norm, using perhaps techniques from Constructive Approximation theory~\cite{vovo:lo}.

An open numerical problem left for further research is the optimization of the basis functions.

A more general problem is to determine if there could be a way to extrapolate the approach used in this Note for the approximation of more general non polynomial functions in higher dimensions. Any progress in this direction would be of critical importance for the numerical analysis of Machine Learning techniques applied to real life problems. So far, it is a completely open axis of research.

\bibliographystyle{crplain}
\bibliography{CRMATH_Despres_20220845}
\end{document}