\documentstyle[12pt,twoside,leqno,epsf]{article}
\newcounter{zahl}
\sloppy
%\input{mssymb}
%
\oddsidemargin=0.0truecm\relax%
\evensidemargin=0.0truecm\relax%
\marginparwidth=2.5truecm\relax%
\marginparsep=0.2truecm\relax%
\topmargin=-1.5truecm\relax%
%\headheight=1truecm\relax%
%\headsep=0.4truecm\relax%
\textheight=25truecm\relax% HeKoNN Angaben 16 x 25
\textwidth=16truecm\relax%
\topskip=\baselineskip%
\footheight=1truecm\relax%
\footskip=0.4truecm%%
%\oddsidemargin 53pt
%\evensidemargin 53pt
%\marginparwidth 90pt
%\marginparsep 11pt
\marginparpush 5pt
%\topmargin 27pt
%\headheight 12pt
%\headsep 45pt
%\footheight 12pt
%\footskip 25pt
%\textheight 505pt
%\textwidth 365pt
\columnsep 10pt
\columnseprule 0pt
\raggedbottom
\footnotesep 12pt \skip\footins 10pt plus 2pt minus 4pt
\floatsep 12pt plus 2pt minus 2pt
\textfloatsep 20pt plus 2pt minus 4pt
\intextsep 12pt plus 2pt minus 2pt
\dblfloatsep 12pt plus 2pt minus 2pt
\dbltextfloatsep 20pt plus 2pt minus 4pt
\oddsidemargin0cm
\evensidemargin0cm
%\topmargin=10mm
%\headsep=5truemm
\baselineskip=12pt %Zeilenabstand
\setlength{\parskip}{0pt} %Absatzabstand
\setlength{\parindent}{10pt} %Einzug Absatz erste Zeile
% Vertical space around display equations (normal and "short" variants).
\abovedisplayskip=3mm plus 6pt minus 4pt
\belowdisplayskip=3mm plus 6pt minus 4pt
\abovedisplayshortskip=0mm plus 6pt minus 2pt
\belowdisplayshortskip=2mm plus 4pt minus 4pt % fixed: was a duplicate \abovedisplayshortskip
%\mathindent=0.8cm %Einzug Formeln
\frenchspacing
%
%
%\input{epic}
%\input{eepic}
%\input{sil}
\newcommand{\scryl}{\scriptstyle}
%
% --- Shorthand macros: equation delimiters, Greek variants, arrows ---
\newcommand{\beq}{\begin{equation}}% begin numbered equation
\newcommand{\eeq}{\end{equation}}% end numbered equation
\newcommand{\ep}{\varepsilon}% epsilon variant
\newcommand{\va}{\varphi}% phi variant
%
\newcommand{\lgl}{\langle}% left angle bracket
\newcommand{\rgl}{\rangle}% right angle bracket
%
\newcommand{\ra}{\rightarrow}
\newcommand{\rras}{\rightrightarrows}% needs an AMS symbol font -- TODO confirm (mssymb input is commented out above)
\newcommand{\la}{\leftarrow}
\newcommand{\lra}{\longrightarrow}
\newcommand{\lla}{\longleftarrow}
\newcommand{\hra}{\hookrightarrow}
\newcommand{\Ra}{\Rightarrow}
\newcommand{\Lra}{\Longrightarrow}
\newcommand{\Llra}{\Longleftrightarrow}
\newcommand{\twohra}{\twoheadrightarrow}% needs an AMS symbol font -- TODO confirm
\newcommand{\smetmi}{\smallsetminus}% needs an AMS symbol font -- TODO confirm
%
\newcommand{\notI}{I\!\! /}
%
% \binom{a}{b}: binomial coefficient built by hand from an array
% (pre-amsmath style; [-1mm] tightens the two rows).
\newcommand{\binom}[2]
{\left( \begin{array}{c} #1 \\[-1mm]
#2 \end{array} \right) }
% \full{rel}{above}{below}: relation #1 decorated above and below.
\newcommand{\full}[3]
{\mathrel{\mathop{#1}\limits^{#2}_{#3}}}
% \uset{rel}{below}: relation #1 with #2 set underneath.
\newcommand{\uset}[2]
{\mathrel{\mathop{#1}\limits_{#2}}}
% \oset{rel}{above}: relation #1 with #2 set on top.
\newcommand{\oset}[2]
{\mathrel{\mathop{#1}\limits^{#2}}}
%
%
%\newcommand{\Int}{\int\limits}
%\newcommand{\Lim}{\lim\limits}
%\newcommand{\Prod}{\prod\limits}
%\newcommand{\Sum}{\sum\limits}
% Big operators that keep their limits above/below even in text style.
\newcommand{\bcap}{\bigcap\limits}
\newcommand{\bcup}{\bigcup\limits}
\newcommand{\Bigvee}{\bigvee\limits}
\newcommand{\boplus}{\bigoplus\limits}
%
% Abbreviations for decorations over/under symbols.
\newcommand{\ul}{\underline}
\newcommand{\ol}{\overline}
\newcommand{\wt}{\widetilde}
\newcommand{\wh}{\widehat}
%
% Upright (roman) operator names with surrounding thin spaces.
\newcommand{\id}{{\;\rm id\,}}
\newcommand{\Lie}{{\;\rm Lie\,}}
\newcommand{\map}{{\;\rm map\,}}
\newcommand{\mod}{{\;\rm mod\,}}
%
% End-of-proof box, pushed to the right margin.
\newcommand{\hBox}{{$\hfill\Box$}}
%
\newcommand{\C}{{\Bbb C}}
\newcommand{\IL}{{\Bbb L}}
%\newcommand{\N}{{\Bbb N}} %{I\!\!N}
\newcommand{\N}{{I\!\!N}}
\newcommand{\R}{{I\!\!R}}
\newcommand{\Z}{{I\!\!Z}}
\newcommand{\Q}{{\Bbb Q}}
%\newcommand{\R}{{\Bbb R}}
\newcommand{\T}{{\Bbb T}}
%\newcommand{\Z}{{\Bbb Z}}
%
\newcommand{\cA}{{\cal A}}
\newcommand{\cB}{{\cal B}}
\newcommand{\cC}{{\cal C}}
\newcommand{\cD}{{\cal D}}
\newcommand{\cE}{{\cal E}}
\newcommand{\cF}{{\cal F}}
\newcommand{\cG}{{\cal G}}
\newcommand{\cH}{{\cal H}}
\newcommand{\cI}{{\cal I}}
\newcommand{\cJ}{{\cal J}}
\newcommand{\cK}{{\cal K}}
\newcommand{\cL}{{\cal L}}
\newcommand{\cM}{{\cal M}}
\newcommand{\cN}{{\cal N}}
\newcommand{\cO}{{\cal O}}
\newcommand{\cP}{{\cal P}}
\newcommand{\cQ}{{\cal Q}}
\newcommand{\cR}{{\cal R}}
\newcommand{\cS}{{\cal S}}
\newcommand{\cT}{{\cal T}}
\newcommand{\cU}{{\cal U}}
\newcommand{\cV}{{\cal V}}
\newcommand{\cW}{{\cal W}}
\newcommand{\cX}{{\cal X}}
\newcommand{\cY}{{\cal Y}}
\newcommand{\cZ}{{\cal Z}}
%
\newcommand{\cs}{{\cal s}}
\newcommand{\cx}{{\cal x}}
%
\newcommand{\cm}{\mbox{\boldmath $m$}}
\newcommand{\cg}{\mbox{\boldmath $g$}}
\newcommand{\cp}{\mbox{\boldmath $p$}}
\newcommand{\rang}{\mbox{Rang}}
\renewcommand{\theequation}{\thesubsection.\arabic{equation}}
\def\contentsname{Inhaltsangabe}
\pagestyle{empty}
%\pagestyle{myheadings}
%\pagestyle{headings}
%\renewcommand{\thepage}{\hfill--\arabic{page}--\hfill}
%\renewcommand{\pagenumbering}{\hfill--\arabic{page}--\hfill}
%{\bf (\arabic{section}.\arabic{lemma})}
\newfont{\cfb}{eufm10}%fett gedruckt
\newfont{\cfr}{eufm10}%normal "
\newenvironment{lquote}{\begin{list}{}{\setlength{\leftmargin}{0,5cm}}\item[]}{\end{list}}
\newenvironment{script}[1]{\begin{list}{}{\settowidth{\labelwidth}{[#1]}
\setlength{\leftmargin}{\labelwidth}
\addtolength{\leftmargin}{\labelsep}
\parsep0.5ex plus0.2ex minus0.2ex \itemsep0.3ex \labelsep0.5ex
\renewcommand{\makelabel}[1]{[##1]\hfill}}}{\end{list}}
%Bsp.: [en] Der Magen mu{\ss} leer sein
% [u] Das Auge soll sehen , usw.
\newtheorem{defi}{Definition}[subsection]
%\makeidx
%\makeindex
\newcommand{\bplus}{\Box}
\newcommand{\btimes}{\Box}
%\newcommand{\cp}{{\cal P}}
%\newcommand{\ch}{{\cal H}}
%\newcommand{\cR}{{\cal R}}
%\newcommand{\cc}{{\cal c}} kein dtsch c
%\newcommand{\cH}{{\cal H}}
%\newcommand{\cG}{{\cal G}}
%\newcommand{\cT}{{\cal T}}
%\newcommand{\cD}{{\cal D}}
%\newcommand{\cI}{{\cal I}}
%\newcommand{\cB}{{\cal B}}
%\newcommand{\cU}{{\cal U}}
%\newcommand{\cA}{{\cal A}}
%\newcommand{\cS}{{\cal S}}
\newcommand{\kdoty}%
{\raisebox{-0,5ex}{$\not$\mbox{$\stackrel{\scriptstyle\cdot}{\scriptstyle k}$}}}
\newcommand{\pcup}{\raisebox{-0,5ex}{$\stackrel{.}{\cup}$}}
\newcommand{\us}{\raisebox{-0,5ex}{$\stackrel{=}{\scriptstyle s.o.}$}}
\newcommand{\hcT}{{\hat{\cal T}}}
\newcommand{\tcH}{{\tilde{\cal H}}}
\newcommand{\tc}{{\tilde{c}}}
\newcommand{\nsubsetneqq}{\raisebox{-0,5ex}{$\not$\mbox{$\stackrel{\scriptstyle\subset}
{\scriptstyle =}$}}}
\newcommand{\bbox}{\hspace{1cm}\mbox{$\Box$}}
\newcommand{\Ell}{{\overline{\ell}}}
\newcommand{\Int}{\int\limits}
\newcommand{\Sum}{\sum\limits}
\newcommand{\Lim}{\lim\limits}
\newcommand{\Prod}{\prod\limits}
%\newcommand{\nin}{\not\mid\in}
\newcommand{\Bigcup}{\bigcup\limits}
%
\newcommand{\ula}{\ul{\ul{a}}}
\newcommand{\ulb}{\ul{\ul{b}}}
\newcommand{\uln}{\ul{\ul{n}}}
\newcommand{\ulp}{\ul{\ul{p}}}
\newcommand{\ulx}{\ul{\ul{x}}}
\newcommand{\dc}{\mbox{\footnotesize{$\prime \! \cC^{ \!\!\! \sim}$}}}
%
\newcommand{\LL}{{L\!\! L}}
\newcommand{\bQ}{{Q\!\!\!\!I}}
\newcommand{\bC}{{C\!\!\!\!I}}
\newcommand{\bN}{I\!\!N}
\newcommand{\bZ}{Z\!\!\!Z}
\newcommand{\bR}{I\!\!R}
\newcommand{\bL}{I\!\!L}
\newcommand{\bK}{I\!\!K}
\newcommand{\fV}{{\bf V}}
\newcommand{\fU}{{\bf U}}
\newcommand{\fW}{{\bf W}}
\newcommand{\fF}{{\bf F}}
\newcommand{\fG}{{\bf G}}
\newcommand{\fC}{{\bf C}}
\newcommand{\fJ}{{\bf J}}
\newcommand{\fL}{{\bf L}}
\newcommand{\fA}{{\bf A}}
\newcommand{\fI}{{\bf I}}
\newcommand{\fM}{{\bf M}}
\newcommand{\fcB}{{\bf{\cal B}}}
\newcommand{\fcD}{{\bf{\cal D}}}
\newcommand{\fa}{{\bf a}}
\newcommand{\fb}{{\bf b}}
\newcommand{\fc}{{\bf c}}
\newcommand{\fd}{{\bf d}}
\newcommand{\fe}{{\bf e}}
\newcommand{\ff}{{\bf f}}
\newcommand{\fg}{{\bf g}}
\newcommand{\fh}{{\bf h}}
% VERBOTEN (\fi wird von LaTeX gebraucht!) \newcommand{\fi}{{\bf i}}
\newcommand{\fj}{{\bf j}}
\newcommand{\fk}{{\bf k}}
\newcommand{\fl}{{\bf l}}
\newcommand{\fm}{{\bf m}}
\newcommand{\fn}{{\bf n}}
\newcommand{\fo}{{\bf o}}
\newcommand{\fp}{{\bf p}}
\newcommand{\fq}{{\bf q}}
\newcommand{\fr}{{\bf r}}
\newcommand{\fs}{{\bf s}}
\newcommand{\ft}{{\bf t}}
\newcommand{\fu}{{\bf u}}
\newcommand{\fv}{{\bf v}}
\newcommand{\fw}{{\bf w}}
\newcommand{\fx}{{\bf x}}
\newcommand{\fy}{{\bf y}}
\newcommand{\fz}{{\bf z}}
\newcommand{\fZ}{{\bf Z}}
\newcommand{\lr}{{\overline r}}
\newcommand{\os}{{\overline s}}
\newcommand{\on}{{\overline n}}
\newcommand{\oR}{{\overline R}}
\newcommand{\oS}{{\overline S}}
\newcommand{\oX}{{\overline X}}
\newcommand{\oT}{{\overline T}}
\newcommand{\ox}{{\overline x}}
\newcommand{\oa}{{\overline a}}
% \DDOTS: hand-built rising diagonal dots (like \ddots but ascending);
% trailing % inside suppresses spurious spaces at line ends.
\newcommand{\DDOTS}{
\mbox{
\raisebox{.2ex}{.}%
\hspace*{1ex}%
\raisebox{.8ex}{.}%
\hspace*{1ex}%
\raisebox{1.4ex}{.}%
}
}
% \MB{sym}: "poor man's bold" for math symbols -- overprints the symbol
% three times with tiny (0.02em) horizontal offsets; \mathchoice selects
% the right size for each of the four math styles, the \else branch
% handles use in text mode.
\newcommand{\MB}[1]{
\relax{
\def\embold##1{%
\hbox to 0.02em{$##1$\hss}%
\hbox to 0.02em{$##1$\hss}%
\hbox{$##1$}%
}
\ifmmode
\mathchoice
{\embold{\displaystyle #1}}
{\embold{\textstyle #1}}
{\embold{\scriptstyle #1}}
{\embold{\scriptscriptstyle #1}}
\else
{\embold{$\textstyle #1$}}
\fi
}}
\newcommand{\fO}{\MB{0}}% \fO == bold zero (via \MB{math-bold})
% Trick for an optional argument to \fparbox:
% \fparbox[width]{text} gives a framed \parbox of the given width;
% without the optional argument the width defaults to 0.95\textwidth.
\makeatletter
\def\fparbox{\@ifnextchar [{\@fparxbox}{\@fparfbox}}% dispatch on leading [
\def\@fparfbox#1{\fbox{\parbox{0.95\textwidth}{#1}}}% default-width variant
\def\@fparxbox[#1]#2{\fbox{\parbox{#1}{#2}}}% explicit-width variant
\makeatother
\newcommand{\HalbOben}[1]{%
\smash{\raisebox{1.2ex}{$#1$}}% Raufgeschoben
}%
%-------------- something around text ----------------------------
\newcommand{\below}[2]{\mathrel{\mathop{#1}\limits_{#2}}}
%\newcommand{\above}[2]{\mathrel{\mathop{#1}\limits^{#2}}}
%\newcommand{\full}[3]{\mathrel{\mathop{#1}\limits_{#2}^{#3}}}
\newcommand{\bbelow}[3]{\mathrel{\mathop{#1}\limits_{{#2}\atop{#3}}}}
%----------------------------------------------------------------------
\parindent=0pt
\begin{document}
\title{Automatic speech recognition with neural networks}
\author{Ra\'ul Rojas\thanks{Extended abstract for the short course on
speech recognition at HeKoNN-96}}
\date{Martin-Luther-Universit\"at Halle\\
Fachbereich Mathematik und Informatik}
\maketitle
\thispagestyle{empty}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{The speech recognition problem}
Building computers capable of automatically recognizing speech has been an
old dream in the fields of electronics and computer science. Some
experiments were conducted as early as the 1950s and in the 1960s some
systems were already capable of recognizing vowels uttered by different speakers. But so far these expectations have not been fully met. We all know
of several small-scale commercial applications of speech technology
for consumer electronics or for office automation. Most of these systems
work with a limited vocabulary or are speaker dependent in some way. But current
research has as its goal the development of {\em large vocabulary speaker independent continuous
speech recognition}. This long chain of adjectives already underlines
the difficulties which still hamper the large-scale commercial application
of automatic speech recognition: We would like the user to speak without
artificial pauses, we would like that the system could understand anybody,
and this without necessarily knowing the context of a conversation or
monologue.
Artificial neural networks have been proposed as one of the building
blocks for speech recognizers. Their function is to provide a statistical
model capable of associating a vector of speech features with the probability
that the vector could represent any of a given number of phonemes. Neural
networks play the role of statistical machines as we discuss in the second section. But we
will see that our knowledge of the speech recognition process is still very
limited so that fully connectionist models are normally not used. Researchers
have become rather pragmatic and combine the best features of neural modeling
with traditional algorithms or with other statistical approaches, like
Hidden Markov Models, which we will briefly review. Current state of the art systems combine different approaches and therefore they
are called {\em hybrid speech recognition systems}.
\section{Feature extraction}
The first problem for any automatic speech recognizer is finding an
appropriate representation of the speech signal. Assume that the speech
is sampled at constant intervals and denote the amplitude of the speech signal
by $x(0),x(1),\ldots,x(n-1)$. For a good recognition the time between consecutive
measurements should be kept small. The microphone signal is thus a more or
less good representation of speech but contains a lot of redundancy. It
would be better to reduce the number of data points but in such a way as
to preserve most of the information: this is the task of all {\em feature
extraction methods}. Choosing an appropriate method implies considering
the speech production process and what kind of information is `encoded'
in the acoustic signal.
Speech is produced in the vocal tract, which can be thought of as a tube of
varying diameter extending from the vocal cords to the lips. The vocal
cords produce a periodic pressure wave which travels along the vocal tract until the energy it contains is released through the mouth and nose. The vocal
tract behaves as a kind of {\em resonator} in which some frequencies are
amplified whereas others are eliminated from the final speech signal.
Different configurations of the vocal organs produce different resonating
frequencies, so that it is safe to assume that detecting the mixture of
frequencies present in the speech signal can provide us with information
about the particular configuration of the vocal tract, and from this
configuration we can try to deduce what phoneme has been produced.
Many methods have been proposed to deal with the task of the spectral
analysis of the speech signal. Some of them have a psychophysical foundation,
that is, they are based on physiological research on human hearing. Others
have arisen in other fields of engineering but have proved to be adequate
for this task. Certainly one of the simplest, but also more powerful
approaches, is computing a short term Fourier spectrum of the speech signal.
\subsection{Fourier analysis}
Given a data set $\fx=(x(0),x(1),\ldots,x(n-1))$ it is the task of Fourier analysis
to reveal its periodic structure. We can think of the data set as a function
$X$ evaluated at the points $0,1,\ldots,n-1$. The function $X$ can be written
as a linear combination of the basis functions
\begin{eqnarray*}
f_0(t) & = & \frac{1}{\sqrt{n}}\cos(2\pi t\frac{0}{n}) - i\sin(2\pi t\frac{0}{n}) = \frac{1}{\sqrt{n}}(\omega^*_n)^{0\cdot t}\\
f_1(t) & = & \frac{1}{\sqrt{n}}\cos(2\pi t\frac{1}{n}) - i\sin(2\pi t\frac{1}{n}) = \frac{1}{\sqrt{n}}(\omega^*_n)^{1\cdot t}\\
\vdots & & \vdots \\
f_{n-1}(t) & = & \frac{1}{\sqrt{n}}\cos(2\pi t\frac{n-1}{n}) - i\sin(2\pi t\frac{n-1}{n}) = \frac{1}{\sqrt{n}}(\omega^*_n)^{(n-1)\cdot t}
\end{eqnarray*}
where $\omega^*_n=\exp(-2\pi i/n)$ denotes the complex conjugate of the $n$-th complex root of unity $\omega_n=\exp(2\pi i/n)$. Writing the data set as a linear combination of these functions amounts
to finding which of the given frequencies is present in the data. Denote
by $\fF_n^*$ the $n \times n$ matrix whose columns are the basis functions
evaluated at $t=0,1,\ldots,n-1$, that is, the element at row $i$ and column $j$
of $\fF_n^*$ is $(\omega^*_n)^{ij}$, for $i,j=0,\ldots,n-1$. We are
looking for a vector $\fa$ of amplitudes such that
$$
\fF_n^* \fa = \fx \/.
$$
The $n$-dimensional vector $\fa$ is the {\em spectrum} of the speech signal.
The matrix $\fF_n$ defined as
$$
\fF_n = \frac{1}{\sqrt{n}} \left(
\begin{array}{lllc}
\omega_n^0 & \omega_n^0 & \cdots & \omega_n^0 \\
\omega_n^0 & \omega_n^1 & \cdots & \omega_n^{n-1} \\
\omega_n^0 & \omega_n^2 & \cdots & \omega_n^{2n-2} \\
\vdots & & \ddots & \vdots \\
\omega_n^0 & \omega_n^{n-1} & \cdots & \omega_n^{(n-1)(n-1)}
\end{array}
\right)
$$
is the transpose conjugate of the matrix $\fF_n^*$. Since the basis functions
$f_0,f_1,\ldots,f_{n-1}$ are mutually orthogonal, this means that $\fF_n^*$ is unitary
and in this case
$$
\fF_n\fF_n^*\fa = \fF_n \fx \hspace{1cm} \Ra \hspace{1cm}\fa=\fF_n\fx \/.
$$
The expression $\fF_n\fx$ is the discrete Fourier transform of the vector $\fx$.
The inverse Fourier transform is given of course by
$$
\fF_n^*\fF_n\fx = \fF_n^*\fa \hspace{1cm}\Ra \hspace{1cm}\fx = \fF_n^*\fa \/.
$$
The speech signal is analyzed as follows: a window of length $n$ is used to
select the data. Such a window can cover for example 10 milliseconds of speech.
The Fourier transform is computed and the magnitudes of the spectral
amplitudes (the absolute values of the elements of the vector $\fa$) are
stored. The window is displaced to cover the next set of $n$ data points
and the new Fourier transform is computed. In this way we get a short-term
spectrum of the speech signal as a function of time, as shown in Fig.~\ref{formants3d}. Our speech recognition algorithms should recover
from this kind of information the correct sequence of phonemes.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=6.09cm\epsfbox{formants3d.eps}}
\caption{\label{formants3d} Temporal variation of the spectrum of the speech signal}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Fast transformations}
Since we are interested in analyzing the speech signal in real time it
is important to reduce the number of numerical operations needed. A Fourier
transform computed as a matrix-vector multiplication requires around $O(n^2)$
multiplications. A better alternative is the Fast Fourier transform, which
is just a rearrangement of the matrix-vector multiplication. The left graphic in Fig.~\ref{fft}
shows the real part of the elements of the Fourier matrix $\fF_n$ (the shading
is proportional to the numerical value). The recursive structure of the matrix
is not immediately evident, but if the even columns are permuted to the left
side of the matrix and the odd columns to the right, the new matrix structure
is the one shown on the right graphic in Fig.~\ref{fft}. Now the recursive
structure is visible. The matrix $\fF_n$ consists of four submatrices of dimension
$n/2 \times n/2$, which are related to the matrix $\fF_{n/2}$ through a
simple formula. In order for the reduction process to work, $n$ must be
a power of two.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=5cm\epsfbox{vreal.eps}\hspace{1cm}\epsfysize=5cm\epsfbox{vperm.eps}}
\caption{\label{fft} The Fourier matrix and the permuted Fourier matrix}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This rearrangement of the Fourier matrix is the basis of the Fast Fourier
Transform which we will discuss briefly in the short course.
Many speech recognition systems use some kind of variation of the Fourier
coefficients. The problem with the short-term spectra is that the base
frequency of the speaker should be separated from the medium term
information about the shape of the vocal tract. We will discuss two popular
alternatives: cepstral coefficients and linear predictive coding.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Markov Models and Neural Networks}
In speech recognition
researchers postulate that the vocal tract shapes can be quantized in a discrete set
of states roughly associated with the phonemes which compose speech. But
when speech is recorded the exact transitions in the vocal tract cannot be observed and only the produced sound can be measured at some predefined time intervals. These are the emissions and the states of the system are the
quantized configurations of the vocal tract. From the measurements we want to infer
the sequence of states of the vocal tract, i.e. the sequence of utterances
which gave rise to the recorded sounds. In order to make this problem
manageable, the set of states and the set of possible sound parameters
are quantized.
A Hidden Markov Model has the structure shown in Fig.~\ref{hmmexample}. The
state transitions remain invisible to the observer. The only data provided
are the emissions (i.e. the spectrum of the signal) at some
points in time.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=5.04cm\epsfbox{hmm.eps}}
\caption{\label{hmmexample} A hidden Markov model}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The general problem when confronted with the recorded sequence of
output values of a HMM is to compute the most probable sequence of
state transitions which could have produced them. This is done with a recursive algorithm.
\subsection{Hidden Markov Models}
A first-order Markov model is any system capable of assuming one of $n$ different
states at time $t$. The system does not change its state at each time step
deterministically but according to a stochastic dynamic.
The probability of transition
from the $i$-th to the $j$-th state at each step is given by $0 \le a_{ij} \le 1$ and does not depend on the previous history of transitions.
These probabilities can be arranged in an $n \times n$ matrix $A$.
We also assume that at each step the model emits one of $m$ possible
output values. We call the probability of emitting the $k$-th output value
while in the $i$-th state $b_{ik}$.
Starting from a definite state at time $t=0$, the system is allowed to run for $T$ time
units and the generated outputs are recorded. Each new run of the system
produces in general a different sequence of output values. The system
is called a HMM because only the emitted values can
be observed but not the state transitions.
The state diagram of a HMM can be represented by a network made of $n$ units (one for each state) and with connections
from each unit to each other. The weights of the connections are the
transition probabilities (Fig.~\ref{hmmstates}).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=4cm\epsfbox{hmmstates.eps}}
\caption{\label{hmmstates} }
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As in the case of backpropagation through time, we can unfold the
network in order to observe it at each time step. At $t=0$ only one of the $n$ units, say the $i$-th, produces
the output 1, all others zero. State $i$ is
the actual state of the system. The probability that at time $t=1$ the system
reaches state $j$ is given by $a_{ij}$ (only some of these values are shown in the diagram
to avoid cluttering). The probability of reaching state $k$ at $t=2$ is
$$
\sum_{j=1}^n a_{ij}a_{jk}
$$
which is just the net input at the $k$-th node in the stage $t=2$ of the network shown
in Fig.~\ref{hmm}.
Consider now what happens when we can only observe the output of the system but not the
state transitions (refer to Fig.~\ref{likelihood}). If the system starts at $t=0$ in a state given by a discrete probability distribution $\rho_1,\rho_2,\ldots,\rho_n$, then the probability of observing the $k$-th output
at $t=0$ is given by
$$
\sum_{i=1}^n \rho_i b_{ik}.
$$
The probability of observing the $k$-th output at $t=0$ {\em and} the
$m$-th output at $t=1$ is
$$
\sum_{j=1}^n \sum_{i=1}^n \rho_i b_{ik}a_{ij}b_{jm}.
$$
The rest of the stages of the network compute the corresponding probabilities
in a similar manner.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=5cm\epsfbox{c7hmm.eps}}
\caption{\label{hmm} Unfolded Hidden Markov Model}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
How can we find the unknown transition and emission probabilities for such
an HMM? If we are given a sequence of $T$ observed outputs with indices $k_1,k_2,\ldots,k_T$
we would like to maximize the likelihood of this sequence, i.e. the
product of the probabilities that each of them occurs. This can be done by
transforming the unfolded network as shown in Fig.~\ref{likelihood} for $T=3$.
Notice that at each stage $h$ we introduced an additional edge from the
node $i$ with the weight $b_{i,k_h}$. In this way the final node which
collects the sum of the whole computation effectively computes the
likelihood of the observed sequence. Since this unfolded network contains
only differentiable functions at its nodes (in fact only addition and the
identity function) it can be trained using the backpropagation algorithm.
However care must be taken to avoid updating the probabilities in such a
way that they could become negative or greater than 1. Also the transition
probabilities starting from the same node must always add to 1. These conditions can
be enforced by an additional transformation of the network (introducing for example a ``softmax'' function) or by
using the method of Lagrange multipliers. We give only a hint of how
this last technique can be implemented so that the reader completes the network by himself.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=6cm\epsfbox{likelihood.eps}}
\caption{\label{likelihood} Computation of the likelihood of a sequence of observations}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Assume that a function $F$ of $n$ parameters $x_1,x_2,\ldots,x_n$ must be
minimized, subject to the constraint $C(x_1,x_2,\ldots,x_n)=0$. We introduce
a Lagrange multiplier $\lambda$ and define the new function
$$
L(x_1,\ldots,x_n,\lambda)=F(x_1,\ldots,x_n)+\lambda C(x_1,\ldots,x_n).
$$
To minimize $L$ we compute its gradient and set it to zero. To do this numerically,
we follow the negative gradient direction to find the minimum.
Note that since
$$
{\partial L \over \partial \lambda} = C(x_1,\ldots,x_n)
$$
the iteration process does not finish as long as $C(x_1,\ldots,x_n) \neq 0$,
because in that case the partial derivative of $L$ with respect to $\lambda$
is nonzero. If the iteration process converges, we can be sure that the
constraint $C$ is satisfied. Care must be taken when the minimum of $F$
is reached at a saddle point of $L$. In this case some modifications of the
basic gradient descent algorithm are needed.
Fig.~\ref{lagrange} shows a diagram of the network (a {\em Lagrange neural network}) adapted to include a
constraint. Since all functions in the network are differentiable, the
partial derivatives needed can be computed with the backpropagation algorithm.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=3.5cm\epsfbox{lagrange.eps}}
\caption{\label{lagrange} Lagrange neural network}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Classifier networks}
It is now well known that neural networks trained to classify an
$n$-dimensional input $x$ in one out of $M$ classes can actually
learn to compute
the Bayesian a posteriori probabilities that the input $x$ belongs
to each class. Several proofs of this fact, differing only in the
details, have been published \cite{bourlard93}, \cite{richard91}, but they
can be simplified. In this section we offer a shorter proof of the probability property
of classifier neural networks.
\bigskip
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=5.5cm\epsfbox{bayes.eps}}
\caption{\label{bayes} The output $y(v)$ in a differential volume}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Part (a) of Fig.~\ref{bayes} shows the main idea of the proof. Points in an
input space are classified as belonging to a class $A$ or its complement.
This is the first simplification: we do not have to deal with
more than one class. In classifier networks, there is one output line for each class
$C_i$, $i=1,\ldots,M$. Each output $C_i$ is trained to produce a 1 when
the input belongs to class $i$, and otherwise a 0. Since the expected total error is
the sum of the expected individual errors of each output, we can minimize the expected individual
errors independently. This means that we need to consider only one output line
and when it should produce a 1 or a 0.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=4cm\epsfbox{gauss-classes.eps}}
\caption{\label{gauss-classes} Probability distribution of several classes on input space}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Assume that input space is divided into a lattice of differential volumes of size $dv$, each one centered at the $n$-dimensional point
$v$. If at the output representing class $A$ the network computes
the value $y(v) \in [0,1]$ for any point
$x$ in the differential volume $V(v)$ centered at $v$,
and denoting by $p(v)$ the probability $p(A|x \in V(v))$, then the total expected quadratic error is
$$ E_A = \sum_V \{p(v) (1-y(v))^2 + (1-p(v)) y(v)^2 \} dv,$$
where the sum runs over all differential volumes in the lattice.
Assume that the values $y(v)$ can be computed independently for each differential
volume. This means that we can independently minimize each of the terms of
the sum. This is done by differentiating each term with respect to the output $y(v)$ and
equating the result to zero
$$-2p(v)(1-y(v)) + 2(1-p(v))y(v)=0.$$
From this expression we deduce $p(v)=y(v)$, that is the output $y(v)$
which minimizes the error in the differential region centered at $v$ is
the a posteriori probability $p(v)$. In this case the expected error is
$$p(v)(1-p(v))^2+(1-p(v))p(v)^2 = p(v)(1-p(v))$$
and $E_A$ becomes the expected variance of the output line for class $A$.
Note that extending the above analysis to other kinds of error
functions is straightforward. For example, if the error at the output is measured by $-\log(y(v))$
when the desired output is 1 and $-\log(1-y(v))$ when it is 0, then the
terms in the sum of expected differential errors have the form
$$-p(v)\log(y(v)) - (1-p(v))\log(1-y(v)).$$
Differentiating and equating to zero we again find $y(v)=p(v)$.
This short proof also strongly underlines the two conditions needed for neural
networks to produce a posteriori probabilities, namely {\em perfect training}
and {\em enough plasticity} of the network, so as to be able to approximate the patch of
probabilities given by the lattice of differential volumes and the values $y(v)$ which we optimize
independently of each other.
It is still possible to offer a simpler visual proof ``without words''
of the Bayesian property of classifier networks, as is done in part (b) of Fig.~\ref{bayes}. When training to
produce 1 for the class $A$ and 0 for $A^c$, we subject the
function produced by the network to an ``upward force'' proportional to the derivative of the error function, i.e. $(1-y(v))$, and the probability
$p(v)$, and a downward force proportional to $y(v)$ and the probability $(1-p(v))$. Both forces
are in equilibrium when $p(v)=y(v)$.
This result can be visualized with the help of Fig.~\ref{gauss-classes}. Several non-disjoint
clusters represent different classes defined on an input space. The correspondence of each input vector to a class is given only probabilistically. Such an input space
could consist for example of $n$-dimensional vectors, in which each component
is the numerical value assigned to each of $n$ possible symptoms. The classes defined
over this input space are the different illnesses. A vector of symptoms corresponds
to an illness with some probability. This is illustrated in Fig.~\ref{gauss-classes} with
the help of Gaussian shaped probability distributions. The clusters overlap, because
sometimes the same symptoms can correspond to different ailments. Such an overlap
could only be suppressed by acquiring more information.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=7cm\epsfbox{phoneme-net.eps}}
\caption{\label{phoneme-net} Network for the classification of feature vectors}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This is a nice example of the kind of applications that such classification networks
can have, namely in medical diagnosis. Existing databases can be used as a
training set for a network and to compute the margin of error associated with
the classifications. The network can compute a first diagnosis, which is then given to
a physician who decides whether to take this information into account or to override
the system.
\subsection{Statistical networks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=6.5cm\epsfbox{pfad-prob.eps}}
\caption{\label{pfad-prob} Computation of the most probable path}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Neural networks are used as classifier networks to compute the probability
that any of a given set of phonemes could correspond to a given spectrum
and the context of the spectrum. The speech signal is divided in windows
of for example 10ms length. For each window the short term spectrum is
computed and quantized, using for example 18 coefficients. We can train a network to associate spectra with the probability of each phoneme of
occurring in a speech segment. A network like the one shown in Fig.~\ref{phoneme-net} is used. The coefficients of the six previous and
also of the six following windows are used together with the coefficients
of the window we are evaluating. The dimension of the input vector is
thus 234. If we consider 61 possible phonemes we end with the network
of Fig.~\ref{phoneme-net}.
The network is trained with labeled speech data. There are several data bases
which can be used for this purpose, but we will also discuss semiautomatic
methods for speech labeling. Once the network has been trained it can be
used to compute the emission probabilities of phonemes.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[ht]
\centerline{\epsfysize=8.36cm\epsfbox{phon-and.eps}}
\caption{\label{phon-and} Markov model for the word ``and''}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Once a set of emission probabilities has been computed for several time
frames $1,2,\ldots,m$ it is necessary to compute the most probable path
of transitions of the vocal tract and emissions. This can be done using dynamic
programming methods of the same type as those generically known as {\em
time warping}. Fig.~\ref{pfad-prob} shows a matrix of phoneme probabilities
for which we want to compute such a most probable path. The result could be the
set of shaded nodes.
Finally, we will discuss how to integrate HMMs and neural networks in
speech recognition tasks. The role of the neural network is to provide
probabilities of emission. The role of the Markov model is to reduce the
number of alternatives that we want to consider. Fig.~\ref{phon-and} shows
an example of a Markov model for the word ``and''. The model contains
so many possibilities because this is a very common word and we want to
consider several possible variations. Other words which are not so
common can be modelled in a more simple manner.
I will provide a more extensive list of references at HeKoNN-96.
\begin{small}
\begin{thebibliography}{99}
\bibitem{bourlard93} Bourlard, H., Morgan, N. (1993), {\em Connectionist Speech Recognition},
Kluwer.
\bibitem{rabiner} Rabiner, L., Bing-Hwang, J. (1993), {\em Fundamentals of Speech Recognition},
Prentice-Hall International, London.
\bibitem{richard91} Richard, M. D., Lippmann, R. P. (1991), ``Neural Network Classifiers
Estimate {\em a posteriori} Probabilities'', {\em Neural Computation}, Vol. 3, N. 4, S. 461-483.
\bibitem{theorie} Rojas, R. (1996), {\em Neural Networks}, Springer-Verlag, New York.
\end{thebibliography}
\end{small}
\end{document}