Edinburgh Speech Tools
2.4-release
All
Classes
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Pages
EST_sigpr_frame.h
1
/*************************************************************************/
2
/* */
3
/* Centre for Speech Technology Research */
4
/* University of Edinburgh, UK */
5
/* Copyright (c) 1995,1996 */
6
/* All Rights Reserved. */
7
/* */
8
/* Permission is hereby granted, free of charge, to use and distribute */
9
/* this software and its documentation without restriction, including */
10
/* without limitation the rights to use, copy, modify, merge, publish, */
11
/* distribute, sublicense, and/or sell copies of this work, and to */
12
/* permit persons to whom this work is furnished to do so, subject to */
13
/* the following conditions: */
14
/* 1. The code must retain the above copyright notice, this list of */
15
/* conditions and the following disclaimer. */
16
/* 2. Any modifications must be clearly marked as such. */
17
/* 3. Original authors' names are not deleted. */
18
/* 4. The authors' names are not used to endorse or promote products */
19
/* derived from this software without specific prior written */
20
/* permission. */
21
/* */
22
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30
/* THIS SOFTWARE. */
31
/* */
32
/*************************************************************************/
33
34
#ifndef __EST_SIGPR_FRAME_H__
35
#define __EST_SIGPR_FRAME_H__
36
37
#include "EST_FMatrix.h"
38
39
40
41
/**@name Linear Prediction functions
42
Including, generation of coefficients from the signal, reflection
43
coefficients, line spectral frequencies, areas.
44
*/
45
//@{
46
47
/** Produce the full set of linear prediction coefficients from a
48
frame of speech waveform.
49
50
@param sig: the frame of input waveform
51
@param acf: the autocorrelation coefficients
52
@param ref: the reflection coefficients
53
@param lpc: the LPC coefficients
54
55
The order of the lpc analysis is given as the size of the <parameter>
56
lpc </parameter> vector - 1. The coefficients are placed in the
57
locations 1 - size, and the energy is placed in location 0.
58
*/
59
60
void
sig2lpc(
const
EST_FVector
&sig,
EST_FVector
&acf,
61
EST_FVector
&ref,
EST_FVector
&lpc);
62
63
64
/** Calculate cepstral coefficients from lpc coefficients.
65
66
It is possible to calculate a set of cepstral coefficients from
67
lpc coefficients using the relationship:
68
69
\[c_{k}= a_{k} + \frac{1}{k}\sum_{i=1}^{k-1} i c_{i} a_{k-i}\]
70
71
The order of the cepstral analysis can be different from the lpc
72
order. If the cepstral order is greater, interpolation is used (FINISH
73
add equation). Both orders are taken from the lengths of the
74
respective vectors. Note that these cepstral coefficients take on the
75
assumptions (and errors) of the lpc model and hence will not be the
76
same as cepstral coefficients calculated using DFT functions.
77
78
@param lpc: the LPC coefficients (input)
79
@param cep: the cepstral coefficients (output)
80
*/
81
82
void
lpc2cep(
const
EST_FVector
&lpc,
EST_FVector
&cep);
83
84
85
86
/** Produce a set of linear prediction coefficients from a
87
frame of speech waveform. {\tt sig} is the frame of input waveform,
88
and {\tt lpc} are the LPC coefficients. The
89
{\bf order} of the lpc analysis is given as the size of the {\tt lpc}
90
vector -1. The coefficients are placed in the locations 1 - size, and
91
the energy is placed in location 0.
92
*/
93
void
sig2lpc(
const
EST_FVector
&sig,
EST_FVector
&lpc);
94
95
/** Produce a set of reflection coefficients from a
96
frame of speech waveform. {\tt sig} is the frame of input waveform,
97
and {\tt ref} are the reflection coefficients. The
98
{\bf order} of the lpc analysis is given as the size of the {\tt ref}
99
vector -1. The coefficients are placed in the locations 1 - size, and
100
the energy is placed in location 0.
101
*/
102
void
sig2ref(
const
EST_FVector
&sig,
EST_FVector
&ref);
103
104
105
/**@name Area Functions
106
Using the analogy of the lossless tube, the
107
cross-sectional areas of the sections of this tube are related to the reflection coefficients and can be calculated from the following relationship:
108
109
\[\frac{A_{i+1}}{A_{i}} = \frac{1 - k_{i}}{1 + k_{i}} \]
110
111
*/
112
//@{
113
/** The area according to the formula. */
114
void
ref2truearea(
const
EST_FVector
&ref,
EST_FVector
&area);
115
116
/** An approximation of the area is calculated by skipping the denominator
117
in the formula. */
118
void
ref2area(
const
EST_FVector
&ref,
EST_FVector
&area);
119
120
/** The logs of the areas. */
121
void
ref2logarea(
const
EST_FVector
&ref,
EST_FVector
&logarea);
122
//@}
123
124
/** Calculate the reflection coefficients from the lpc
125
coefficients. Note that in the standard linear prediction analysis,
126
the reflection coefficients are generated as a by-product. @see
127
sig2lpc */
128
129
void
lpc2ref(
const
EST_FVector
&lpc,
EST_FVector
&ref);
130
131
/** Calculate the linear prediction coefficients from the reflection
132
coefficients.
133
Use the equation:
134
\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
135
136
@see lpc2ref*/
137
138
void
ref2lpc(
const
EST_FVector
&ref,
EST_FVector
&lpc);
139
140
/** Calculate line spectral frequencies from linear prediction coefficients.
141
Use the equation:
142
\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
143
144
@see lsf2lpc
145
*/
146
147
void
lpc2lsf(
const
EST_FVector
&lpc,
EST_FVector
&lsf);
148
149
/** Calculate linear prediction coefficients from line spectral frequencies.
150
Use the equation:
151
\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
152
153
@see lpc2lsf
154
*/
155
156
void
lsf2lpc(
const
EST_FVector
&lsf,
EST_FVector
&lpc);
157
//@}
158
159
/** Convert a frame of coefficients from one representation to another.

NOTE(review): this description is inferred from the function and
parameter names only; the accepted type strings and the behaviour for
unsupported conversions are not visible in this header -- confirm
against the implementation.

@param in_frame: the input frame (const reference, not modified)
@param in_type: presumably names the representation held in in_frame -- TODO confirm
@param out_frame: non-const reference, presumably receives the converted frame
@param out_type: presumably names the representation wanted in out_frame -- TODO confirm
*/
void
frame_convert(
const
EST_FVector
&in_frame,
const
EST_String
&in_type,
160
EST_FVector
&out_frame,
const
EST_String
&out_type);
161
162
163
164
// end of lpc functions
165
166
/**@name Energy and power frame functions
167
*/
168
169
//@{
170
171
/** Calculate the power for a frame of speech. This is defined as
172
\[power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2\]
173
*/
174
175
176
void
sig2pow(
EST_FVector
&frame,
float
&power);
177
178
/** Calculate the root mean square energy for a frame of speech. This
179
is defined as \[energy=\sqrt{\frac{1}{n}\sum_{i=1}^{n}a_{i}^2}\] */
180
181
void
sig2rms(
EST_FVector
&frame,
float
&rms_energy);
182
183
//@}
184
// end of power and energy
185
186
/**@name Frame based filter bank and cepstral analysis
187
188
These functions are \Ref{Frame based signal processing functions}.
189
*/
190
191
//@{
192
193
/** Calculate the (log) energy (or power) in each channel of a Mel
194
scale filter bank for a frame of speech. The filters are triangular, are
195
evenly spaced and are all of equal width, on a Mel scale. The upper and lower
196
cutoffs of each filter are at the centre frequencies of the adjacent filters.
197
The Mel scale is described under {\tt Hz2Mel}.
198
199
@see Hz2Mel
200
@see sig2fft
201
@see fft2fbank
202
*/
203
204
void
sig2fbank(
const
EST_FVector
&sig,
205
EST_FVector
&fbank_frame,
206
const
float
sample_rate,
207
const
bool
use_power_rather_than_energy,
208
const
bool
take_log);
209
210
/** Calculate the energy (or power) spectrum of a frame of speech. The FFT
211
order is determined by the number of samples in the frame of speech, and is
212
a power of 2. Note that the FFT vector returned corresponds to frequencies
213
from 0 to half the sample rate. Energy is the magnitude of the FFT; power is
214
the squared magnitude.
215
216
@see fft2fbank
217
@see sig2fbank
218
*/
219
220
void
sig2fft(
const
EST_FVector
&sig,
221
EST_FVector
&fft_vec,
222
const
bool
use_power_rather_than_energy);
223
224
/** Given a Mel filter bank description, bin the FFT coefficients
225
to compute the output of the filters. The first and last elements of
226
{\tt mel_fbank_frequencies} define the lower and upper bound of
227
the first and last filters respectively and the intervening elements
228
give the filter centre frequencies. That is, {\tt mel_fbank_frequencies} has
229
two more elements than {\tt fbank_vec}.
230
231
@see fastFFT
232
@see sig2fft
233
@see sig2fbank
234
@see fbank2melcep
235
*/
236
237
void
fft2fbank(
const
EST_FVector
&fft_frame,
238
EST_FVector
&fbank_vec,
239
const
float
Hz_per_fft_coeff,
240
const
EST_FVector
&mel_fbank_frequencies);
241
242
/** Compute the discrete cosine transform of log Mel-scale filter bank output
243
to get the Mel cepstral coefficients for a frame of speech.
244
Optional liftering (filtering in the cepstral domain) can be applied to
245
normalise the magnitudes of the coefficients. This is useful because,
246
typically, the higher order cepstral coefficients are significantly
247
smaller than the lower ones and it is often desirable to normalise
248
the means and variances across coefficients.
249
250
The lifter (cepstral filter) used is:
251
\[c_i' = \{ 1 + \frac{L}{2} \sin \frac{\pi i}{L} \} \; c_i\]
252
253
A typical value of L used in speech recognition is 22. A value of L=0 is taken
254
to mean no liftering. This is equivalent to L=1.
255
256
@see sig2fft
257
@see fft2fbank
258
@see sig2fbank
259
*/
260
261
void
fbank2melcep(
const
EST_FVector
&fbank_vec,
262
EST_FVector
&mfcc,
263
const
float
liftering_parameter,
264
const
bool
include_c0 =
false
);
265
266
/** Make a triangular Mel scale filter. The filter is centred at
267
{\tt this_mel_centre} and
268
extends from {\tt this_mel_low} to {\tt this_mel_high}. {\tt half_fft_order}
269
is the length of a power/energy spectrum covering 0Hz to half the sampling
270
frequency with a resolution of {\tt Hz_per_fft_coeff}.
271
272
The routine returns a vector of weights to be applied to the energy/power
273
spectrum starting at element {\tt fft_index_start}.
274
The number of points (FFT coefficients) covered
275
by the filter is given by the length of the returned vector {\tt filter}.
276
277
@see fft2fbank
278
@see Hz2Mel
279
@see Mel2Hz
280
*/
281
282
void
make_mel_triangular_filter(
const
float
this_mel_centre,
283
const
float
this_mel_low,
284
const
float
this_mel_high,
285
const
float
Hz_per_fft_coeff,
286
const
int
half_fft_order,
287
int
&fft_index_start,
288
EST_FVector
&filter);
289
290
/**@name Frequency conversion functions
291
292
These are functions used in \Ref{Filter bank and cepstral analysis}.
293
*/
294
295
//@{
296
297
/** Convert Hertz to Mel. The Mel scale is defined by
298
\[f_{\mbox{Mel}} = 1127 \; \ln( 1 + \frac{f_{\mbox{Hertz}}}{700} )\]
299
300
@see Mel2Hz
301
@see Frequency conversion functions
302
*/
303
304
float
Hz2Mel(
float
frequency_in_Hertz);
305
306
/**
307
Convert Mel to Hertz.
308
309
@see Hz2Mel
310
*/
311
312
float
Mel2Hz(
float
frequency_in_Mel);
313
314
//@}
315
// end of frequency conversion functions
316
317
//@}
318
// end of filter bank and cepstral analysis
319
320
321
322
323
#endif
/* __EST_SIGPR_FRAME_H__ */
include
sigpr
EST_sigpr_frame.h
Generated on Wed Dec 24 2014 09:16:35 for Edinburgh Speech Tools by
1.8.3.1