Edinburgh Speech Tools  2.4-release
EST_sigpr_frame.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33
34 #ifndef __EST_SIGPR_FRAME_H__
35 #define __EST_SIGPR_FRAME_H__
36
37 #include "EST_FMatrix.h"
38
39
40
41 /**@name Linear Prediction functions
42 Including generation of coefficients from the signal, reflection
43 coefficients, line spectral frequencies and areas.
44 */
45 //@{
46
47 /** Produce the full set of linear prediction coefficients from a
48  frame of speech waveform.
49
50 @param sig: the frame of input waveform (input)
51 @param acf: the autocorrelation coefficients (output)
52 @param ref: the reflection coefficients (output)
53 @param lpc: the LPC coefficients (output)
54
55 The order of the lpc analysis is given as the size of the
56 <parameter>lpc</parameter> vector - 1. The coefficients are placed in
57 locations 1 onwards, and the energy is placed in location 0.
58 */
59
60 void sig2lpc(const EST_FVector &sig, EST_FVector &acf,
61  EST_FVector &ref, EST_FVector &lpc);
62
63
64 /** Calculate cepstral coefficients from lpc coefficients.
65
66 It is possible to calculate a set of cepstral coefficients from
67 lpc coefficients using the relationship:
68
69 $c_{k}= a_{k} + \frac{1}{k}\sum_{i=1}^{k-1} i c_{i} a_{k-i}$
70
71 The order of the cepstral analysis can be different from the lpc
72 order. If the cepstral order is greater, interpolation is used (FINISH
73 add equation). Both orders are taken from the lengths of the
74 respective vectors. Note that these cepstral coefficients take on the
75 assumptions (and errors) of the lpc model and hence will not be the
76 same as cepstral coefficients calculated using DFT functions.
77
78 @param lpc: the LPC coefficients (input)
79 @param cep: the cepstral coefficients (output)
80 */
81
82 void lpc2cep(const EST_FVector &lpc, EST_FVector &cep);
83
84
85
86 /** Produce a set of linear prediction coefficients from a
87  frame of speech waveform. {\tt sig} is the frame of input waveform,
88  and {\tt lpc} are the LPC coefficients. The
89  {\bf order} of the lpc analysis is given as the size of the {\tt lpc}
90  vector -1. The coefficients are placed in locations 1 onwards, and
91  the energy is placed in location 0.
92 */
93 void sig2lpc(const EST_FVector &sig, EST_FVector &lpc);
94
95 /** Produce a set of reflection coefficients from a
96  frame of speech waveform. {\tt sig} is the frame of input waveform,
97  and {\tt ref} are the reflection coefficients. The
98  {\bf order} of the lpc analysis is given as the size of the {\tt ref}
99  vector -1. The coefficients are placed in locations 1 onwards, and
100  the energy is placed in location 0.
101 */
102 void sig2ref(const EST_FVector &sig, EST_FVector &ref);
103
104
105 /**@name Area Functions
106 Using the analogy of the lossless tube, the
107 cross-sectional areas of the sections of this tube are related to the reflection coefficients and can be calculated from the following relationship:
108
109 $\frac{A_{i+1}}{A_{i}} = \frac{1 - k_{i}}{1 + k_{i}}$
110
111 */
112 //@{
113 /** Calculate the areas from the reflection coefficients according to
the area-ratio formula given for this group of functions.

@param ref: the reflection coefficients (input)
@param area: the areas (output)
*/
114 void ref2truearea(const EST_FVector &ref, EST_FVector &area);
115
116 /** An approximation of the area is calculated by skipping the denominator
117 in the formula.

@param ref: the reflection coefficients (input)
@param area: the approximated areas (output)
*/
118 void ref2area(const EST_FVector &ref, EST_FVector &area);
119
120 /** Calculate the logs of the areas from the reflection coefficients.
NOTE(review): the base of the logarithm is not stated in this header --
confirm against the implementation.

@param ref: the reflection coefficients (input)
@param logarea: the log areas (output)
*/
121 void ref2logarea(const EST_FVector &ref, EST_FVector &logarea);
122 //@}
123
124 /** Calculate the reflection coefficients from the lpc
125 coefficients. Note that in the standard linear prediction analysis,
126 the reflection coefficients are generated as a by-product.

@param lpc: the LPC coefficients (input)
@param ref: the reflection coefficients (output)

@see
127 sig2lpc */
128
129 void lpc2ref(const EST_FVector &lpc, EST_FVector &ref);
130
131 /** Calculate the linear prediction coefficients from the reflection
132 coefficients.
133 NOTE(review): the conversion equation is still to be documented here; an
134 earlier version of this comment quoted the sig2pow power formula by mistake.

@param ref: the reflection coefficients (input)
@param lpc: the LPC coefficients (output)
135
136 @see lpc2ref*/
137
138 void ref2lpc(const EST_FVector &ref, EST_FVector &lpc);
139
140 /** Calculate line spectral frequencies from linear prediction coefficients.
141 NOTE(review): the conversion equation is still to be documented here; an
142 earlier version of this comment quoted the sig2pow power formula by mistake.

@param lpc: the LPC coefficients (input)
@param lsf: the line spectral frequencies (output)
143
144 @see lsf2lpc
145 */
146
147 void lpc2lsf(const EST_FVector &lpc, EST_FVector &lsf);
148
149 /** Calculate linear prediction coefficients from line spectral frequencies.
150 NOTE(review): the conversion equation is still to be documented here; an
151 earlier version of this comment quoted the sig2pow power formula by mistake.

@param lsf: the line spectral frequencies (input)
@param lpc: the LPC coefficients (output)
152
153 @see lpc2lsf
154 */
155
156 void lsf2lpc(const EST_FVector &lsf, EST_FVector &lpc);
157 //@}
158
/** Convert a frame of coefficients from one representation to another.

@param in_frame: the frame to convert (input)
@param in_type: name of the representation held in in_frame
@param out_frame: the converted frame (output)
@param out_type: name of the representation wanted in out_frame

NOTE(review): the accepted type names, and the behaviour for an
unsupported pair of types, are not visible in this header -- confirm
against the implementation.
*/
159 void frame_convert(const EST_FVector &in_frame, const EST_String &in_type,
160  EST_FVector &out_frame, const EST_String &out_type);
161
162
163
164 // end of lpc functions
165
166 /**@name Energy and power frame functions
167 */
168
169 //@{
170
171 /** Calculate the power for a frame of speech. This is defined as
172 $power=\frac{1}{n}\sum_{i=1}^{n}a_{i}^2$
(presumably $a_{i}$ are the samples of the frame and $n$ is their number).

@param frame: the frame of input waveform. NOTE(review): taken by
non-const reference although it is described only as an input --
confirm whether it is modified.
@param power: the calculated power (output)
173 */
174
175
176 void sig2pow(EST_FVector &frame, float &power);
177
178 /** Calculate the root mean square energy for a frame of speech. This
179 is defined as $energy=\sqrt{\frac{1}{n}\sum_{i=1}^{n}a_{i}^2}$

@param frame: the frame of input waveform. NOTE(review): taken by
non-const reference although it is described only as an input --
confirm whether it is modified.
@param rms_energy: the calculated RMS energy (output)
*/
180
181 void sig2rms(EST_FVector &frame, float &rms_energy);
182
183 //@}
184 // end of power and energy
185
186 /**@name Frame based filter bank and cepstral analysis
187
188 These functions are \Ref{Frame based signal processing functions}.
189 */
190
191 //@{
192
193 /** Calculate the (log) energy (or power) in each channel of a Mel
194 scale filter bank for a frame of speech. The filters are triangular, are
195 evenly spaced and are all of equal width, on a Mel scale. The upper and lower
196 cutoffs of each filter are at the centre frequencies of the adjacent filters.
197 The Mel scale is described under {\tt Hz2Mel}.
198

@param sig: the frame of input waveform (input)
@param fbank_frame: the filter bank channel values (output); presumably
its length sets the number of channels -- confirm
@param sample_rate: the sampling rate of {\tt sig} (presumably in Hz -- confirm)
@param use_power_rather_than_energy: selects power rather than energy
@param take_log: selects whether the log of each channel value is taken

199 @see Hz2Mel
200 @see sig2fft
201 @see fft2fbank
202 */
203
204 void sig2fbank(const EST_FVector &sig,
205  EST_FVector &fbank_frame,
206  const float sample_rate,
207  const bool use_power_rather_than_energy,
208  const bool take_log);
209
210 /** Calculate the energy (or power) spectrum of a frame of speech. The FFT
211 order is determined by the number of samples in the frame of speech, and is
212 a power of 2. Note that the FFT vector returned corresponds to frequencies
213 from 0 to half the sample rate. Energy is the magnitude of the FFT; power is
214 the squared magnitude.
215

@param sig: the frame of input waveform (input)
@param fft_vec: the energy or power spectrum (output)
@param use_power_rather_than_energy: selects power (squared magnitude)
rather than energy (magnitude)

216 @see fft2fbank
217 @see sig2fbank
218 */
219
220 void sig2fft(const EST_FVector &sig,
221  EST_FVector &fft_vec,
222  const bool use_power_rather_than_energy);
223
224 /** Given a Mel filter bank description, bin the FFT coefficients
225 to compute the output of the filters. The first and last elements of
226 {\tt mel_fbank_frequencies} define the lower and upper bound of
227 the first and last filters respectively and the intervening elements
228 give the filter centre frequencies. That is, {\tt mel_fbank_frequencies} has
229 two more elements than {\tt fbank_vec}.
230

@param fft_frame: the FFT coefficients to be binned (input)
@param fbank_vec: the filter outputs (output)
@param Hz_per_fft_coeff: frequency spacing of the FFT coefficients
@param mel_fbank_frequencies: the filter bank description explained above
(input)

231 @see fastFFT
232 @see sig2fft
233 @see sig2fbank
234 @see fbank2melcep
235 */
236
237 void fft2fbank(const EST_FVector &fft_frame,
238  EST_FVector &fbank_vec,
239  const float Hz_per_fft_coeff,
240  const EST_FVector &mel_fbank_frequencies);
241
242 /** Compute the discrete cosine transform of log Mel-scale filter bank output
243 to get the Mel cepstral coefficients for a frame of speech.
244 Optional liftering (filtering in the cepstral domain) can be applied to
245 normalise the magnitudes of the coefficients. This is useful because,
246 typically, the higher order cepstral coefficients are significantly
247 smaller than the lower ones and it is often desirable to normalise
248 the means and variances across coefficients.
249
250 The lifter (cepstral filter) used is:
251 $c_i' = \{ 1 + \frac{L}{2} \sin \frac{\pi i}{L} \} \; c_i$
252
253 A typical value of L used in speech recognition is 22. A value of L=0 is taken
254 to mean no liftering. This is equivalent to L=1.
255

@param fbank_vec: the log Mel-scale filter bank output (input)
@param mfcc: the Mel cepstral coefficients (output)
@param liftering_parameter: the value L in the lifter formula
@param include_c0: defaults to false; presumably selects whether the
zeroth cepstral coefficient is included in {\tt mfcc} -- confirm

256 @see sig2fft
257 @see fft2fbank
258 @see sig2fbank
259 */
260
261 void fbank2melcep(const EST_FVector &fbank_vec,
262  EST_FVector &mfcc,
263  const float liftering_parameter,
264  const bool include_c0 = false);
265
266 /** Make a triangular Mel scale filter. The filter is centred at
267 {\tt this_mel_centre} and
268 extends from {\tt this_mel_low} to {\tt this_mel_high}. {\tt half_fft_order}
269 is the length of a power/energy spectrum covering 0Hz to half the sampling
270 frequency with a resolution of {\tt Hz_per_fft_coeff}.
271
272 The routine returns a vector of weights to be applied to the energy/power
273 spectrum starting at element {\tt fft_index_start}.
274 The number of points (FFT coefficients) covered
275 by the filter is given by the length of the returned vector {\tt filter}.
276

@param this_mel_centre: centre of the filter
@param this_mel_low: lower edge of the filter
@param this_mel_high: upper edge of the filter
@param Hz_per_fft_coeff: resolution of the power/energy spectrum
@param half_fft_order: length of the power/energy spectrum
@param fft_index_start: first spectrum element the weights apply to (output)
@param filter: the filter weights (output)

277 @see fft2fbank
278 @see Hz2Mel
279 @see Mel2Hz
280 */
281
282 void make_mel_triangular_filter(const float this_mel_centre,
283  const float this_mel_low,
284  const float this_mel_high,
285  const float Hz_per_fft_coeff,
286  const int half_fft_order,
287  int &fft_index_start,
288  EST_FVector &filter);
289
290 /**@name Frequency conversion functions
291
292 These are functions used in \Ref{Filter bank and cepstral analysis}.
293 */
294
295 //@{
296
297 /** Convert Hertz to Mel. The Mel scale is defined by
298 $f_{\mbox{Mel}} = 1127 \; log( 1 + \frac{f_{\mbox{Hertz}}}{700} )$
NOTE(review): the constant 1127 suggests the natural logarithm is meant --
confirm against the implementation.

@param frequency_in_Hertz: the frequency to convert, in Hertz
@return the corresponding frequency in Mel
299
300 @see Mel2Hz
301 @see Frequency conversion functions
302 */
303
304 float Hz2Mel(float frequency_in_Hertz);
305
306 /**
307 Convert Mel to Hertz. Presumably the inverse of the relationship given
under {\tt Hz2Mel} -- confirm against the implementation.

@param frequency_in_Mel: the frequency to convert, in Mel
@return the corresponding frequency in Hertz
308
309 @see Hz2Mel
310 */
311
312 float Mel2Hz(float frequency_in_Mel);
313
314 //@}
315 // end of frequency conversion functions
316
317 //@}
318 // end of filter bank and cepstral analysis
319
320
321
322
323 #endif /* __EST_SIGPR_FRAME_H__ */