Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_sigpr_utt.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 
34 #ifndef __EST_SIGPR_UTT_H__
35 #define __EST_SIGPR_UTT_H__
36 
37 #include "sigpr/EST_sigpr_frame.h"
38 #include "sigpr/EST_Window.h"
39 #include "EST_Track.h"
40 #include "EST_Wave.h"
41 
42 #define DEFAULT_WINDOW_NAME "hamming" /* window name passed to EST_Window::creator() to build the default wf argument of sig2coef, fbank and melcep */
43 #define DEFAULT_FRAME_FACTOR 2.0 /* presumably the default frame length factor; NOTE(review): sig2coef spells its default as a literal 2.0 instead of using this macro */
44 
45 /* Note: some of these functions deliberately don't have
46  doc++ style comments, mainly because they are, or soon will be,
47  superseded.
48 */
49 
50 /**@name Functions for use with frame based processing
51 
52 In the following functions, the input is a \Ref{EST_Wave} waveform,
53 and the output is a (usually multi-channel) \Ref{EST_Track}. The
54 track must be set up appropriately beforehand. This means the track
55 must be resized accordingly with the correct number of frames and
56 channels.
57 
58 The positions of the frames are found by examination of the {\bf time}
59 array in the EST_Track, which must be filled prior to the function
60 call. The usual requirement is for fixed frame analysis, where each
61 analysis frame is, say, 10ms after the previous one.
62 
63 A common alternative is to perform pitch-synchronous
64 analysis where the time shift is related to the local pitch period.
65 
66 */
67 
68 //@{
69 
70 /** Produce a single set of coefficients from a waveform. The type of
71  coefficient required is given in the argument <parameter>type</parameter>.
72  Possible types are:
73 
74 <variablelist>
75 
76 <varlistentry><term>lpc</term><listitem>linear predictive coding</listitem></varlistentry>
77 
78 <varlistentry><term>cep</term><listitem>cepstrum coding from lpc coefficients</listitem></varlistentry>
79 
80 <varlistentry><term>melcep</term><listitem>Mel scale cepstrum coding via fbank</listitem></varlistentry>
81 
82 <varlistentry><term>fbank</term><listitem>Mel scale log filterbank analysis</listitem></varlistentry>
83 
84 <varlistentry><term>lsf</term><listitem>line spectral frequencies</listitem></varlistentry>
85 
86 <varlistentry><term>ref</term><listitem>Linear prediction reflection coefficients</listitem></varlistentry>
87 
88 <varlistentry><term>power</term><listitem>power of each frame</listitem></varlistentry>
89 
90 <varlistentry><term>f0</term><listitem>srpd algorithm</listitem></varlistentry>
91 
92 <varlistentry><term>energy</term><listitem>root mean square energy</listitem></varlistentry>
93 
94 </variablelist>
95 
96 The order of the analysis is calculated from the number of
97 channels in <parameter>a</parameter>. The positions of the analysis
98 windows must be given by filling in the track's time array.
99 
100 This function windows the waveform at the intervals given by the track
101 time array. The length of each window is <parameter>factor</parameter>
102 * the local time shift. The windowing function is given by
103 <parameter>wf</parameter>.
104 
105 @param sig: input waveform
106 @param a: output coefficients. These have been pre-allocated and the
107  number of channels in a indicates the order of the analysis.
108 @param type: the types of coefficients to be produced. "lpc", "cep" etc
109 @param factor: the frame length factor, i.e. the analysis frame length
110  will be this times the local pitch period. Defaults to 2.0.
111 
112 @param wf: function for windowing. See \Ref{Windowing mechanisms}. Defaults to the window created from DEFAULT_WINDOW_NAME.
113 */
114 
115 void sig2coef(EST_Wave &sig, EST_Track &a, EST_String type,
116  float factor = 2.0,
117  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME));
118 
119 /** Produce multiple coefficients from a waveform by repeated calls to
120  sig2coef.
121 
122 @param sig: input waveform
123 @param fv: output coefficients. These have been pre-allocated and the
124  number of channels in fv indicates the order of the analysis.
125 @param op: Features structure containing options for analysis order,
126  frame shift etc.
127 @param slist: list of types of coefficients required, from the set of
128 possible types that sig2coef can take.
129 */
130 
131 void sigpr_base(EST_Wave &sig, EST_Track &fv, EST_Features &op,
132  const EST_StrList &slist);
133 
134 /** Calculate the power for each frame of the waveform.
135 
136 @param sig: input waveform
137 @param a: output power track
138 @param factor: the frame length factor, i.e. the analysis frame length
139  will be this times the local pitch period (required: no default is declared).
140 */
141 
142 void power(EST_Wave &sig, EST_Track &a, float factor);
143 
144 /** Calculate the rms energy for each frame of the waveform.
145 
146 This function calls
147 \Ref{sig2energy}
148 
149 
150 @param sig: input waveform
151 @param a: output coefficients
152 @param factor: the frame length factor, i.e. the analysis frame length
153  will be this times the local pitch period. No default is declared
154  in this header, so the caller must supply it.
155 */
156 
157 void energy(EST_Wave &sig, EST_Track &a, float factor);
158 
159 
160 /** Mel scale filter bank analysis. The Mel scale triangular filters
161 are computed via an FFT (see \Ref{fastFFT}). This routine is required
162 for Mel cepstral analysis (see \Ref{melcep}). The analysis of each
163 frame is done by \Ref{sig2fbank}.
164 
165 A typical filter bank analysis for speech recognition might use log
166 energy outputs from 20 filters.
167 
168 @param sig: input waveform
169 @param fbank: the output. The number of filters is determined from the
170  size of this track (presumably its number of channels -- TODO confirm).
171 @param factor: the frame length factor, i.e. the analysis frame length
172  will be this times the local pitch period (required: no default is declared)
173 @param wf: function for windowing. See \Ref{Windowing mechanisms}. Defaults to the window created from DEFAULT_WINDOW_NAME.
174 @param up: whether the filterbank analysis should use
175  power rather than energy. Defaults to false.
176 @param take_log: whether to take logs of the filter outputs. Defaults to true.
177 
178 @see sig2fbank
179 @see melcep
180 */
181 
182 void fbank(EST_Wave &sig,
183  EST_Track &fbank,
184  const float factor,
185  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
186  const bool up = false,
187  const bool take_log = true);
188 
189 /** Mel scale cepstral analysis via filter bank analysis. Cepstral
190 parameters are computed for each frame of speech. The analysis
191 requires \Ref{fbank}. The cepstral analysis of the filterbank outputs
192 is performed by \Ref{fbank2melcep}.
193 
194 A typical Mel cepstral coefficient (MFCC) analysis for speech recognition
195 might use 12 cepstral coefficients computed from a 20 channel filterbank.
196 
197 
198 @param sig: input waveform
199 @param mfcc_track: the output
200 @param factor: the frame length factor, i.e. the analysis frame length
201  will be this times the local pitch period
202 @param fbank_order: the number of Mel scale filters used for the analysis
203 @param liftering_parameter: for filtering in the cepstral domain
204  See \Ref{fbank2melcep}
205 @param wf: function for windowing. See \Ref{Windowing mechanisms}. Defaults to the window created from DEFAULT_WINDOW_NAME.
206 @param include_c0: whether the zero'th cepstral coefficient is to be included. Defaults to false.
207 @param up: whether the filterbank analysis should use
208  power rather than energy. Defaults to false.
209 
210 @see fbank
211 @see fbank2melcep
212 */
213 
214 void melcep(EST_Wave &sig,
215  EST_Track &mfcc_track,
216  float factor,
217  int fbank_order,
218  float liftering_parameter,
219  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
220  const bool include_c0 = false,
221  const bool up = false);
222 
223 //@}
224 
225 
226 /**@name Pitch/F0 Detection Algorithm functions
227 
228 These functions are used to produce a track of fundamental frequency
229 (F0) against time of a waveform.
230 */
231 
232 //@{
233 
234 
235 /** Top level pitch (F0) detection algorithm. Returns a track
236 containing evenly spaced frames of speech, each containing an F0 value
237 for that point.
238 
239 At present, only the \Ref{srpd} pitch tracker is implemented, so
240 this is always called regardless of what <parameter>method</parameter>
241 is set to.
242 
243 @param sig: input waveform
244 @param fz: output f0 contour
245 @param op: parameters for pitch tracker
246 @param method: pda method to be used. Defaults to the empty string.
247 */
248 
249 
250 void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method="");
251 
252 
253 /** Top level intonation contour detection algorithm. Returns a track
254 containing evenly spaced frames of speech, each containing an F0 for that point. {\tt icda} differs from \Ref{pda} in that the contour is
255 smoothed, and unvoiced portions have interpolated F0
256 values.
257 
258 @param sig: input waveform
259 @param fz: output f0 contour
260 @param speech: Interpolation is controlled by the <tt>speech</tt> track. When
261 a point has a positive value in the speech track, it is a candidate
262 for interpolation.
263 @param op: parameters for pitch tracker (declared as EST_Option here, whereas pda takes EST_Features)
264 @param method: pda method to be used. Defaults to the empty string.
265 */
266 
267 void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech,
268  EST_Option &op, EST_String method = "");
269 
270 /** Create a set of sensible defaults for use in pda and icda.
271 @param al: Features structure, presumably filled in with the defaults -- TODO confirm against the implementation
272 */
273 void default_pda_options(EST_Features &al);
274 
275 
276 /** Super resolution pitch tracker.
277 
278 srpd is a pitch detection algorithm that produces a fundamental
279 frequency contour from a speech waveform. At present only the super
280 resolution pitch determination algorithm is implemented. See (Medan,
281 Yair, and Chazan, 1991) and (Bagshaw et al., 1993) for a detailed
282 description of the algorithm. </para><para>
283 
284 Frames of data are read in from <parameter>sig</parameter> in
285 chronological order such that each frame is shifted in time from its
286 predecessor by <parameter>pda_frame_shift</parameter>. Each frame is
287 analysed in turn.
288 
289 </para><para>
290 
291 The maximum and minimum signal amplitudes are initially found over the
292 duration of two segments, each of length N_min samples. If the sum of
293 their absolute values is below two times
294 <parameter>noise_floor</parameter>, the frame is classified as
295 representing silence and no coefficients are calculated. Otherwise, a
296 cross correlation coefficient is calculated for all n from a period in
297 samples corresponding to <parameter>min_pitch
298 </parameter> to a period in samples corresponding to
299 <parameter>max_pitch</parameter>, in steps
300 of <parameter>decimation_factor</parameter>. In calculating the
301 coefficient only one in <parameter>decimation_factor</parameter>
302 samples of the two segments are used. Such down-sampling permits rapid
303 estimates of the coefficients to be calculated over the range
304 N_min <= n <= N_max. This results in a cross-correlation track for the
305 frame being analysed. </para><para>
306 
307 Local maxima of the track with a coefficient value above a specified
308 threshold form candidates for the fundamental period. The threshold is
309 adaptive and dependent upon the values <parameter>v2uv_coef_thresh
310 </parameter>, <parameter>min_v2uv_coef_thresh </parameter>, and
311 <parameter> v2uv_coef_thresh_ratio</parameter>. If the previously
312 analysed frame was classified as unvoiced or silent (which is the
313 initial state) then the threshold is set to
314 <parameter>v2uv_coef_thresh</parameter>. Otherwise, the previous
315 frame was classified as being voiced, and the threshold is set equal
316 to <parameter>v2uv_coef_thresh_ratio
317 </parameter> times the cross-correlation coefficient
318 value at the point of the previous fundamental period in the former
319 coefficients track. This product is not permitted to drop below
320 <parameter>v2uv_coef_thresh</parameter>.
321 
322 </para><para>
323 
324 If no candidates for the fundamental period are found, the frame is classified
325 as being unvoiced. Otherwise, the candidates are further processed to identify
326 the most likely true pitch period. During this additional processing, a
327 threshold given by <parameter>anti_doubling_thresh</parameter> is used.
328 
329 </para><para>
330 
331 If the <parameter>peak_tracking</parameter> flag is set to true,
332 biasing is applied to the cross-correlation track as described in
333 (Bagshaw et al., 1993). </para><para> </para><para>
334 
335 @param fz: output f0 contour (described this way for pda; presumably the same here -- TODO confirm)
336 @param sig: input waveform
337 @param options: options regarding pitch tracking parameters; the op.* entries below name its individual settings
338 @param op.min_pitch: minimum permitted F0 value
339 @param op.max_pitch: maximum permitted F0 value
340 @param op.pda_frame_shift: analysis frame shift
341 @param op.pda_frame_length: analysis frame length
342 @param op.lpf_cutoff: cut off frequency for low pass filtering
343 @param op.lpf_order: order of low pass filtering (must be odd)
344 @param op.decimation
345 @param op.noise_floor
346 @param op.min_v2uv_coef_thresh
347 @param op.v2uv_coef_thresh_ratio
348 @param op.v2uv_coef_thresh
349 @param op.anti_doubling_thresh
350 @param op.peak_tracking
351 
352 */
353 void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &options);
354 
355 /** Smooth selected parts of an f0 contour. Interpolation is
356 controlled by the <tt>speech</tt> track. When a point has a positive
357 value in the speech track, it is a candidate for interpolation. NOTE(review): c is presumably the input contour and sm the smoothed output -- confirm against the implementation.
358 */
359 void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options,
360  EST_Track &sm);
361 
362 /** Smooth all the points in an F0 contour. NOTE(review): there is no separate output track, so c is presumably smoothed in place -- confirm. */
363 void smooth_portion(EST_Track &c, EST_Option &op);
364 
365 //@}
366 
367 
368 /**@name Delta and Acceleration coefficients
369 
370 Produce delta and acceleration coefficients from a set of coefficients
371 or the waveform.
372 */
373 
374 //@{
375 
376 /** Produce a set of delta coefficients for a track
377 
378 The delta function is used to produce a set of coefficients which
379 estimate the rate of change of a set of parameters. The output track
380 <parameter>d</parameter> must be set up beforehand, i.e. it must have
381 the same number of frames and channels as <parameter>tr</parameter>.
382 
383 @param tr: input track of base coefficients
384 @param d: output track of delta coefficients.
385 @param regression_length: number of previous frames on which delta
386  estimation is calculated. Defaults to 3.
387 */
388 
389 void delta(EST_Track &tr, EST_Track &d, int regression_length = 3);
390 
391 /** Produce multiple sets of delta coefficients from a waveform.
392 
393  Calculate specified types of delta coefficients. This function is
394  used when the base types of coefficients haven't been calculated.
395  This function calls sig2coef to calculate the base types from which
396  the deltas are calculated, and hence the requirements governing the
397  setup of <parameter>fv</parameter> for sig2coef also hold here.
398 
399 @param sig: input waveform
400 @param fv: output coefficients. These have been pre-allocated and the
401  number of channels in fv indicates the order of the analysis.
402 @param op: Features structure containing options for analysis order,
403  frame shift etc.
404 @param slist: list of types of delta coefficients required.
405 */
406 
407 void sigpr_delta(EST_Wave &sig, EST_Track &fv, EST_Features &op,
408  const EST_StrList &slist);
409 
410 /** Produce multiple sets of acceleration coefficients from a waveform
411 
412  Calculate specified types of acceleration coefficients. This function
413  is used when the base types of coefficient haven't been calculated.
414  This function calls sig2coef to calculate the base types from which
415  the deltas are calculated, and hence the requirements governing the
416  setup of <parameter>fv</parameter> for sig2coef also hold here.
417 
418 @param sig: input waveform
419 @param fv: output coefficients. These have been pre-allocated and the
420  number of channels in fv indicates the order of the analysis.
421 @param op: Features structure containing options for analysis order,
422  frame shift etc.
423 @param slist: list of types of acceleration coefficients required.
424 
425 
426 The delta function is used to produce a set of coefficients which
427 estimate the rate of change of a set of parameters.
428 */
429 
430 void sigpr_acc(EST_Wave &sig, EST_Track &fv, EST_Features &op,
431  const EST_StrList &slist);
432 
433 //@}
434 
435 /* Convert a track containing coefficients of one type to a track
436 containing coefficients of another.
437 
438 @param in_track: input set of coefficients
439 @param out_track: output set of coefficients
440 @param out_type: name of desired output coefficients.
441 @param in_type: optional (defaults to the empty string): often it is possible to determine the type of
442 the input coefficients from the channel names. If this is not possible or
443 these names should be ignored, the {\tt in_type} parameter can be used.
444 
445 */
446 
447 void convert_track(EST_Track &in_track, EST_Track &out_track,
448  const EST_String &out_type,
449  const EST_String &in_type = "");
450 
451 
452 
453 #endif /* __EST_SIGPR_UTT_H__ */
454