Edinburgh Speech Tools
2.4-release
All
Classes
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Pages
EST_sigpr_utt.h
1
/*************************************************************************/
2
/* */
3
/* Centre for Speech Technology Research */
4
/* University of Edinburgh, UK */
5
/* Copyright (c) 1995,1996 */
6
/* All Rights Reserved. */
7
/* */
8
/* Permission is hereby granted, free of charge, to use and distribute */
9
/* this software and its documentation without restriction, including */
10
/* without limitation the rights to use, copy, modify, merge, publish, */
11
/* distribute, sublicense, and/or sell copies of this work, and to */
12
/* permit persons to whom this work is furnished to do so, subject to */
13
/* the following conditions: */
14
/* 1. The code must retain the above copyright notice, this list of */
15
/* conditions and the following disclaimer. */
16
/* 2. Any modifications must be clearly marked as such. */
17
/* 3. Original authors' names are not deleted. */
18
/* 4. The authors' names are not used to endorse or promote products */
19
/* derived from this software without specific prior written */
20
/* permission. */
21
/* */
22
/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25
/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30
/* THIS SOFTWARE. */
31
/* */
32
/*************************************************************************/
33
34
#ifndef __EST_SIGPR_UTT_H__
35
#define __EST_SIGPR_UTT_H__
36
37
#include "sigpr/EST_sigpr_frame.h"
38
#include "sigpr/EST_Window.h"
39
#include "EST_Track.h"
40
#include "EST_Wave.h"
41
42
/* Name of the window function that this header passes to
   EST_Window::creator() to build the default windowing argument. */
#define DEFAULT_WINDOW_NAME "hamming"
43
/* Default frame length factor.
   NOTE(review): presumably the analysis window length expressed as a
   multiple of the local time shift, as for the `factor` arguments
   declared below -- confirm. */
#define DEFAULT_FRAME_FACTOR 2.0
44
45
/* Note: some of these functions deliberately don't have
46
doc++ style comments, mainly because they are, or will be
47
superseded soon.
48
*/
49
50
/**@name Functions for use with frame based processing
51
52
In the following functions, the input is a \Ref{EST_Wave} waveform,
53
and the output is a (usually multi-channel) \Ref{EST_Track}. The
54
track must be set up appropriately beforehand. This means the track
55
must be resized accordingly with the correct number of frames and
56
channels.
57
58
The positions of the frames are found by examination of the {\bf time}
59
array in the EST_Track, which must be filled prior to the function
60
call. The usual requirement is for fixed frame analysis, where each
61
analysis frame is, say, 10ms after the previous one.
62
63
A common alternative is to perform pitch-synchronous
64
analysis where the time shift is related to the local pitch period.
65
66
*/
67
68
//@{
69
70
/** Produce a single set of coefficients from a waveform. The type of
71
coefficient required is given in the argument <parameter>type</parameter>.
72
Possible types are:
73
74
<variablelist>
75
76
<varlistentry><term>lpc</term><listitem>linear predictive coding</listitem></varlistentry>
77
78
<varlistentry><term>cep</term><listitem>cepstrum coding from lpc coefficients</listitem></varlistentry>
79
80
<varlistentry><term>melcep</term><listitem>Mel scale cepstrum coding via fbank</listitem></varlistentry>
81
82
<varlistentry><term>fbank</term><listitem>Mel scale log filterbank analysis</listitem></varlistentry>
83
84
<varlistentry><term>lsf</term><listitem>line spectral frequencies</listitem></varlistentry>
85
86
<varlistentry><term>ref</term><listitem>Linear prediction reflection coefficients</listitem></varlistentry>
87
88
<varlistentry><term>power</term><listitem></listitem></varlistentry>
89
90
<varlistentry><term>f0</term><listitem>srpd algorithm</listitem></varlistentry>
91
92
<varlistentry><term>energy</term><listitem>root mean square energy</listitem></varlistentry>
93
94
</variablelist>
95
96
The order of the analysis is calculated from the number of
97
channels in <parameter>fv</parameter>. The positions of the analysis
98
windows must be given by filling in the track's time array.
99
100
This function windows the waveform at the intervals given by the track
101
time array. The length of each window is <parameter>factor<parameter>
102
* the local time shift. The windowing function is giveb by
103
<parameter>wf</parameter>.
104
105
@param sig: input waveform
106
@param fv: output coefficients. These have been pre-allocated and the
107
number of channels in a indicates the order of the analysis.
108
@param type: the types of coefficients to be produced. "lpc", "cep" etc
109
@param factor: the frame length factor, i.e. the analysis frame length
110
will be this times the local pitch period.
111
112
@param wf: function for windowing. See \Ref{Windowing mechanisms}
113
*/
114
115
void
sig2coef(
EST_Wave
&sig,
EST_Track
&a,
EST_String
type,
116
float
factor = 2.0,
117
EST_WindowFunc *wf =
EST_Window::creator
(DEFAULT_WINDOW_NAME));
118
119
/** Produce multiple coefficients from a waveform by repeated calls to
120
sig2coef.
121
122
@param sig: input waveform
123
@param fv: output coefficients. These have been pre-allocated and the
124
number of channels in a indicates the order of the analysis.
125
@param op: Features structure containing options for analysis order,
126
frame shift etc.
127
@param slist: list of types of coefficients required, from the set of
128
possible types that sig2coef can take.
129
*/
130
131
void
sigpr_base(
EST_Wave
&sig,
EST_Track
&fv,
EST_Features
&op,
132
const
EST_StrList
&slist);
133
134
/** Calculate the power for each frame of the waveform.
135
136
@param sig: input waveform
137
@param a: output power track
138
@param factor: the frame length factor, i.e. the analysis frame length
139
will be this times the local pitch period.
140
*/
141
142
void
power(
EST_Wave
&sig,
EST_Track
&a,
float
factor);
143
144
/** Calculate the rms energy for each frame of the waveform.
145
146
This function calls
147
\Ref{sig2energy}
148
149
150
@param sig input waveform
151
@param a output coefficients
152
@param factor optional: the frame length factor, i.e. the analysis frame length
153
will be this times the local pitch period.
154
155
*/
156
157
void
energy(
EST_Wave
&sig,
EST_Track
&a,
float
factor);
158
159
160
/** Mel scale filter bank analysis. The Mel scale triangular filters
161
are computed via an FFT (see \Ref{fastFFT}). This routine is required
162
for Mel cepstral analysis (see \Ref{melcep}). The analysis of each
163
frame is done by \Ref{sig2fbank}.
164
165
A typical filter bank analysis for speech recognition might use log
166
energy outputs from 20 filters.
167
168
@param sig: input waveform
169
@param fbank: the output. The number of filters is determined from the number
170
size of this track.
171
@param factor: the frame length factor, i.e. the analysis frame length
172
will be this times the local pitch period
173
@param wf: function for windowing. See \Ref{Windowing mechanisms}
174
@param up: whether the filterbank analysis should use
175
power rather than energy.
176
@param take_log: whether to take logs of the filter outputs
177
178
@see sig2fbank
179
@see melcep
180
*/
181
182
void
fbank(
EST_Wave
&sig,
183
EST_Track
&fbank,
184
const
float
factor,
185
EST_WindowFunc *wf =
EST_Window::creator
(DEFAULT_WINDOW_NAME),
186
const
bool
up =
false
,
187
const
bool
take_log =
true
);
188
189
/** Mel scale cepstral analysis via filter bank analysis. Cepstral
190
parameters are computed for each frame of speech. The analysis
191
requires \Ref{fbank}. The cepstral analysis of the filterbank outputs
192
is performed by \Ref{fbank2melcep}.
193
194
A typical Mel cepstral coefficient (MFCC) analysis for speech recognition
195
might use 12 cepstral coefficients computed from a 20 channel filterbank.
196
197
198
@param sig input: waveform
199
@param mfcc_track: the output
200
@param factor: the frame length factor, i.e. the analysis frame length
201
will be this times the local pitch period
202
@param fbank_order: the number of Mel scale filters used for the analysis
203
@param liftering_parameter: for filtering in the cepstral domain
204
See \Ref{fbank2melcep}
205
@param wf: function for windowing. See \Ref{Windowing mechanisms}
206
@param include_c0: whether the zero'th cepstral coefficient is to be included
207
@param up: whether the filterbank analysis should use
208
power rather than energy.
209
210
@see fbank
211
@see fbank2melcep
212
*/
213
214
void
melcep(
EST_Wave
&sig,
215
EST_Track
&mfcc_track,
216
float
factor,
217
int
fbank_order,
218
float
liftering_parameter,
219
EST_WindowFunc *wf =
EST_Window::creator
(DEFAULT_WINDOW_NAME),
220
const
bool
include_c0 =
false
,
221
const
bool
up =
false
);
222
223
//@}
224
225
226
/**@name Pitch/F0 Detection Algorithm functions
227
228
These functions are used to produce a track of fundamental frequency
229
(F0) against time of a waveform.
230
*/
231
232
//@{
233
234
235
/** Top level pitch (F0) detection algorithm. Returns a track
236
containing evenly spaced frames of speech, each containing a F0 value
237
for that point.
238
239
At present, only the \Rref{srpd} pitch tracker is implemented, so
240
this is always called regardless of what <parameter>method</parameter>
241
is set to.
242
243
@param sig: input waveform
244
@param fz: output f0 contour
245
@param op: parameters for pitch tracker
246
@param method: pda method to be used.
247
*/
248
249
250
void
pda(
EST_Wave
&sig,
EST_Track
&fz,
EST_Features
&op,
EST_String
method=
""
);
251
252
253
/** Top level intonation contour detection algorithm. Returns a track
254
containing evenly spaced frames of speech, each containing a F0 for that point. {\tt icda} differs from \Ref{pda} in that the contour is
255
smoothed, and unvoiced portions have interpolated F0
256
values.
257
258
@param sig: input waveform
259
@param fz: output f0 contour
260
@param speech: Interpolation is controlled by the <tt>speech</tt> track. When
261
a point has a positive value in the speech track, it is a candidate
262
for interpolation.
263
@param op: parameters for pitch tracker
264
@param method: pda method to be used.
265
*/
266
267
void
icda(
EST_Wave
&sig,
EST_Track
&fz,
EST_Track
&speech,
268
EST_Option
&op,
EST_String
method =
""
);
269
270
/** Create a set sensible defaults for use in pda and icda.
271
272
*/
273
void
default_pda_options(
EST_Features
&al);
274
275
276
/** Super resolution pitch tracker.
277
278
srpd is a pitch detection algorithm that produces a fundamental
279
frequency contour from a speech waveform. At present only the super
280
resolution pitch determination algorithm is implemented. See (Medan,
281
Yair, and Chazan, 1991) and (Bagshaw et al., 1993) for a detailed
282
description of the algorithm. </para><para>
283
284
Frames of data are read in from <parameter>sig</parameter> in
285
chronological order such that each frame is shifted in time from its
286
predecessor by <parameter>pda_frame_shift</parameter>. Each frame is
287
analysed in turn.
288
289
</para><para>
290
291
The maximum and minimum signal amplitudes are initially found over the
292
duration of two segments, each of length N_min samples. If the sum of
293
their absolute values is below two times
294
<parameter>noise_floor</parameter>, the frame is classified as
295
representing silence and no coefficients are calculated. Otherwise, a
296
cross correlation coefficient is calculated for all n from a period in
297
samples corresponding to <parameter>min_pitch
298
</parameter> to a period in samples corresponding to
299
<parameter>max_pitch</parameter>, in steps
300
of <parameter>decimation_factor</parameter>. In calculating the
301
coefficient only one in <parameter>decimation_factor</parameter>
302
samples of the two segments are used. Such down-sampling permits rapid
303
estimates of the coefficients to be calculated over the range
304
N_min <= n <= N_max. This results in a cross-correlation track for the
305
frame being analysed. </para><para>
306
307
Local maxima of the track with a coefficient value above a specified
308
threshold form candidates for the fundamental period. The threshold is
309
adaptive and dependent upon the values <parameter>v2uv_coeff_thresh
310
</parameter>, <parameter>min_v2uv_coef_thresh </parameter>, and
311
<parameter> v2uv_coef_thresh_rati_ratio</parameter>. If the previously
312
analysed frame was classified as unvoiced or silent (which is the
313
initial state) then the threshold is set to
314
<parameter>v2uv_coef_thresh</parameter>. Otherwise, the previous
315
frame was classified as being voiced, and the threshold is set equal
316
to [\-r] <parameter>v2uv_coef_thresh_rati_ratio
317
</parameter> times the cross-correlation coefficient
318
value at the point of the previous fundamental period in the former
319
coefficients track. This product is not permitted to drop below
320
<parameter>v2uv_coef_thresh</parameter>.
321
322
</para><para>
323
324
If no candidates for the fundamental period are found, the frame is classified
325
as being unvoiced. Otherwise, the candidates are further processed to identify
326
the most likely true pitch period. During this additional processing, a
327
threshold given by <parameter>anti_doubling_thres</parameter> is used.
328
329
</para><para>
330
331
If the <parameter>peak_tracking</parameter> flag is set to true,
332
biasing is applied to the cross-correlation track as described in
333
(Bagshaw et al., 1993). </para><para> </para><para>
334
335
336
@param sig: input waveform
337
@param op: options regarding pitch tracking parameters
338
@param op.min_pitch: minimum permitted F0 value
339
@param op.max_pitch: maximum permitted F0 value
340
@param op.pda_frame_shift: analysis frame shift
341
@param op.pda_frame_length: analysis frame length
342
@param op.lpf_cutoff: cut off frequency for low pass filtering
343
@param op.lpf_order: order of low pass filtering (must be odd)
344
@param op.decimation
345
@param op.noise_floor
346
@param op.min_v2uv_coef_thresh
347
@param op.v2uv_coef_thresh_ratio
348
@param op.v2uv_coef_thresh
349
@param op.anti_doubling_thresh
350
@param op.peak_tracking
351
352
*/
353
void
srpd(
EST_Wave
&sig,
EST_Track
&fz,
EST_Features
&options);
354
355
/** Smooth selected parts of an f0 contour. Interpolation is
356
controlled by the <tt>speech</tt> track. When a point has a positive
357
value in the speech track, it is a candidate for interpolation.
358
*/
359
void
smooth_phrase(
EST_Track
&c,
EST_Track
&speech,
EST_Features
&options,
360
EST_Track
&sm);
361
362
/** Smooth all the points in an F0 contour*/
363
void
smooth_portion(
EST_Track
&c,
EST_Option
&op);
364
365
//@}
366
367
368
/**@name Delta and Acceleration coefficients
369
370
Produce delta and acceleration coefficients from a set of coefficients
371
or the waveform.
372
*/
373
374
//@{
375
376
/** Produce a set of delta coefficients for a track
377
378
The delta function is used to produce a set of coefficients which
379
estimate the rate of change of a set of parameters. The output track
380
<parameter>d<parameter> must be setup before hand, i.e. it must have
381
the same number of frames and channels as <parameter>tr</parameter>.
382
383
@param tr: input track of base coefficients
384
@param d: output track of delta coefficients.
385
@param regression_length: number of previous frames on which delta
386
estimation is calculated on.
387
*/
388
389
void
delta(
EST_Track
&tr,
EST_Track
&d,
int
regression_length = 3);
390
391
/** Produce multiple sets of delta coefficients from a waveform.
392
393
Calculate specified types of delta coefficients. This function is
394
used when the base types of coefficients haven't been calculated.
395
This function calls sig2coef to calculate the base types from which
396
the deltas are calculated, and hence the requirements governing the
397
setup of <parameter>fv</parameter> for sig2coef also hold here.
398
399
@param sig: input waveform
400
@param fv: output coefficients. These have been pre-allocated and the
401
number of channels in a indicates the order of the analysis.
402
@param op: Features structure containing options for analysis order,
403
frame shift etc.
404
@param slist: list of types of delta coefficients required.
405
*/
406
407
void
sigpr_delta(
EST_Wave
&sig,
EST_Track
&fv,
EST_Features
&op,
408
const
EST_StrList
&slist);
409
410
/** Produce multiple sets of acceleration coefficients from a waveform
411
412
Calculate specified types of acceleration coefficients. This function
413
is used when the base types of coefficient haven't been calculated.
414
This function calls sig2coef to calculate the base types from which
415
the deltas are calculated, and hence the requirements governing the
416
setup of <parameter>fv</parameter> for sig2coef also hold here.
417
418
@param sig: input waveform
419
@param fv: output coefficients. These have been pre-allocated and the
420
number of channels in a indicates the order of the analysis.
421
@param op: Features structure containing options for analysis order,
422
frame shift etc.
423
@param slist: list of types of acceleration coefficients required.
424
425
426
The delta function is used to produce a set of coefficients which
427
estimate the rate of change of a set of parameters.
428
*/
429
430
void
sigpr_acc(
EST_Wave
&sig,
EST_Track
&fv,
EST_Features
&op,
431
const
EST_StrList
&slist);
432
433
//@}
434
435
/* Convert a track containing coefficients of one type to a track
436
containing coefficients of another.
437
438
@param in_track input set of coefficients
439
@param out_track input set of coefficients
440
@param out_name name of desired output coefficients.
441
@param in_name optional: often it is possible to determine the type of
442
the input coefficients from the channel names. If this is not possible or
443
these names should be ignored, the {\tt in_type} parameter can be used.
444
445
*/
446
447
void
convert_track(
EST_Track
&in_track,
EST_Track
&out_track,
448
const
EST_String
&out_type,
449
const
EST_String
&in_type =
""
);
450
451
452
453
#endif
/* __EST_SIGPR_UTT_H__ */
454
include
sigpr
EST_sigpr_utt.h
Generated on Wed Dec 24 2014 09:16:35 for Edinburgh Speech Tools by
1.8.3.1