37 #include "EST_speech_class.h"
38 #include "sigpr/EST_sigpr_utt.h"
39 #include "sigpr/EST_filter.h"
41 #include "EST_error.h"
42 #include "EST_string_aux.h"
// Pitch-detection method dispatch (fragment: the enclosing function header and
// some branch lines are not visible in this view).
// Fetch the method name from the "pda_method" option.
56 method = op.
S(
"pda_method");
// "srpd" is the only method name compared against in the visible lines.
60 else if (method ==
"srpd")
// Report the unrecognised method name through EST_error
// (presumably the final fallback branch -- TODO confirm against the full file).
63 EST_error(
"Unknown pda %s\n", (
const char *)method);
// Pitch detection followed by smoothing (fragment: the enclosing function
// header and some branch lines are not visible in this view).
// Fetch the method name from the "pda_method" option.
73 method = op.
S(
"pda_method");
// Both visible branches run srpd() on the signal, writing the raw contour
// into raw_fz. NOTE(review): the condition guarding this first call is not
// visible here.
76 srpd(sig, raw_fz, op);
77 else if (method ==
"srpd")
78 srpd(sig, raw_fz, op);
// Report the unrecognised method name through EST_error
// (presumably the final fallback branch -- TODO confirm against the full file).
80 EST_error(
"Unknown pda %s\n", (
const char *)method);
// Post-process the raw contour: smooth_phrase receives raw_fz, speech and the
// options, and fz as its last argument (presumably the output track).
82 smooth_phrase(raw_fz, speech, op, fz);
// Option-driven entry to SRPD (fragment: function header not visible).
// Start from the built-in parameter defaults, then overlay whatever the
// option list `op` supplies.
89 default_srpd_op(&srpd_op);
90 parse_srpd_list(op, &srpd_op);
// Optional pre-filtering: when "do_low_pass" is non-zero (it defaults to 0
// here), low-pass filter the signal using the "lpf_cutoff" and "lpf_order"
// options.
92 if (op.
I(
"do_low_pass",0))
93 FIRlowpass_filter(sig, op.
I(
"lpf_cutoff"),op.
I(
"lpf_order"));
// Run the tracker with the assembled parameters; the last argument is the
// "srpd_resize" option, defaulting to 0.
95 srpd(sig, fz, srpd_op, op.
I(
"srpd_resize", 0));
// Core SRPD tracking loop: each wave segment is analysed by
// super_resolution_pda() and pitch values are written into the track `fz`.
// NOTE(review): this view omits several original lines (function header,
// other declarations, braces and else branches), so the comments below
// describe only the statements that are visible.
108 int i, rns, tracklen, j = 0;
// pda_status holds the result for the current segment; held_status keeps a
// result that was flagged HOLD until a later segment is processed.
111 STATUS_ pda_status, held_status;
// Local copies of the configured pitch limits.
114 min = srpd_op.min_pitch;
115 max = srpd_op.max_pitch;
// Set up the segment buffer, the cross-correlation structure and both
// status records from the SRPD parameters.
117 initialise_structures (&srpd_op, &segment, &cc);
118 initialise_status (&srpd_op, &pda_status);
119 initialise_status (&srpd_op, &held_status);
// Frame count derived from the sample count, the segment length and the
// segment shift.
121 tracklen = (sig.
num_samples() - segment.length) / segment.shift + 1;
// Mark the output track as equally spaced.
125 fz.set_equal_space(
true);
// Error raised for a track that is not equally spaced (the guarding
// condition is not visible in this view).
132 EST_error(
"Pitch tracking algorithm must have equal spaced track\n");
// Process segments until read_next_wave_segment() returns 0.
134 while ((rns = read_next_wave_segment (sig, &srpd_op, &segment)) != 0)
// Zero every cross-correlation coefficient before analysing the segment.
138 for (i = 0; i < cc.size; cc.coeff[i++] = 0.0);
139 initialise_status (&srpd_op, &pda_status);
142 super_resolution_pda (&srpd_op, segment, &cc, &pda_status);
// A HOLD result is copied into held_status and marked VOICED / HELD
// (presumably so its output is deferred until the next segment is seen --
// TODO confirm against the full control flow).
143 if (pda_status.s_h == HOLD)
145 held_status.pitch_freq = pda_status.pitch_freq;
146 held_status.v_uv = VOICED;
147 held_status.s_h = HELD;
148 held_status.cc_max = pda_status.cc_max;
149 held_status.threshold = pda_status.threshold;
// With a result already held: if the current segment's pitch is
// BREAK_NUMBER, the held result is turned into an unvoiced break; it is then
// marked SENT and its pitch is stored in the next track slot.
152 if (held_status.s_h == HELD)
154 if (pda_status.pitch_freq == BREAK_NUMBER)
156 held_status.pitch_freq = BREAK_NUMBER;
157 held_status.v_uv = UNVOICED;
159 held_status.s_h = SENT;
160 if (held_status.v_uv != VOICED)
162 fz.
a(j++) = held_status.pitch_freq;
// Store the current segment's pitch in the next track slot (the statement
// controlled by the unvoiced test is not visible in this view).
165 if (pda_status.v_uv != VOICED)
167 fz.
a(j++) = pda_status.pitch_freq;
// A result that is still HELD at this point is written out as an unvoiced
// break (presumably after the loop has finished -- TODO confirm).
170 if (held_status.s_h == HELD)
172 held_status.pitch_freq = BREAK_NUMBER;
173 held_status.v_uv = UNVOICED;
175 fz.
a(j++) = held_status.pitch_freq;
// Tear down the structures set up by initialise_structures().
177 end_structure_use (&segment, &cc);
// Fill every field of the SRPD parameter structure with its built-in default
// (fragment: function header not visible). Most values come from DEFAULT_*
// constants defined outside this view.
182 srpd->L = DEFAULT_DECIMATION;
183 srpd->min_pitch = DEFAULT_MIN_PITCH;
184 srpd->max_pitch = DEFAULT_MAX_PITCH;
185 srpd->shift = DEFAULT_SHIFT;
186 srpd->length = DEFAULT_LENGTH;
187 srpd->Tsilent = DEFAULT_TSILENT;
188 srpd->Tmin = DEFAULT_TMIN;
189 srpd->Tmax_ratio = DEFAULT_TMAX_RATIO;
190 srpd->Thigh = DEFAULT_THIGH;
191 srpd->Tdh = DEFAULT_TDH;
// ASCII output and peak tracking are switched off by default.
192 srpd->make_ascii = 0;
193 srpd->peak_tracking = 0;
194 srpd->sample_freq = DEFAULT_SF;
// Copy analysis options from the option list `al` into the SRPD parameter
// structure `srpd` (fragment: function header not visible). Options guarded
// by al.present() only override the existing field when the key is present.
// NOTE(review): the guards for "decimation", "min_pitch", "max_pitch" and
// "noise_floor" are not visible in this view.
202 srpd->L = al.
I(
"decimation");
204 srpd->min_pitch = al.
F(
"min_pitch");
206 srpd->max_pitch = al.
F(
"max_pitch");
// Frame shift and length are multiplied by 1000.0 before being stored
// (the option defaults in this file are expressed as value / 1000.0).
207 if (al.
present(
"pda_frame_shift"))
208 srpd->shift = al.
F(
"pda_frame_shift") * 1000.0;
209 if (al.
present(
"pda_frame_length"))
210 srpd->length = al.
F(
"pda_frame_length") * 1000.0;
212 srpd->Tsilent = al.
I(
"noise_floor");
// The guard must test the same key that is read ("v2uv_coef_thresh", the
// spelling also used when the option is set); testing the differently spelt
// "v2uv_coeff_thresh" would leave Thigh at its default even when the option
// is supplied.
213 if (al.
present(
"v2uv_coef_thresh"))
214 srpd->Thigh = al.
F(
"v2uv_coef_thresh");
215 if (al.
present(
"min_v2uv_coef_thresh"))
216 srpd->Tmin = al.
F(
"min_v2uv_coef_thresh");
217 if (al.
present(
"v2uv_coef_thresh_ratio"))
218 srpd->Tmax_ratio = al.
F(
"v2uv_coef_thresh_ratio");
219 if (al.
present(
"anti_doubling_thresh"))
220 srpd->Tdh = al.
F(
"anti_doubling_thresh");
221 if (al.
present(
"peak_tracking"))
222 srpd->peak_tracking = al.
I(
"peak_tracking");
223 if (al.
present(
"sample_frequency"))
224 srpd->sample_freq = al.
I(
"sample_frequency");
// Populate the option list `al` with default pitch-detection settings
// (fragment: function header not visible).
// Pitch range, frame shift, low-pass filter settings and output file type
// are given as literal strings.
229 al.
set(
"min_pitch",
"40.0");
230 al.
set(
"max_pitch",
"400.0");
231 al.
set(
"pda_frame_shift",
"0.005");
// Frame length is stored as DEFAULT_LENGTH / 1000.0; the option parser in
// this file multiplies "pda_frame_length" by 1000.0 when reading it back.
232 al.
set(
"pda_frame_length", DEFAULT_LENGTH / 1000.0);
233 al.
set(
"lpf_cutoff",
"600");
234 al.
set(
"lpf_order",
"49");
235 al.
set(
"f0_file_type",
"esps");
// The remaining SRPD settings take their values from the DEFAULT_* constants.
236 al.
set(
"decimation", DEFAULT_DECIMATION);
237 al.
set(
"noise_floor", DEFAULT_TSILENT);
238 al.
set(
"min_v2uv_coef_thresh", DEFAULT_TMIN);
239 al.
set(
"v2uv_coef_thresh_ratio", DEFAULT_TMAX_RATIO);
240 al.
set(
"v2uv_coef_thresh", DEFAULT_THIGH);
241 al.
set(
"anti_doubling_thresh", DEFAULT_TDH);
// Peak tracking is off by default.
242 al.
set(
"peak_tracking", 0);
// User-facing help text for the general pitch-detection options (fragment of
// a string concatenation; the start of the expression is not visible here).
// Defaults for -fmin, -fmax, -shift and -length are formatted from the
// DEFAULT_* constants via ftoString().
// NOTE(review): the default quoted below for -lpfilter and -forder (199)
// differs from the "lpf_cutoff" = "600" / "lpf_order" = "49" defaults set on
// the option list elsewhere in this file -- confirm which is intended.
250 "-L Perform low pass filtering on input. This option should always \n"
251 " be used in normal processing as it usually increases \n"
252 " performance considerably\n\n"
253 "-P perform peak tracking\n\n"
254 "-fmin <float> minimum F0 value. Sets the minimum allowed F0 in \n"
255 " output track. Default is "+ftoString(DEFAULT_MIN_PITCH)+
".\n "
256 " Changing this to suit the speaker usually increases \n"
257 " performance. Typical recommended values are 60-90Hz for\n"
258 " males and 120-150Hz for females\n\n"
259 "-fmax <float> maximum F0 value. Sets the maximum allowed F0 in \n"
260 " output track. Default is "+ftoString(DEFAULT_MAX_PITCH)+
". \n"
261 " Changing this to suit the speaker usually increases \n"
262 " performance. Typical recommended values are 200Hz for \n"
263 " males and 300-400Hz for females\n\n"
264 "-shift <float> frame spacing in seconds for fixed frame analysis. \n"
265 " This doesn't have to be the same as the output file spacing - \n"
266 " the -S option can be used to resample the track before saving \n"
267 " default: "+ftoString(DEFAULT_SHIFT/1000.0) +
"\n\n"
268 "-length <float> analysis frame length in seconds.\n"
269 " default: "+ftoString(DEFAULT_LENGTH/1000.0) +
"\n\n"
270 "-lpfilter <int> Low pass filter, with cutoff frequency in Hz \n"
271 " Filtering is performed by a FIR filter which is built at run \n"
272 " time. The order of the filter can be given by -forder. The \n"
273 " default value is 199\n\n"
274 "-forder <int> Order of FIR filter used for lpfilter and \n"
275 " hpfilter. This must be ODD. Sensible values range \n"
276 " from 19 (quick but with a shallow rolloff) to 199 \n"
277 " (slow but with a steep rolloff). The default is 199.\n\n";
// User-facing help text for the SRPD-specific options (fragment of a string
// concatenation; the start of the expression is not visible here). The
// <parameter> tags are part of the emitted text and are kept as-is.
285 "-d <float> decimation factor\n"
286 " set down-sampling for quicker computation so that only one in \n"
287 " <parameter>decimation factor</parameter> samples are used in the first instance. \n"
288 " Must be in the range of one to ten inclusive. Default is four. \n"
289 " For data sampled at 10kHz, it is advised that a decimation \n"
290 " factor of two is selected.\n\n"
292 "-n <float> noise floor.\n"
293 " Set the maximum absolute signal amplitude that represents \n"
294 " silence to <parameter>noise floor</parameter>. If the absolute amplitude of \n"
295 " the first segment in a given frame is below this level at all \n"
296 " times, then the frame is classified as representing silence. \n"
297 " Must be a positive number. Default is 120 ADC units.\n\n"
299 "-H <float> unvoiced to voiced coeff threshold\n"
300 " set the correlation coefficient threshold which must be \n"
301 " exceeded in a transition from an unvoiced classified frame \n"
302 " of speech to a voiced frame as the unvoiced to voiced coeff \n"
303 " threshold. Must be in the range zero to one inclusive. \n"
304 " Default is 0.88.\n\n"
306 "-m <float> min voiced to unvoiced coeff threshold \n"
307 " set the minimum allowed correlation coefficient threshold \n"
308 " which must not be exceeded in a transition from a voiced \n"
309 " classified frame of speech to an unvoiced frame, as \n"
310 " <parameter>min voiced to unvoiced coeff threshold</parameter>. Must be in the \n"
311 " range zero to <parameter>unvoiced to voiced coeff threshold</parameter> \n"
312 " inclusive. Default is 0.75.\n\n"
314 "-R <float> voiced to unvoiced coeff threshold-ratio \n"
315 " set the scaling factor used in determining the correlation\n"
316 " coefficient threshold which must not be exceeded in a voiced \n"
317 " frame to unvoiced frame transition, as <parameter>voiced to unvoiced</parameter> \n"
318 " coeff threshold -ratio. The voiced to unvoiced coefficient \n"
319 " threshold is determined by multiplying this scaling factor \n"
320 " with the maximum cross-correlation coefficient of the \n"
321 " previously voiced frame. If this product is less than \n"
322 " <parameter>min voiced to unvoiced coeff threshold</parameter> then this is used \n"
323 " instead. Must be in the range zero to one inclusive. \n"
324 " Default is 0.85.\n\n"
326 "-t <float> anti pitch doubling/halving threshold\n"
327 " set the threshold used in eliminating (as far as possible) \n"
328 " pitch doubling and pitch halving errors as <parameter>anti pitch \n"
329 " double/halving threshold</parameter>. Must be in the range zero to \n"
330 " one inclusive. Default is 0.77.\n\n";