73 #include "EST_cutils.h"
77 #define MAXSHORT 32767
84 static int zx_lft_N, zx_rht_N;
85 static double prev_pf = BREAK_NUMBER;
87 int n, j, k, N0 = 0, N1, N2, N_, q, lower_found = 0, score = 1, apply_bias;
88 int x_index, y_index, z_index;
89 int zx_rate = 0, zx_at_N0 = 0, prev_sign;
90 int seg1_zxs = 0, seg2_zxs = 0, total_zxs;
91 short prev_seg1, prev_seg2;
92 short x_max = -MAXSHORT, x_min = MAXSHORT;
93 short y_max = -MAXSHORT, y_min = MAXSHORT;
94 double xx = 0.0, yy = 0.0, zz = 0.0, xy = 0.0, yz = 0.0, xz = 0.0;
95 double max_cc = 0.0, coefficient, coeff_weight;
96 double xx_N, yy_N, xy_N, y1y1_N, xy1_N, yy1_N, beta;
97 LIST_ *sig_pks_hd, *sig_pks_tl, *sig_peak, *head, *tail;
99 sig_pks_hd = head = NULL;
100 sig_pks_tl = tail = NULL;
102 if (p_status->v_uv == UNVOICED || p_status->v_uv == SILENT)
103 p_status->threshold = paras->Thigh;
105 p_status->threshold = (paras->Tmin > paras->Tmax_ratio *
106 p_status->cc_max) ? paras->Tmin : paras->Tmax_ratio *
109 if (paras->peak_tracking && prev_pf != BREAK_NUMBER &&
110 p_status->v_uv == VOICED && p_status->s_h != HOLD &&
111 p_status->pitch_freq < 1.75 * prev_pf &&
112 p_status->pitch_freq > 0.625 * prev_pf)
117 prev_seg1 = seg.data[paras->Nmax - paras->Nmin] < 0 ? -1 : 1;
118 prev_seg2 = seg.data[paras->Nmax] < 0 ? -1 : 1;
119 for (j = 0; j < paras->Nmin; j += paras->L) {
121 x_index = paras->Nmax - paras->Nmin + j;
122 y_index = paras->Nmax + j;
123 if (seg.data[x_index] > x_max) x_max = seg.data[x_index];
124 if (seg.data[x_index] < x_min) x_min = seg.data[x_index];
125 if (seg.data[y_index] > y_max) y_max = seg.data[y_index];
126 if (seg.data[y_index] < y_min) y_min = seg.data[y_index];
128 if (seg.data[x_index] * prev_seg1 < 0) {
132 if (seg.data[y_index] * prev_seg2 < 0) {
137 xx += (double) seg.data[x_index] * seg.data[x_index];
138 yy += (
double) seg.data[y_index] * seg.data[y_index];
139 xy += (double) seg.data[x_index] * seg.data[y_index];
142 if (abs (x_max) + abs (x_min) < 2 * paras->Tsilent ||
143 abs (y_max) + abs (y_min) < 2 * paras->Tsilent) {
144 for (q = 0; q < p_cc->size; p_cc->coeff[q++] = 0.0);
145 prev_pf = p_status->pitch_freq;
146 p_status->pitch_freq = BREAK_NUMBER;
147 p_status->v_uv = SILENT;
148 p_status->s_h = SEND;
149 p_status->cc_max = 0.0;
153 p_cc->coeff[0] = p_status->cc_max = xy / sqrt (xx) / sqrt (yy);
154 for (q = 1; q < p_cc->size && q < paras->L; p_cc->coeff[q++] = 0.0);
155 total_zxs = seg1_zxs + seg2_zxs;
156 prev_sign = p_cc->coeff[0] < 0.0 ? -1 : 1;
157 prev_seg1 = seg.data[paras->Nmax - paras->Nmin] < 0 ? -1 : 1;
159 for (n = paras->Nmin + paras->L; n <= paras->Nmax; n += paras->L,
161 x_index = paras->Nmax - n;
162 y_index = paras->Nmax + j;
164 if (seg.data[x_index] * prev_seg1 < 0) {
168 if (seg.data[y_index] * prev_seg2 < 0) {
173 xx += (double) seg.data[x_index] * seg.data[x_index];
174 yy += (
double) seg.data[y_index] * seg.data[y_index];
175 for (k = 0, xy = 0.0; k < n; k += paras->L)
176 xy += (
double) seg.data[paras->Nmax - n + k] * seg.data[paras->Nmax + k];
177 p_cc->coeff[n - paras->Nmin] = xy / sqrt (xx) / sqrt (yy);
178 if (p_cc->coeff[n - paras->Nmin] > p_status->cc_max)
179 p_status->cc_max = p_cc->coeff[n - paras->Nmin];
181 for (q = n - paras->Nmin + 1;
182 q < p_cc->size && q < n - paras->Nmin + paras->L;
183 p_cc->coeff[q++] = 0.0);
185 if (p_cc->coeff[n - paras->Nmin] > p_cc->coeff[n - paras->Nmin - paras->L])
188 if (p_cc->coeff[n - paras->Nmin] * prev_sign < 0.0) {
193 if (N0 != 0 && zx_rate > zx_at_N0) {
194 add_to_list (&sig_pks_hd, &sig_pks_tl, N0, 1);
198 if (apply_bias && n > zx_lft_N && n < zx_rht_N)
202 if (p_cc->coeff[n - paras->Nmin] > max_cc && total_zxs > 3 && lower_found) {
203 max_cc = p_cc->coeff[n - paras->Nmin];
204 if (max_cc * coeff_weight >= p_status->threshold) {
211 if (sig_pks_hd == NULL) {
212 prev_pf = p_status->pitch_freq;
213 p_status->pitch_freq = BREAK_NUMBER;
214 p_status->v_uv = UNVOICED;
215 p_status->s_h = SEND;
219 sig_peak = sig_pks_hd;
220 while (sig_peak != NULL) {
222 for (j = 0; j < sig_peak->N0; j++) {
223 y_index = paras->Nmax + j;
224 z_index = paras->Nmax + sig_peak->N0 + j;
225 yy += (double) seg.data[y_index] * seg.data[y_index];
226 zz += (
double) seg.data[z_index] * seg.data[z_index];
227 yz += (double) seg.data[y_index] * seg.data[z_index];
229 if (yy == 0.0 || zz == 0.0)
232 coefficient = yz / sqrt (yy) / sqrt (zz);
233 if (apply_bias && sig_peak->N0 > zx_lft_N && sig_peak->N0 < zx_rht_N)
237 if (coefficient * coeff_weight >= p_status->threshold) {
245 sig_peak = sig_peak->next_item;
247 if (head == NULL) head = sig_pks_hd;
248 if (tail == NULL) tail = sig_pks_tl;
252 for (j = 0; j < tail->N0; j++)
253 xx += (
double) seg.data[paras->Nmax - tail->N0 + j] *
254 seg.data[paras->Nmax - tail->N0 + j];
256 while (sig_peak != NULL) {
257 if (sig_peak->score == score) {
259 for (j = 0; j < tail->N0; j++) {
260 z_index = paras->Nmax + sig_peak->N0 + j;
261 xz += (double) seg.data[paras->Nmax - tail->N0 + j] *
263 zz += (
double) seg.data[z_index] * seg.data[z_index];
265 coefficient = xz / sqrt (xx) / sqrt (zz);
266 if (sig_peak == head)
267 max_cc = coefficient;
268 else if (coefficient * paras->Tdh > max_cc) {
270 max_cc = coefficient;
273 sig_peak = sig_peak->next_item;
276 p_status->cc_max = p_cc->coeff[N0 - paras->Nmin];
278 if ((tail == head && score == 1 && p_status->v_uv != VOICED) ||
279 p_cc->coeff[N0 - paras->Nmin] < p_status->threshold)
280 p_status->s_h = HOLD;
282 p_status->s_h = SEND;
284 zx_lft_N = zx_rht_N = 0;
285 for (q = N0; q >= paras->Nmin; q -= paras->L)
286 if (p_cc->coeff[q - paras->Nmin] < 0.0) {
290 for (q = N0; q <= paras->Nmax; q += paras->L)
291 if (p_cc->coeff[q - paras->Nmin] < 0.0) {
296 if (N0 - paras->L < paras->Nmin) {
298 N2 = N0 + 2 * paras->L;
300 else if (N0 + paras->L > paras->Nmax) {
301 N1 = N0 - 2 * paras->L;
311 for (j = 0; j < N1; j++) {
312 x_index = paras->Nmax - N1 + j;
313 y_index = paras->Nmax + j;
314 xx += (double) seg.data[x_index] * seg.data[x_index];
315 xy += (
double) seg.data[x_index] * seg.data[y_index];
316 yy += (double) seg.data[y_index] * seg.data[y_index];
318 p_cc->coeff[N1 - paras->Nmin] = p_status->cc_max =
319 xy / sqrt (xx) / sqrt (yy);
321 for (n = N1 + 1; n <= N2; n++, j++) {
322 xx += (double) seg.data[paras->Nmax - n] * seg.data[paras->Nmax - n];
323 yy += (
double) seg.data[paras->Nmax + j] * seg.data[paras->Nmax + j];
324 for (k = 0, xy = 0.0; k < n; k++)
325 xy += (
double) seg.data[paras->Nmax - n + k] * seg.data[paras->Nmax + k];
326 p_cc->coeff[n - paras->Nmin] = xy / sqrt (xx) / sqrt (yy);
327 if (p_cc->coeff[n - paras->Nmin] > p_status->cc_max) {
328 p_status->cc_max = p_cc->coeff[n - paras->Nmin];
334 if (N0 - 1 < paras->Nmin || N0 == N1) N_ = N0;
335 else if (N0 + 1 > paras->Nmax || N0 == N2) N_ = N0 - 1;
336 else if (p_cc->coeff[N0 - paras->Nmin] - p_cc->coeff[N0 - paras->Nmin - 1] <
337 p_cc->coeff[N0 - paras->Nmin] - p_cc->coeff[N0 - paras->Nmin + 1])
341 xx_N = yy_N = xy_N = y1y1_N = xy1_N = yy1_N = 0.0;
342 for (j = 0; j < N_; j++) {
343 x_index = paras->Nmax - N_ + j;
344 y_index = paras->Nmax + j;
345 xx_N += (double) seg.data[x_index] * seg.data[x_index];
346 yy_N += (
double) seg.data[y_index] * seg.data[y_index];
347 xy_N += (double) seg.data[x_index] * seg.data[y_index];
348 y1y1_N += (
double) seg.data[y_index + 1] * seg.data[y_index + 1];
349 xy1_N += (double) seg.data[x_index] * seg.data[y_index + 1];
350 yy1_N += (
double) seg.data[y_index] * seg.data[y_index + 1];
352 beta = (xy1_N * yy_N - xy_N * yy1_N) /
353 (xy1_N * (yy_N - yy1_N) + xy_N * (y1y1_N - yy1_N));
358 else if (beta >= 1.0) {
363 p_status->cc_max = ((1.0 - beta) * xy_N + beta * xy1_N) /
364 sqrt (xx_N * ((1.0 - beta) * (1.0 - beta) * yy_N +
365 2.0 * beta * (1.0 - beta) * yy1_N +
366 beta * beta * y1y1_N));
367 prev_pf = p_status->pitch_freq;
368 p_status->pitch_freq = (double) (paras->sample_freq) / (double) (N_ + beta);
369 p_status->v_uv = VOICED;
370 free_list (&sig_pks_hd);
377 void add_to_list (
LIST_ **p_list_hd,
LIST_ **p_list_tl,
int N_val,
381 LIST_ *new_node, *last_node;
383 new_node = walloc(
LIST_ ,1);
384 last_node = *p_list_tl;
385 new_node->N0 = N_val;
386 new_node->score = score_val;
387 new_node->next_item = NULL;
388 if (*p_list_hd == NULL)
389 *p_list_hd = new_node;
391 last_node->next_item = new_node;
392 *p_list_tl = new_node;
402 void error (error_flags err_type)
407 strcpy (prog,
"srpd");
408 fprintf (stderr,
"%s: ", prog);
411 fprintf (stderr,
"cannot write to output file");
414 fprintf (stderr,
"decimation factor not set");
417 fprintf (stderr,
"insufficient memory available");
423 fprintf (stderr,
"improper fseek () to reposition a stream");
426 fprintf (stderr,
"artificial frame length set out of range");
429 fprintf (stderr,
"maximum pitch frequency value (Hz) not set");
432 fprintf (stderr,
"minimum pitch frequency value (Hz) not set");
435 fprintf (stderr,
"usage: %s -i lpf_sample_file ", prog);
436 fprintf (stderr,
"-o pitch_file [options]\n");
437 fprintf (stderr,
"\nOptions {with default values}\n");
438 fprintf (stderr,
"-a form pitch_file in ascii format\n");
439 fprintf (stderr,
"-l 'lower pitch frequency limit' {%f (Hz)}\n",
441 fprintf (stderr,
"-u 'upper pitch frequency limit' {%f (Hz)}\n",
443 fprintf (stderr,
"-d 'decimation factor' {%d (samples)}\n",
445 fprintf (stderr,
"-n 'noise floor (abs. amplitude)' {%d}\n",
447 fprintf (stderr,
"-h 'unvoiced to voiced coeff threshold' {%f}\n",
449 fprintf (stderr,
"-m 'min. voiced to unvoiced coeff threshold' {%f}\n",
451 fprintf (stderr,
"-r 'voiced to unvoiced coeff threshold ratio' {%f}\n",
453 fprintf (stderr,
"-t 'anti pitch doubling/halving threshold' {%f}\n",
455 fprintf (stderr,
"-p perform peak tracking\n");
456 fprintf (stderr,
"-f 'sampling frequency' {%d (Hz)}\n", DEFAULT_SF);
457 fprintf (stderr,
"-s 'frame shift' {%f (ms)}\n", DEFAULT_SHIFT);
458 fprintf (stderr,
"-w 'artificial frame length' {%f (ms)}\n",
462 fprintf (stderr,
"noise floor set below minimum amplitude");
465 fprintf (stderr,
"attempt to set sampling frequency negative");
468 fprintf (stderr,
"frame shift set out of range");
471 fprintf (stderr,
"anti pitch doubling/halving threshold not set");
474 fprintf (stderr,
"unvoiced to voiced coeff threshold not set");
477 fprintf (stderr,
"voiced to unvoiced coeff threshold ratio not set");
480 fprintf (stderr,
"minimum voiced to unvoiced coeff threshold not set");
483 fprintf (stderr,
"undefined error, %u occurred", err_type);
486 fprintf (stderr,
"\n");
491 void initialise_parameters (
struct Srpd_Op *p_par)
493 p_par->L = DEFAULT_DECIMATION;
494 p_par->min_pitch = DEFAULT_MIN_PITCH;
495 p_par->max_pitch = DEFAULT_MAX_PITCH;
496 p_par->shift = DEFAULT_SHIFT;
497 p_par->length = DEFAULT_LENGTH;
498 p_par->Tsilent = DEFAULT_TSILENT;
499 p_par->Tmin = DEFAULT_TMIN;
500 p_par->Tmax_ratio = DEFAULT_TMAX_RATIO;
501 p_par->Thigh = DEFAULT_THIGH;
502 p_par->Tdh = DEFAULT_TDH;
503 p_par->make_ascii = 0;
504 p_par->peak_tracking = 0;
505 p_par->sample_freq = DEFAULT_SF;
514 p_par->Nmax = (int) ceil((
float)p_par->sample_freq / p_par->min_pitch);
515 p_par->Nmin = (int) floor((
float)p_par->sample_freq / p_par->max_pitch);
516 p_par->min_pitch = (float)p_par->sample_freq / (
float)p_par->Nmax;
517 p_par->max_pitch = (float)p_par->sample_freq / (
float)p_par->Nmin;
519 p_seg->size = 3 * p_par->Nmax;
520 p_seg->shift = (int) rint( p_par->shift / 1000.0 * (
float)p_par->sample_freq );
521 p_seg->length = (int) rint( p_par->length / 1000.0 * (
float)p_par->sample_freq );
522 p_seg->data = walloc(
short,p_seg->size);
524 p_cc->size = p_par->Nmax - p_par->Nmin + 1;
525 p_cc->coeff = walloc(
double,p_cc->size);
534 p_status->pitch_freq = BREAK_NUMBER;
535 p_status->v_uv = SILENT;
536 p_status->s_h = SEND;
537 p_status->cc_max = 0.0;
538 p_status->threshold = paras->Thigh;
556 int read_next_segment (FILE *voxfile,
struct Srpd_Op *paras,
SEGMENT_ *p_seg)
559 static int status = BEGINNING, padding= -1, tracklen = 0;
561 int samples_read = 0;
562 long init_file_position, offset;
564 if (status == BEGINNING) {
566 if (fseek (voxfile, 0L, 2)) error (FILE_SEEK);
567 tracklen = ((ftell (voxfile) /
sizeof (short)) - p_seg->length) /
569 cout <<
"track len " << tracklen;
571 if (paras->Nmax < p_seg->length / 2) {
572 offset = (long) (p_seg->length / 2 - paras->Nmax) *
sizeof (short);
573 if (fseek (voxfile, offset, 1)) error (FILE_SEEK);
577 if ((paras->Nmax - p_seg->length / 2) % p_seg->shift != 0) {
578 offset = (long) (p_seg->shift - ((paras->Nmax - p_seg->length / 2) %
579 p_seg->shift)) *
sizeof (short);
580 if (fseek (voxfile, offset, 1)) error (FILE_SEEK);
582 padding = (paras->Nmax - p_seg->length / 2) / p_seg->shift +
583 ((paras->Nmax - p_seg->length / 2) % p_seg->shift == 0 ? 0 : 1);
586 cout <<
"padding " << padding << endl;
589 else if (tracklen-- <= 0)
594 cout <<
"tl " << tracklen << endl;
595 if (status == MIDDLE_) {
597 init_file_position = ftell (voxfile);
598 offset = (long) (p_seg->shift * sizeof (
short));
599 samples_read = fread ((
short *) p_seg->data, sizeof (
short),
600 p_seg->size, voxfile);
601 if (samples_read == p_seg->size) {
602 if (fseek (voxfile, init_file_position + offset, 0)) error (FILE_SEEK);
625 static int status = BEGINNING, padding = -1, tracklen = 0;
633 if (status == BEGINNING)
639 if (paras->Nmax < p_seg->length / 2)
641 offset = p_seg->length / 2 - paras->Nmax;
647 if ((paras->Nmax - p_seg->length / 2) % p_seg->shift != 0) {
648 offset = p_seg->shift - ((paras->Nmax - p_seg->length / 2)%
652 padding = (paras->Nmax - p_seg->length / 2) / p_seg->shift +
653 ((paras->Nmax - p_seg->length / 2)
654 % p_seg->shift == 0 ? 0 : 1);
659 else if (tracklen-- <= 0) {
668 if (status == MIDDLE_)
672 offset = p_seg->shift;
673 for (i = 0; (i < p_seg->size) && (i+wave_pos)<sig.
num_samples();
675 p_seg->data[i] = sig.
a(i + wave_pos,0);
676 for ( ; i < p_seg->size; ++i)
712 void write_track(
STATUS_ status,
struct Srpd_Op paras, FILE *outfile)
714 if (paras.make_ascii)
716 if (fprintf(outfile,
"%7g\n",status.pitch_freq) != 8)
720 if (!fwrite ((
double *) &status.pitch_freq, sizeof (
double), 1, outfile))
726 void free_list (
LIST_ **p_list_hd)
731 while (*p_list_hd != NULL) {
732 next = (*p_list_hd)->next_item;