docs/speech_tools-2.4.0/pda_8cc_source.html

/*************************************************************************/

/*                                                                       */

/*                Centre for Speech Technology Research                  */

/*                     University of Edinburgh, UK                       */

/*                      Copyright (c) 1995,1996                          */

/*                        All Rights Reserved.                           */

/*                                                                       */

/*  Permission is hereby granted, free of charge, to use and distribute  */

/*  this software and its documentation without restriction, including   */

/*  without limitation the rights to use, copy, modify, merge, publish,  */

/*  distribute, sublicense, and/or sell copies of this work, and to      */

/*  permit persons to whom this work is furnished to do so, subject to   */

/*  the following conditions:                                            */

/*   1. The code must retain the above copyright notice, this list of    */

/*      conditions and the following disclaimer.                         */

/*   2. Any modifications must be clearly marked as such.                */

/*   3. Original authors' names are not deleted.                         */

/*   4. The authors' names are not used to endorse or promote products   */

/*      derived from this software without specific prior written        */

/*      permission.                                                      */

/*                                                                       */

/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */

/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */

/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */

/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */

/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */

/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */

/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */

/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */

/*  THIS SOFTWARE.                                                       */

/*                                                                       */

/*************************************************************************/

/*                   Author :  Paul Taylor                               */

/*                   Date   :  April 1994                                */

/*************************************************************************/


#include "EST_speech_class.h"

#include "sigpr/EST_sigpr_utt.h"

#include "sigpr/EST_filter.h"

#include "srpd.h"

#include "EST_error.h"

#include "EST_string_aux.h"


int read_next_wave_segment (EST_Wave &sig, struct Srpd_Op *paras,

                SEGMENT_ *p_seg);


static void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize);

static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd);

static void parse_srpd_list(EST_Features &a_list, struct Srpd_Op *srpd);


/* Pitch detection entry point: track the pitch of `sig` into `fz`.
 *
 * sig    - waveform to analyse
 * fz     - track that receives the result
 * op     - option list; its "pda_method" feature is consulted only when
 *          `method` is empty
 * method - name of the algorithm.  An empty name or "srpd" dispatches to
 *          srpd(); any other name is reported through EST_error.
 */
void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
{
    // No explicit method given: fall back to the option list if it has one.
    if (method == "" && op.present("pda_method"))
        method = op.S("pda_method");

    // srpd is both the default and the only algorithm handled here.
    if (method == "" || method == "srpd")
    {
        srpd(sig, fz, op);
    }
    else
    {
        EST_error("Unknown pda %s\n", (const char *)method);
    }
}


/* Intonation contour detection algorithm.
 *
 * Runs the selected pitch tracker over `sig` into a temporary track and
 * then hands that track, together with `speech` and `op`, to
 * smooth_phrase(), which writes into `fz`.
 *
 * `method` is resolved exactly as in pda(): if empty, the "pda_method"
 * feature of `op` is used when present; an empty name or "srpd" selects
 * srpd(); anything else is reported through EST_error.
 */
void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op,
          EST_String method)
{
    EST_Track raw_fz;   // pitch tracker output before smoothing

    if (method == "" && op.present("pda_method"))
        method = op.S("pda_method");

    const bool use_srpd = (method == "") || (method == "srpd");

    if (use_srpd)
    {
        srpd(sig, raw_fz, op);
    }
    else
    {
        EST_error("Unknown pda %s\n", (const char *)method);
    }

    smooth_phrase(raw_fz, speech, op, fz);
}


/* Feature-list front end to the srpd pitch tracker.
 *
 * Builds an Srpd_Op from the built-in defaults, overrides it with whatever
 * `op` provides, optionally low-pass filters `sig` in place (when the
 * "do_low_pass" feature is non-zero, using "lpf_cutoff" and "lpf_order"),
 * and then runs the tracker.  The "srpd_resize" feature (default 0) is
 * passed through as the tracker's `resize` argument.
 */
void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
{
    Srpd_Op params;

    default_srpd_op(&params);       // start from the defaults ...
    parse_srpd_list(op, &params);   // ... then apply the caller's options

    if (op.I("do_low_pass", 0) != 0)
    {
        FIRlowpass_filter(sig, op.I("lpf_cutoff"), op.I("lpf_order"));
    }

    srpd(sig, fz, params, op.I("srpd_resize", 0));
}


/*void do_srpd_fz(EST_Wave &sig, EST_Track &fz)

{

    Srpd_Op srpd_op;

    default_srpd_op(&srpd_op);

    srpd(sig, fz, srpd_op, 1);

}

*/


/* Core srpd driver: runs super_resolution_pda() over successive segments of
 * `sig` and stores one pitch value per output frame in channel 0 of `fz`.
 *
 * sig     - input waveform; its sample rate overwrites srpd_op.sample_freq
 * fz      - output track; must be equally spaced (checked below)
 * srpd_op - analysis parameters; passed by non-const reference to the
 *           set-up routines
 * resize  - when non-zero, `fz` is resized to the computed number of frames
 *           with a single channel named "F0" and its times are filled with
 *           a spacing of srpd_op.shift/1000
 *
 * Frames that are not VOICED are additionally marked with fz.set_break().
 */
void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize)

{

    // j is the index of the next frame of fz to be written.
    int i, rns, tracklen, j = 0;

    SEGMENT_ segment;

    CROSS_CORR_ cc;

    // pda_status holds the result for the current segment; held_status
    // buffers a frame whose output has been deferred (see HOLD below).
    STATUS_ pda_status, held_status;

    srpd_op.sample_freq = sig.sample_rate();

    float min, max;

    // NOTE(review): min and max are saved here but never read again in this
    // function, so the values are not restored -- confirm whether a restore
    // was intended.
    min = srpd_op.min_pitch; // must store as set up routines corrupt

    max = srpd_op.max_pitch;


    initialise_structures (&srpd_op, &segment, &cc);

    initialise_status (&srpd_op, &pda_status);

    initialise_status (&srpd_op, &held_status);


    // Number of frames, computed from the segment length and shift set up
    // above; only used when resizing.
    tracklen = (sig.num_samples() - segment.length) / segment.shift + 1;


    if (resize)

    {

    fz.set_equal_space(true);

    fz.resize(tracklen, 1);

    fz.set_channel_name("F0", 0);

    fz.fill_time(srpd_op.shift/1000);

    }


    if (!fz.equal_space())

    EST_error("Pitch tracking algorithm must have equal spaced track\n");


    // Process segments until read_next_wave_segment() returns 0.
    // NOTE(review): nothing in this loop checks j against the length of fz;
    // confirm that callers passing resize == 0 supply a long enough track.
    while ((rns = read_next_wave_segment (sig, &srpd_op, &segment)) != 0)

    {

    // A return value of 2 skips the analysis: the correlation coefficients
    // are zeroed and the status is reset instead.  (The meaning of 2 is
    // defined by read_next_wave_segment, not visible here.)
    if (rns == 2)

    {

        for (i = 0; i < cc.size; cc.coeff[i++] = 0.0);

        initialise_status (&srpd_op, &pda_status);

    }

    else

        super_resolution_pda (&srpd_op, segment, &cc, &pda_status);

    // HOLD: remember this frame as voiced in held_status and write nothing
    // yet; the decision is taken when the next segment has been analysed.
    if (pda_status.s_h == HOLD)

    {

        held_status.pitch_freq = pda_status.pitch_freq;

        held_status.v_uv = VOICED;

        held_status.s_h = HELD;

        held_status.cc_max = pda_status.cc_max;

        held_status.threshold = pda_status.threshold;

        continue;

    }

    // A frame is pending from the previous iteration: write it out before
    // the current one.  If the current frame is a break, the held frame is
    // turned into an unvoiced break as well.
    if (held_status.s_h == HELD)

    {

        if (pda_status.pitch_freq == BREAK_NUMBER)

        {

        held_status.pitch_freq = BREAK_NUMBER;

        held_status.v_uv = UNVOICED;

        }

        held_status.s_h = SENT;

        if (held_status.v_uv != VOICED)

        fz.set_break(j);

        fz.a(j++) = held_status.pitch_freq;

        //    printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), held_status.pitch_freq );

    }

    // Write the current frame.
    if (pda_status.v_uv != VOICED)

        fz.set_break(j);

    fz.a(j++) = pda_status.pitch_freq;

    //printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), pda_status.pitch_freq );

    }

    // Input ended while a frame was still held: write it as an unvoiced
    // break.
    if (held_status.s_h == HELD)

    {

    held_status.pitch_freq = BREAK_NUMBER;

    held_status.v_uv = UNVOICED;

    fz.set_break(j);

    fz.a(j++) = held_status.pitch_freq;

    }

    // Counterpart of initialise_structures(); presumably releases the
    // buffers in segment and cc -- confirm against the srpd implementation.
    end_structure_use (&segment, &cc);

}


/* Fill *srpd with the compiled-in DEFAULT_* values and return the same
 * pointer.  Peak tracking and ASCII output are switched off.  The Nmax and
 * Nmin members are not given a value here.
 */
static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd)
{
    struct Srpd_Op &defaults = *srpd;

    // Sampling and framing
    defaults.sample_freq = DEFAULT_SF;
    defaults.L           = DEFAULT_DECIMATION;
    defaults.shift       = DEFAULT_SHIFT;
    defaults.length      = DEFAULT_LENGTH;

    // Allowed pitch range
    defaults.min_pitch = DEFAULT_MIN_PITCH;
    defaults.max_pitch = DEFAULT_MAX_PITCH;

    // Thresholds
    defaults.Tsilent    = DEFAULT_TSILENT;
    defaults.Tmin       = DEFAULT_TMIN;
    defaults.Tmax_ratio = DEFAULT_TMAX_RATIO;
    defaults.Thigh      = DEFAULT_THIGH;
    defaults.Tdh        = DEFAULT_TDH;

    // Optional behaviour, off by default
    defaults.make_ascii    = 0;
    defaults.peak_tracking = 0;

    return srpd;
}


/* Override members of *srpd with any matching features present in `al`.
 *
 * Members whose feature is absent are left untouched, so this is meant to
 * be called on a structure that already holds defaults.  The frame shift
 * and frame length features are multiplied by 1000 before being stored.
 *
 * Fix: the unvoiced-to-voiced threshold was guarded by a test for
 * "v2uv_coeff_thresh" but read from "v2uv_coef_thresh" (the spelling that
 * default_pda_options() sets), so Thigh was never overridden.  Both the
 * test and the read now use "v2uv_coef_thresh".
 */
static void parse_srpd_list(EST_Features &al, struct Srpd_Op *srpd)
{
    if (al.present("decimation"))
    {
        srpd->L = al.I("decimation");
    }
    if (al.present("min_pitch"))
    {
        srpd->min_pitch = al.F("min_pitch");
    }
    if (al.present("max_pitch"))
    {
        srpd->max_pitch = al.F("max_pitch");
    }
    if (al.present("pda_frame_shift"))
    {
        srpd->shift = al.F("pda_frame_shift") * 1000.0;
    }
    if (al.present("pda_frame_length"))
    {
        srpd->length = al.F("pda_frame_length") * 1000.0;
    }
    if (al.present("noise_floor"))
    {
        srpd->Tsilent = al.I("noise_floor");
    }
    if (al.present("v2uv_coef_thresh"))
    {
        srpd->Thigh = al.F("v2uv_coef_thresh");
    }
    if (al.present("min_v2uv_coef_thresh"))
    {
        srpd->Tmin = al.F("min_v2uv_coef_thresh");
    }
    if (al.present("v2uv_coef_thresh_ratio"))
    {
        srpd->Tmax_ratio = al.F("v2uv_coef_thresh_ratio");
    }
    if (al.present("anti_doubling_thresh"))
    {
        srpd->Tdh = al.F("anti_doubling_thresh");
    }
    if (al.present("peak_tracking"))
    {
        srpd->peak_tracking = al.I("peak_tracking");
    }
    if (al.present("sample_frequency"))
    {
        srpd->sample_freq = al.I("sample_frequency");
    }
}


/* Populate the feature list `al` with default pitch detection options.
 *
 * Feature names match those read by parse_srpd_list() and srpd(), plus
 * "f0_file_type".  Note that the pitch range, frame shift and low-pass
 * filter settings are set from string literals, while the remaining
 * values come from the DEFAULT_* macros or numeric constants.
 */
void default_pda_options(EST_Features &al)

{

    // Pitch range and frame shift, given as string literals.
    al.set("min_pitch", "40.0");

    al.set("max_pitch", "400.0");

    al.set("pda_frame_shift", "0.005");

    // DEFAULT_LENGTH is divided by 1000 here; parse_srpd_list() multiplies
    // this feature by 1000 again when reading it.
    al.set("pda_frame_length", DEFAULT_LENGTH / 1000.0);

    // Low-pass filter settings read by srpd() when "do_low_pass" is set.
    al.set("lpf_cutoff", "600");

    al.set("lpf_order", "49");

    al.set("f0_file_type", "esps");

    // Remaining srpd parameters.
    al.set("decimation", DEFAULT_DECIMATION);

    al.set("noise_floor", DEFAULT_TSILENT);

    al.set("min_v2uv_coef_thresh", DEFAULT_TMIN);

    al.set("v2uv_coef_thresh_ratio", DEFAULT_TMAX_RATIO);

    al.set("v2uv_coef_thresh", DEFAULT_THIGH);

    al.set("anti_doubling_thresh", DEFAULT_TDH);

    al.set("peak_tracking", 0);

}


/* Return the help text for the general pitch detection command line
 * options (-L, -P, -fmin, -fmax, -shift, -length, -lpfilter, -forder).
 *
 * The defaults quoted for -fmin, -fmax, -shift and -length are formatted
 * from the DEFAULT_* macros; shift and length are divided by 1000 for
 * display.
 *
 * Fixes to the emitted text: "miniumum" -> "minimum", "maxiumum" ->
 * "maximum", and a stray space after the newline in the -fmin entry that
 * indented the following line by one extra column.
 */
EST_String options_pda_general(void)
{
    return
    EST_String("")+
    "-L  Perform low pass filtering on input. This option should always \n"
    "    be used in normal processing as it usually increases \n"
    "    performance considerably\n\n"
    "-P  perform peak tracking\n\n"
    "-fmin <float> minimum F0 value. Sets the minimum allowed F0 in \n"
    "    output track. Default is "+ftoString(DEFAULT_MIN_PITCH)+".\n"
    "    Changing this to suit the speaker usually increases  \n"
    "    performance. Typical recommended values are 60-90Hz for\n"
    "    males and 120-150Hz  for females\n\n"
    "-fmax <float> maximum F0 value. Sets the maximum allowed F0 in \n"
    "    output track. Default is "+ftoString(DEFAULT_MAX_PITCH)+". \n"
    "    Changing this to suit the speaker usually increases \n"
    "    performance. Typical recommended values are 200Hz for \n"
    "    males and 300-400Hz for females\n\n"
    "-shift <float> frame spacing in seconds for fixed frame analysis. \n"
    "    This doesn't have to be the same as the output file spacing - \n"
    "    the -S option can be used to resample the track before saving \n"
    "    default: "+ftoString(DEFAULT_SHIFT/1000.0) +"\n\n"
    "-length <float> analysis frame length in seconds.\n"
    "    default: "+ftoString(DEFAULT_LENGTH/1000.0) +"\n\n"
    "-lpfilter <int>   Low pass filter, with cutoff frequency in Hz \n"
    "    Filtering is performed by a FIR filter which is built at run \n"
    "    time. The order of the filter can be given by -forder. The \n"
    "    default value is 199\n\n"
    "-forder <int>  Order of FIR filter used for lpfilter and \n"
    "    hpfilter. This must be ODD. Sensible values range \n"
    "    from 19 (quick but with a shallow rolloff) to 199 \n"
    "    (slow but with a steep rolloff). The default is 199.\n\n";
}


/* Return the help text for the srpd-specific command line options
 * (-d, -n, -H, -m, -R, -t).  The defaults quoted in the text are written
 * out literally rather than taken from the DEFAULT_* macros.
 *
 * Fixes to the emitted text: "isselected" -> "is selected", and
 * "Inoise floor" -> "noise floor" in both places it occurred (the leading
 * "I" looks like the residue of a stripped italic marker).
 */
EST_String options_pda_srpd(void)
{
    return
    EST_String("")+
    "-d <float> decimation factor\n"
    "    set down-sampling for quicker computation so that only one in \n"
    "    <parameter>decimation factor</parameter> samples are used in the first instance. \n"
    "    Must be in the range of one to ten inclusive. Default is four. \n"
    "    For data sampled at 10kHz, it is advised that a decimation \n"
    "    factor of two is selected.\n\n"

    "-n <float> noise floor.\n"
    "    Set the maximum absolute signal amplitude that represents  \n"
    "    silence to <parameter>noise floor</parameter>. If the absolute amplitude of \n"
    "    the first segment in a given frame is below this level at all \n"
    "    times, then the frame is classified as representing silence. \n"
    "    Must be a positive number. Default is 120 ADC units.\n\n"

    "-H <float> unvoiced to voiced coeff threshold\n"
    "    set the correlation coefficient threshold which must be \n"
    "    exceeded in a transition from an unvoiced classified frame \n"
    "    of speech to a voiced frame as the unvoiced to voiced coeff \n"
    "    threshold. Must be in the range zero to one inclusive. \n"
    "    Default is 0.88.\n\n"

    "-m <float> min voiced to unvoiced coeff threshold \n"
    "    set the minimum allowed correlation coefficient threshold \n"
    "    which must not be exceeded in a transition from a voiced \n"
    "    classified frame of speech to an unvoiced frame, as \n"
    "    <parameter>min voiced to unvoiced coeff threshold</parameter>. Must be in the \n"
    "    range zero to <parameter>unvoiced to voiced coeff threshold</parameter> \n"
    "    inclusive. Default is 0.75.\n\n"

    "-R <float> voiced to unvoiced coeff threshold-ratio  \n"
    "    set the scaling factor used in determining the correlation\n"
    "    coefficient threshold which must not be exceeded in a voiced \n"
    "    frame to unvoiced frame transition, as <parameter>voiced to unvoiced</parameter> \n"
    "    coeff threshold -ratio. The voiced to unvoiced coefficient \n"
    "    threshold is determined by multiplying this scaling factor \n"
    "    with the maximum cross-correlation coefficient of the \n"
    "    previously voiced frame. If this product is less than \n"
    "    <parameter>min voiced to unvoiced coeff threshold</parameter> then this is used \n"
    "    instead. Must be in the range zero to one inclusive. \n"
    "     Default is 0.85.\n\n"

    "-t <float> anti pitch doubling/halving threshold\n"
    "    set the threshold used in eliminating (as far as possible) \n"
    "    pitch doubling and pitch halving errors as <parameter>anti pitch \n"
    "    double/halving threshold</parameter>. Must be in the range zero to \n"
    "    one inclusive. Default is 0.77.\n\n";
}