docs/speech_tools-2.4.0/wagon__aux_8cc_source.html

/*************************************************************************/

/*                                                                       */

/*                Centre for Speech Technology Research                  */

/*                     University of Edinburgh, UK                       */

/*                      Copyright (c) 1996,1997                          */

/*                        All Rights Reserved.                           */

/*                                                                       */

/*  Permission is hereby granted, free of charge, to use and distribute  */

/*  this software and its documentation without restriction, including   */

/*  without limitation the rights to use, copy, modify, merge, publish,  */

/*  distribute, sublicense, and/or sell copies of this work, and to      */

/*  permit persons to whom this work is furnished to do so, subject to   */

/*  the following conditions:                                            */

/*   1. The code must retain the above copyright notice, this list of    */

/*      conditions and the following disclaimer.                         */

/*   2. Any modifications must be clearly marked as such.                */

/*   3. Original authors' names are not deleted.                         */

/*   4. The authors' names are not used to endorse or promote products   */

/*      derived from this software without specific prior written        */

/*      permission.                                                      */

/*                                                                       */

/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */

/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */

/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */

/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */

/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */

/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */

/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */

/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */

/*  THIS SOFTWARE.                                                       */

/*                                                                       */

/*************************************************************************/

/*                     Author :  Alan W Black                            */

/*                     Date   :  May 1996                                */

/*-----------------------------------------------------------------------*/

/*                                                                       */

/*  Various method functions                                             */

/*=======================================================================*/


#include <cstdlib>

#include <iostream>

#include <cstring>

#include "EST_unix.h"

#include "EST_cutils.h"

#include "EST_Token.h"

#include "EST_Wagon.h"

#include "EST_math.h"


EST_Val WNode::predict(const WVector &d)
{
    // Predict a value for the vector d by descending the tree from
    // this node.  A leaf answers with the value of its own impurity.
    if (leaf())
        return impurity.value();

    // Interior node: a true answer to the question goes left,
    // anything else goes right.
    return question.ask(d) ? left->predict(d) : right->predict(d);
}


WNode *WNode::predict_node(const WVector &d)
{
    // Same descent as predict(), but hands back the leaf node itself
    // rather than the value stored in it.
    if (leaf())
        return this;

    // A true answer to the question goes left, otherwise right.
    return question.ask(d) ? left->predict_node(d)
                           : right->predict_node(d);
}


int WNode::pure(void)
{
    // Returns TRUE when the node has no sub-nodes at all, or when its
    // impurity is of any type other than class; FALSE otherwise.

    const bool childless = (left == 0) && (right == 0);
    if (childless)
        return TRUE;

    // Only consulted for nodes that have at least one sub-node
    return (get_impurity().type() == wnim_class) ? FALSE : TRUE;
}


void WNode::prune(void)
{
    // Collapse splits that make no difference to the prediction.
    //
    // Nodes for which pure() is TRUE are left alone.  Otherwise the
    // sub-nodes are pruned first and then, if both are pure() and both
    // report the same impurity value, they are deleted so that this
    // node becomes a leaf.  Nothing is returned.

    if (pure() != FALSE)
        return;

    // Ok lets try and make it pure
    if (left != 0) left->prune();
    if (right != 0) right->prune();

    // pure() == FALSE only guarantees that at least one sub-node
    // exists, so check both before they are dereferenced below.
    if ((left == 0) || (right == 0))
        return;

    // Have to check purity as well as values to ensure left and right
    // don't further split
    if ((left->pure() == TRUE) && (right->pure() == TRUE) &&
        (left->get_impurity().value() == right->get_impurity().value()))
    {
        delete left; left = 0;
        delete right; right = 0;
    }
}


void WNode::held_out_prune()
{
    // Prune the tree using the (held out) data attached to each node.
    // If this node's question does not improve on the node's own
    // impurity for that data, everything below the node is removed.

    // Rescore this node against the data it currently holds
    set_impurity(WImpurity(get_data()));

    if (left == 0)
        return;                 // no sub-nodes, nothing to decide

    wgn_score_question(question,get_data());

    if (!(question.get_score() < get_impurity().measure()))
    {
        // The split is not worth keeping: drop both sub-nodes
        delete left; left = 0;
        delete right; right = 0;
        return;
    }

    // The split helps, so partition the data between the sub-nodes
    // and repeat the test one level down
    wgn_find_split(question,get_data(),
                   left->get_data(),
                   right->get_data());
    left->held_out_prune();
    right->held_out_prune();
}


void WNode::print_out(ostream &s, int margin)
{
    // Write this node as a bracketed expression on a fresh line,
    // indented by margin spaces; sub-nodes are written one space deeper.

    s << endl;
    int col = 0;
    while (col < margin)
    {
        s << " ";
        col++;
    }

    s << "(";
    if (left != 0)
    {
        // Interior node: the question, then the two sub-trees
        s << question;
        left->print_out(s,margin+1);
        right->print_out(s,margin+1);
    }
    else
        s << impurity;          // base case
    s << ")";
}


ostream & operator <<(ostream &s, WNode &n)
{
    // Print the node and everything below it (starting with no
    // indentation), then finish the line.

    n.print_out(s,0);
    return s << endl;
}


void WDataSet::ignore_non_numbers()
{
    /* For ols we want to ignore anything that is categorical: every   */
    /* field whose type is neither binary nor float is marked ignored. */
    /* Fields that are binary or float keep their current flag.        */

    for (int field=0; field<dlength; field++)
    {
        const bool numeric = (p_type[field] == wndt_binary) ||
                             (p_type[field] == wndt_float);
        if (!numeric)
            p_ignore[field] = TRUE;
    }
}


void WDataSet::load_description(const EST_String &fname, LISP ignores)

{

    // Initialise a dataset with sizes and types
    //
    // Reads a field description from the file fname and, for each
    // entry in it, records the field's name (p_name), type (p_type)
    // and ignore flag (p_ignore).  dlength is set to the number of
    // entries.  Fields whose name appears in the list ignores are
    // marked as ignored.
    //
    // Also sets the globals wgn_predictee and wgn_count_field, and
    // may add new definitions to wgn_discretes.
    //
    // Calls wagon_error() for an unknown type name, when the predictee
    // would be ignored, and when a named predictee is not found.

    EST_String tname;

    int i;

    LISP description,d;


    // The description is the first element of what vload() returns
    description = car(vload(fname,1));

    dlength = siod_llength(description);


    p_type.resize(dlength);

    p_ignore.resize(dlength);

    p_name.resize(dlength);


    if (wgn_predictee_name == "")

    wgn_predictee = 0;  // default predictee is first field

    else

    wgn_predictee = -1;  // must be found by name in the loop below


    for (i=0,d=description; d != NIL; d=cdr(d),i++)

    {

    // First item of each entry is the field name, second is the type
    // name (or the first value of an explicit value list, see below)
    p_name[i] = get_c_string(car(car(d)));

    tname = get_c_string(car(cdr(car(d))));

    p_ignore[i] = FALSE;

    if ((wgn_predictee_name != "") && (wgn_predictee_name == p_name[i]))

        wgn_predictee = i;

    if ((wgn_count_field_name != "") &&

        (wgn_count_field_name == p_name[i]))

        wgn_count_field = i;

    if ((tname == "count") || (i == wgn_count_field))

    {

        // The count must be ignored, repeat it if you want it too

        p_type[i] = wndt_ignore;  // the count must be ignored

        p_ignore[i] = TRUE;

        wgn_count_field = i;

    }

    else if ((tname == "ignore") || (siod_member_str(p_name[i],ignores)))

    {

        p_type[i] = wndt_ignore;  // user specified ignore

        p_ignore[i] = TRUE;

        if (i == wgn_predictee)

        wagon_error(EST_String("predictee \"")+p_name[i]+

                "\" can't be ignored \n");

    }

    else if (siod_llength(car(d)) > 2)

    {

        // More than two items in the entry: everything after the
        // name is registered with wgn_discretes as a list of strings,
        // and the value returned by def() becomes the field's type

        LISP rest = cdr(car(d));

        EST_StrList sl;

        siod_list_to_strlist(rest,sl);

        p_type[i] = wgn_discretes.def(sl);

        // A leading "_other_" is passed to def_val() on the new
        // discrete
        if (streq(get_c_string(car(rest)),"_other_"))

        wgn_discretes[p_type[i]].def_val("_other_");

    }

    // Otherwise the second item must be one of the known type names
    else if (tname == "binary")

        p_type[i] = wndt_binary;

    else if (tname == "cluster")

        p_type[i] = wndt_cluster;

    else if (tname == "vector")

        p_type[i] = wndt_vector;

    else if (tname == "trajectory")

        p_type[i] = wndt_trajectory;

    else if (tname == "ols")

        p_type[i] = wndt_ols;

    else if (tname == "matrix")

        p_type[i] = wndt_matrix;

    else if (tname == "float")

        p_type[i] = wndt_float;

    else

    {

        wagon_error(EST_String("Unknown type \"")+tname+

            "\" for field number "+itoString(i)+

                        "/"+p_name[i]+" in description file \""+fname+"\"");

    }

    }


    // Still -1 means a predictee name was given but no field had it
    if (wgn_predictee == -1)

    {

    wagon_error(EST_String("predictee field \"")+wgn_predictee_name+

            "\" not found in description ");

    }

}


const int WQuestion::ask(const WVector &w) const
{
    // Ask this question of the given vector.  Returns TRUE or FALSE;
    // an operator not listed below is reported through wagon_error().

    switch (op)
    {
      case wnop_equal:          // for numbers
        return (w.get_flt_val(feature_pos) == operand1.Float())
            ? TRUE : FALSE;

      case wnop_binary:         // for numbers
        return (w.get_int_val(feature_pos) == 1) ? TRUE : FALSE;

      case wnop_greaterthan:
        return (w.get_flt_val(feature_pos) > operand1.Float())
            ? TRUE : FALSE;

      case wnop_lessthan:
        return (w.get_flt_val(feature_pos) < operand1.Float())
            ? TRUE : FALSE;

      case wnop_is:             // for classes
        return (w.get_int_val(feature_pos) == operand1.Int())
            ? TRUE : FALSE;

      case wnop_in:             // for subsets -- note operand is list of ints
        return ilist_member(operandl,w.get_int_val(feature_pos))
            ? TRUE : FALSE;

      default:
        wagon_error("Unknown test operator");
    }

    // Only reached if wagon_error() returns
    return FALSE;
}


ostream& operator<<(ostream& s, const WQuestion &q)
{
    // Write the question as "(FEATURE OP OPERAND)".  Discrete values
    // are written by name, and names containing any character matched
    // by needquotes are written through quote_string().

    EST_String label;
    static EST_Regex needquotes(".*[()'\";., \t\n\r].*");

    s << "(" << wgn_dataset.feat_name(q.get_fp());

    switch (q.get_op())
    {
      case wnop_binary:
        // the feature name alone is the whole question
        break;

      case wnop_equal:
        s << " = " << q.get_operand1().string();
        break;

      case wnop_greaterthan:
        s << " > " << q.get_operand1().Float();
        break;

      case wnop_lessthan:
        s << " < " << q.get_operand1().Float();
        break;

      case wnop_is:
        label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
            name(q.get_operand1().Int());
        s << " is ";
        if (!label.matches(needquotes))
            s << label;
        else
            s << quote_string(label,"\"","\\",1);
        break;

      case wnop_matches:
        // always quoted, whatever the name contains
        label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
            name(q.get_operand1().Int());
        s << " matches " << quote_string(label,"\"","\\",1);
        break;

      case wnop_in:
        s << " in (";
        for (int k=0; k < q.get_operandl().length(); k++)
        {
            label = wgn_discretes[wgn_dataset.ftype(q.get_fp())].
                name(q.get_operandl().nth(k));
            if (!label.matches(needquotes))
                s << label;
            else
                s << quote_string(label,"\"","\\",1);
            s << " ";           // every item is followed by a space
        }
        s << ")";
        break;

        // SunCC wont let me add this
//      default:
//  s << " unknown operation ";
    }

    s << ")";
    return s;
}


EST_Val WImpurity::value(void)
{
    // Returns the recommended value for this impurity: the most
    // probable class for class impurities, the mean of a for every
    // other type that has been set.  An unset impurity is reported on
    // cerr and yields 0.0.

    if (t==wnim_unset)
    {
        cerr << "WImpurity: no value currently set\n";
        return EST_Val(0.0);
    }

    if (t==wnim_class)
    {
        double prob;            // filled in by most_probable(), unused here
        return EST_Val(p.most_probable(&prob));
    }

    // cluster, ols (TBA), vector, trajectory (not yet written) and
    // everything else all report the same thing
    return EST_Val(a.mean());
}


double WImpurity::samples(void)
{
    // Number of samples behind this impurity; how it is counted
    // depends on the impurity type.  Types not listed give 0.

    switch (t)
    {
      case wnim_float:
        return a.samples();

      case wnim_class:
        return (int)p.samples();    // truncated to a whole number

      case wnim_cluster:
      case wnim_ols:
      case wnim_vector:
      case wnim_trajectory:
        return members.length();

      default:
        return 0;
    }
}


WImpurity::WImpurity(const WVectorVector &ds)
{
    // Build an impurity from every vector in ds.
    //
    // For each row the predictee field is cumulated, weighted by the
    // count field when one is defined (wgn_count_field != -1) and by 1
    // otherwise.  When the predictee is of type ols the row index is
    // cumulated instead of the field value.

    t=wnim_unset;
    a.reset(); trajectory=0; l=0; width=0;
    data = &ds;  // for ols, model calculation

    for (int i=0; i < ds.n(); i++)
    {
        // For ols, cumulate() stores its first argument in members and
        // part_to_ols_data() later uses those entries as row indices
        // into ds, so every row -- including row 0 -- must pass its
        // index.  Testing t == wnim_ols here would miss row 0, since t
        // is still wnim_unset until the first cumulate() call.
        if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
            cumulate(i,1);
        else if (wgn_count_field == -1)
            cumulate((*(ds(i)))[wgn_predictee],1);
        else
            cumulate((*(ds(i)))[wgn_predictee],
                     (*(ds(i)))[wgn_count_field]);
    }
}


float WImpurity::measure(void)
{
    // Impurity score for the data cumulated so far; the calculation
    // depends on the impurity type.  A type not listed below is
    // reported on cerr and scores 0.0.

    switch (t)
    {
      case wnim_float:
      case wnim_matrix:
        return a.variance()*a.samples();

      case wnim_vector:
        return vector_impurity();

      case wnim_trajectory:
        return trajectory_impurity();

      case wnim_class:
        return p.entropy()*p.samples();

      case wnim_cluster:
        return cluster_impurity();

      case wnim_ols:
        return ols_impurity();  /* RMSE for OLS model */

      default:
        cerr << "WImpurity: can't measure unset object" << endl;
        return 0.0;
    }
}


float WImpurity::vector_impurity()

{

    // Find the mean/stddev for all values in all vectors

    // sum the variances and multiply them by the number of members
    //
    // Only the first "#if 1" section below is compiled.  For every
    // channel j with wgn_VertexFeats.a(0,j) > 0.0 it gathers the
    // count-weighted values wgn_VertexTrack.a(i,j) of all members i,
    // and adds the standard deviation of that channel to a.  The
    // result is a.mean() multiplied by count, where count is the
    // samples() total of the last channel processed (or 1 if no
    // channel qualified).
    //
    // Side effect: a is reset and refilled on every call.

    EST_Litem *pp;

    EST_Litem *countpp;

    int i,j;

    EST_SuffStats b;

    double count = 1;


    a.reset();

#if 1

    /* simple distance */

    for (j=0; j<wgn_VertexFeats.num_channels(); j++)

    {

        if (wgn_VertexFeats.a(0,j) > 0.0)

        {

            b.reset();

            // members and member_counts are walked in step; the loop
            // only tests pp, so member_counts is assumed to have at
            // least as many items as members
            for (pp=members.head(), countpp=member_counts.head(); pp != 0; pp=pp->next(), countpp=countpp->next())

            {

                i = members.item(pp);


        // Accumulate the value with count

                b.cumulate(wgn_VertexTrack.a(i,j), member_counts.item(countpp)) ;

            }

            a += b.stddev();

            count = b.samples();

        }

    }

#endif


    // Disabled alternative: per-channel means, then for each member
    // the smallest summed distance over a range of channel offsets
    // (offsets and channel range are hardcoded)
#if 0

    EST_SuffStats *c;

    float x, lshift, rshift, ushift;

    /* Find base mean, then measure do fshift to find best match */

    c = new EST_SuffStats[wgn_VertexTrack.num_channels()+1];

    for (j=0; j<wgn_VertexFeats.num_channels(); j++)

    {

        if (wgn_VertexFeats.a(0,j) > 0.0)

        {

            c[j].reset();

            for (pp=members.head(), countpp=member_counts.head(); pp != 0;

                 pp=pp->next(), countpp=countpp->next())

            {

                i = members.item(pp);

        // Accumulate the value with count

                c[j].cumulate(wgn_VertexTrack.a(i,j),member_counts.item(countpp));

            }

            count = c[j].samples();

        }

    }


    /* Pass through again but vary the num_channels offset (hardcoded) */

    for (pp=members.head(), countpp=member_counts.head(); pp != 0;

         pp=pp->next(), countpp=countpp->next())

    {

        int q;

        float bshift, qshift;

        /* For each sample */

        i = members.item(pp);

        /* Find the value left shifted, unshifted, and right shifted */

        lshift = 0; ushift = 0; rshift = 0;

        bshift = 0;

        for (q=-20; q<=20; q++)

        {

            qshift = 0;

            for (j=67+q; j<147+q/*hardcoded*/; j++)

            {

                x = c[j].mean() - wgn_VertexTrack(i,j);

                qshift += sqrt(x*x);

                if ((bshift > 0) && (qshift > bshift))

                    break;

            }

            if ((bshift == 0) || (qshift < bshift))

                bshift = qshift;

        }

        a += bshift;

    }


#endif


    // Disabled alternative: statistics over pairs of channels
#if 0

    /* full covariance */

    /* worse in listening experiments */

    EST_SuffStats **cs;

    int mmm;

    cs = new EST_SuffStats *[wgn_VertexTrack.num_channels()+1];

    for (j=0; j<=wgn_VertexTrack.num_channels(); j++)

        cs[j] = new EST_SuffStats[wgn_VertexTrack.num_channels()+1];

    /* Find means for diagonal */

    for (j=0; j<wgn_VertexFeats.num_channels(); j++)

    {

        if (wgn_VertexFeats.a(0,j) > 0.0)

        {

            for (pp=members.head(); pp != 0; pp=pp->next())

                cs[j][j] += wgn_VertexTrack.a(members.item(pp),j);

        }

    }

    for (j=0; j<wgn_VertexFeats.num_channels(); j++)

    {

        for (i=j+1; i<wgn_VertexFeats.num_channels(); i++)

            if (wgn_VertexFeats.a(0,j) > 0.0)

            {

                for (pp=members.head(); pp != 0; pp=pp->next())

                {

                    mmm = members.item(pp);

                    cs[i][j] += (wgn_VertexTrack.a(mmm,i)-cs[j][j].mean())*

                        (wgn_VertexTrack.a(mmm,j)-cs[j][j].mean());

                }

            }

    }

    for (j=0; j<wgn_VertexFeats.num_channels(); j++)

    {

        for (i=j+1; i<wgn_VertexFeats.num_channels(); i++)

            if (wgn_VertexFeats.a(0,j) > 0.0)

                a += cs[i][j].stddev();

    }

    count = cs[0][0].samples();

#endif


    // Disabled alternative: distance between every pair of members
#if 0

    // look at mean euclidean distance between vectors

    EST_Litem *qq;

    int x,y;

    double d,q;

    count = 0;

    for (pp=members.head(); pp != 0; pp=pp->next())

    {

        x = members.item(pp);

        count++;

        for (qq=pp->next(); qq != 0; qq=qq->next())

        {

            y = members.item(qq);

            for (q=0.0,j=0; j<wgn_VertexFeats.num_channels(); j++)

                if (wgn_VertexFeats.a(0,j) > 0.0)

                {

                    d = wgn_VertexTrack(x,j)-wgn_VertexTrack(y,j);

                    q += d*d;

                }

            a += sqrt(q);

        }


    }

#endif


    // This is sum of stddev*samples

    return a.mean() * count;

}


WImpurity::~WImpurity()
{
    // Free the trajectory table if one was built: l rows, each an
    // array allocated with new[], plus the array of row pointers.

    if (trajectory == 0)
        return;

    for (int row=0; row<l; row++)
        delete [] trajectory[row];
    delete [] trajectory;

    trajectory = 0;
    l = 0;
}


float WImpurity::trajectory_impurity()

{

    // Find the mean length of all the units in the cluster

    // Create that number of points

    // Interpolate each unit to that number of points

    // collect means and standard deviations for each point

    // impurity is sum of the variance for each point and each coef

    // multiplied by the number of units.
    //
    // The result is stored in score and the per-point statistics in
    // trajectory (l rows); once trajectory is non-null later calls
    // return the stored score without recomputing.
    //
    // As used below, wgn_UnitTrack.a(i,0) is the first frame of unit i
    // in wgn_VertexTrack and wgn_UnitTrack.a(i,1) is its number of
    // frames.  A frame whose channel 0 is -1.0 marks the centre of a
    // unit; if any unit has one the "OLA model" branch is taken.

    EST_Litem *pp;

    int i, j;

    int s, ti, ni, q;

    int s1l, s2l;

    double n, m, m1, m2, w;

    EST_SuffStats lss, stdss;

    EST_SuffStats l1ss, l2ss;

    int l1, l2;

    int ola=0;


    if (trajectory != 0)

    {   /* already done this */

        return score;

    }


    // First pass: collect unit lengths (lss), the lengths before and
    // after the centre marker (l1ss, l2ss), and note in ola whether
    // any marker was seen.  l ends up as the longest unit length, but
    // is overwritten in both branches below.
    lss.reset();

    l = 0;

    for (pp=members.head(); pp != 0; pp=pp->next())

    {

        i = members.item(pp);

        for (q=0; q<wgn_UnitTrack.a(i,1); q++)

        {

            ni = (int)wgn_UnitTrack.a(i,0)+q;

            if (wgn_VertexTrack.a(ni,0) == -1.0)

            {

                l1ss += q;

                ola = 1;

                break;

            }

        }

        if (q==wgn_UnitTrack.a(i,1))

        {   /* can't find -1 center point, so put all in l2 */

            l1ss += 0;

            l2ss += q;

        }

        else

            l2ss += wgn_UnitTrack.a(i,1) - (q+1) - 1;

        lss += wgn_UnitTrack.a(i,1); /* length of each unit in the cluster */

        if (wgn_UnitTrack.a(i,1) > l)

            l = (int)wgn_UnitTrack.a(i,1);

    }


    if (ola==0)  /* no -1's so its not an ola type cluster */

    {

        // Number of trajectory points: mean unit length, at least 7
        l = ((int)lss.mean() < 7) ? 7 : (int)lss.mean();


        /* a list of SuffStats on for each point in the trajectory */

        trajectory = new EST_SuffStats *[l];

        width = wgn_VertexTrack.num_channels()+1;

        for (j=0; j<l; j++)

            trajectory[j] = new EST_SuffStats[width];


        for (pp=members.head(); pp != 0; pp=pp->next())

        {   /* for each unit */

            i = members.item(pp);

            m = (float)wgn_UnitTrack.a(i,1)/(float)l; /* find interpolation */

            s = (int)wgn_UnitTrack.a(i,0); /* start point */

            // n advances by m per trajectory point, so the unit's
            // frames are sampled evenly across the l points
            for (ti=0,n=0.0; ti<l; ti++,n+=m)

            {

                ni = (int)n;  // hmm floor or nint ??

                for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                {

                    if (wgn_VertexFeats.a(0,j) > 0.0)

                        trajectory[ti][j] += wgn_VertexTrack.a(s+ni,j);

                }

            }

        }


        /* find sum of sum of stddev for all coefs of all traj points */

        stdss.reset();

        for (ti=0; ti<l; ti++)

            for (j=0; j<wgn_VertexFeats.num_channels(); j++)

            {

                if (wgn_VertexFeats.a(0,j) > 0.0)

                    stdss += trajectory[ti][j].stddev();

            }


        // This is sum of all stddev * samples

        score = stdss.mean() * members.length();

    }

    else

    {   /* OLA model */

        // Each half gets its mean length in points, at least 10; the
        // two extra rows are the centre marker (row l1) and an end
        // marker (row l-1).
        // NOTE(review): width is not updated in this branch -- confirm
        // whether other code relies on it after an OLA calculation.
        l1 = (l1ss.mean() < 10.0) ? 10 : (int)l1ss.mean();

        l2 = (l2ss.mean() < 10.0) ? 10 : (int)l2ss.mean();

        l = l1 + l2 + 1 + 1;


        /* a list of SuffStats on for each point in the trajectory */

        trajectory = new EST_SuffStats *[l];

        for (j=0; j<l; j++)

            trajectory[j] = new EST_SuffStats[wgn_VertexTrack.num_channels()+1];


        for (pp=members.head(); pp != 0; pp=pp->next())

        {   /* for each unit */

            i = members.item(pp);

            s1l = 0;

            s = (int)wgn_UnitTrack.a(i,0); /* start point */

            // s1l: number of frames before the centre marker (stays 0
            // if the unit has no marker)
            for (q=0; q<wgn_UnitTrack.a(i,1); q++)

                if (wgn_VertexTrack.a(s+q,0) == -1.0)

                {

                    s1l = q; /* printf("awb q is -1 at %d\n",q); */

                    break;

                }

            // s2l: unit length less the first half and two further
            // frames
            s2l = (int)wgn_UnitTrack.a(i,1) - (s1l + 2);

            m1 = (float)(s1l)/(float)l1; /* find interpolation step */

            m2 = (float)(s2l)/(float)l2; /* find interpolation step */

            /* First half */

            for (ti=0,n=0.0; s1l > 0 && ti<l1; ti++,n+=m1)

            {

                // clamp to the last frame of the first half
                ni = s + (((int)n < s1l) ? (int)n : s1l - 1);

                for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                    if (wgn_VertexFeats.a(0,j) > 0.0)

                        trajectory[ti][j] += wgn_VertexTrack.a(ni,j);

            }

            ti = l1; /* do it explicitly in case s1l < 1 */

            // row l1 accumulates the value -1 for every unit
            for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                if (wgn_VertexFeats.a(0,j) > 0.0)

                    trajectory[ti][j] += -1;

            /* Second half */

            s += s1l+1;

            for (ti++,n=0.0; s2l > 0 && ti<l-1; ti++,n+=m2)

            {

                // clamp to the last frame of the second half
                ni = s + (((int)n < s2l) ? (int)n : s2l - 1);

                for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                    if (wgn_VertexFeats.a(0,j) > 0.0)

                        trajectory[ti][j] += wgn_VertexTrack.a(ni,j);

            }

            // The row the loop stopped on accumulates -2.
            // NOTE(review): when s2l <= 0 the loop body never runs and
            // ti is l1+1 here rather than l-1 -- confirm intended.
            for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                if (wgn_VertexFeats.a(0,j) > 0.0)

                    trajectory[ti][j] += -2;

        }


        /* find sum of sum of stddev for all coefs of all traj points */

        /* windowing the sums with a triangular weight window         */

        stdss.reset();

        // weight rises from 0 in steps of 1/l1 over the first half
        m = 1.0/(float)l1;

        for (w=0.0,ti=0; ti<l1; ti++,w+=m)

            for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                if (wgn_VertexFeats.a(0,j) > 0.0)

                stdss += trajectory[ti][j].stddev() * w;

        // ti++ skips row l1; weight then falls from 1 in steps of 1/l2
        m = 1.0/(float)l2;

        for (w=1.0,ti++; ti<l-1; ti++,w-=m)

            for (j=0; j<wgn_VertexFeats.num_channels(); j++)

                if (wgn_VertexFeats.a(0,j) > 0.0)

                    stdss += trajectory[ti][j].stddev() * w;


        // This is sum of all stddev * samples

        score = stdss.mean() * members.length();

    }

    return score;

}


static void part_to_ols_data(EST_FMatrix &X, EST_FMatrix &Y,
                             EST_IVector &included,
                             EST_StrList &feat_names,
                             const EST_IList &members,
                             const WVectorVector &d)
{
    // Copy the rows of d named by members into the matrices used by
    // the ols routines.
    //
    // Y receives field 0 of each row.  X receives a constant 1 in
    // column 0 (named "Intercept") followed by every field of type
    // wndt_float, in field order.  feat_names gets one name per X
    // column and included gets TRUE for each of them.  Negative member
    // indices are skipped.  On return X, Y and included are shrunk to
    // the rows and columns actually filled.

    const int n_fields = wgn_dataset.width();
    int row = 0;                // next row of X/Y to fill
    int col = 0;                // columns of X filled for the latest row

    included.resize(n_fields);
    X.resize(members.length(),n_fields);
    Y.resize(members.length(),1);
    feat_names.append("Intercept");
    included[0] = TRUE;

    for (EST_Litem *it=members.head(); it; it=it->next())
    {
        const int idx = members.item(it);
        if (idx < 0)
            continue;           // takes up no row in the output

        WVector *vec = d(idx);
        Y.a_no_check(row,0) = (*vec)[0];
        X.a_no_check(row,0) = 1;

        col = 1;
        for (int field=1; field < n_fields; field++)
        {
            if (wgn_dataset.ftype(field) != wndt_float)
                continue;

            if (row == 0)       // names are only collected once
                feat_names.append(wgn_dataset.feat_name(field));
            X.a_no_check(row,col) = (*vec)[field];
            included.a_no_check(col) = TRUE;
            col++;
        }
        row++;
    }

    included.resize(col);
    X.resize(row,col);
    Y.resize(row,1);
}


float WImpurity::ols_impurity()
{
    // Fit an OLS model to the members of this impurity and test it
    // against the same data.  The score is (1 - correlation) times the
    // number of members.  WGN_HUGE_VAL is returned when no model can
    // be fitted or when the first coefficient exceeds 10000 in
    // magnitude.  One line of diagnostics is printed per fitted model.

    EST_FMatrix X,Y;
    EST_IVector included;
    EST_StrList feat_names;

    // Load the sample members into matrices for ols
    part_to_ols_data(X,Y,included,feat_names,members,*data);

    // A stepwise search for the best model (stepwise_ols) was tried
    // here and found far too computationally expensive, so a single
    // non stepwise model is fitted instead.
    EST_FMatrix coeffsl;
    if (!robust_ols(X,Y,included,coeffsl))
        return WGN_HUGE_VAL;    // couldn't find a model

    EST_FMatrix pred;
    float cor,rmse;
    ols_apply(X,coeffsl,pred);
    ols_test(Y,pred,cor,rmse);
    const float best_score = cor;

    printf("Impurity OLS X(%d,%d) Y(%d,%d) %f, %f, %f\n",
           X.num_rows(),X.num_columns(),Y.num_rows(),Y.num_columns(),
           rmse,cor,
           1-best_score);

    // Reject models with a weird sized Intercept
    if (fabs(coeffsl[0]) > 10000)
        return WGN_HUGE_VAL;

    return (1-best_score) *members.length();
}


float WImpurity::cluster_impurity()

{

    // Find the mean distance between all members of the dataset

    // Uses the global DistMatrix for distances between members of

    // the cluster set.  Distances are assumed to be symmetric thus only

    // the bottom half of the distance matrix is filled

    EST_Litem *pp, *q;

    int i,j;

    double dist;


    a.reset();

    for (pp=members.head(); pp != 0; pp=pp->next())

    {

    i = members.item(pp);

    for (q=pp->next(); q != 0; q=q->next())

    {

        j = members.item(q);

        dist = (j < i ? wgn_DistMatrix.a_no_check(i,j) :

                    wgn_DistMatrix.a_no_check(j,i));

        a+=dist;  // cumulate for whole cluster

    }

    }


    // This is sum distance between cross product of members

//    return a.sum();

    if (a.samples() > 1)

        return a.stddev() * a.samples();

    else

        return 0.0;

}


float WImpurity::cluster_distance(int i)
{
    // How far unit i's mean distance to the cluster members lies from
    // the mean of a, in absolute standard deviations of a.

    float dist = cluster_member_mean(i);
    float mdist = dist-a.mean();

    if (mdist != 0.0)
        return fabs((dist-a.mean())/a.stddev());

    // exactly on the mean
    return 0.0;
}


int WImpurity::in_cluster(int i)

{

    // Would this be a member of this cluster?.  Returns 1 if

    // its distance is less than at least one other

    float dist = cluster_member_mean(i);

    EST_Litem *pp;


    for (pp=members.head(); pp != 0; pp=pp->next())

    {

    if (dist < cluster_member_mean(members.item(pp)))

        return 1;

    }

    return 0;

}


float WImpurity::cluster_ranking(int i)

{

    // Position in ranking closest to centre

    float dist = cluster_distance(i);

    EST_Litem *pp;

    int ranking = 1;


    for (pp=members.head(); pp != 0; pp=pp->next())

    {

    if (dist >= cluster_distance(members.item(pp)))

        ranking++;

    }


    return ranking;

}


float WImpurity::cluster_member_mean(int i)

{

    // Returns the mean difference between this member and all others

    // in cluster

    EST_Litem *q;

    int j,n;

    double dist,sum;


    for (sum=0.0,n=0,q=members.head(); q != 0; q=q->next())

    {

    j = members.item(q);

    if (i != j)

    {

        dist = (j < i ? wgn_DistMatrix(i,j) : wgn_DistMatrix(j,i));

        sum += dist;

        n++;

    }

    }


    return ( n == 0 ? 0.0 : sum/n );

}


void WImpurity::cumulate(const float pv,double count)
{
    // Add one sample (value pv, weight count) to this impurity.  The
    // field type of the predictee decides both the impurity type t and
    // which accumulator receives the sample:
    //   cluster / ols / trajectory : (int)pv is appended to members
    //   vector                     : as above, plus count in member_counts
    //   class (type >= wndt_class) : p, initialised on first use from the
    //                                discrete for that type
    //   binary / float             : the statistics in a
    // Any other type is reported through wagon_error().
    // NOTE(review): for the member-list types pv presumably carries a
    // unit/row index rather than a measurement -- confirm with callers.
    const int ptype = wgn_dataset.ftype(wgn_predictee);

    if (ptype == wndt_cluster)
    {
        t = wnim_cluster;
        members.append(static_cast<int>(pv));
        return;
    }

    if (ptype == wndt_ols)
    {
        t = wnim_ols;
        members.append(static_cast<int>(pv));
        return;
    }

    if (ptype == wndt_vector)
    {
        t = wnim_vector;
        // AUP: vectors keep a count alongside each member
        members.append(static_cast<int>(pv));
        member_counts.append(static_cast<float>(count));
        return;
    }

    if (ptype == wndt_trajectory)
    {
        t = wnim_trajectory;
        members.append(static_cast<int>(pv));
        return;
    }

    // Tested before binary/float, exactly as in the original ordering
    if (ptype >= wndt_class)
    {
        if (t == wnim_unset)
            p.init(&wgn_discretes[ptype]);
        t = wnim_class;
        p.cumulate(static_cast<int>(pv),count);
        return;
    }

    if (ptype == wndt_binary)
    {
        t = wnim_float;
        a.cumulate(static_cast<int>(pv),count);   // value truncated to int
        return;
    }

    if (ptype == wndt_float)
    {
        t = wnim_float;
        a.cumulate(pv,count);
        return;
    }

    wagon_error("WImpurity: cannot cumulate EST_Val type");
}


ostream & operator <<(ostream &s, WImpurity &imp)
{
    // Writes imp to s as a bracketed expression.  The layout depends on
    // the impurity type imp.t:
    //   float      : (stddev mean)
    //   vector     : (( per-channel groups ) a.mean)
    //   trajectory : (( one bracketed line per frame ) a.mean)
    //   cluster    : (((member mean-distance) ...) a.mean)
    //   ols        : (((feature coefficient) ...) correlation)
    //   class      : ((name prob) ... most-probable-name)
    //   otherwise  : ([WImpurity unset])
    int ch,row;

    if (imp.t == wnim_float)
    {
        s << "(" << imp.a.stddev() << " " << imp.a.mean() << ")";
    }
    else if (imp.t == wnim_vector)
    {
        EST_Litem *m, *cnt;

        s << "((";
        // Called after the opening brackets have gone out, as before
        imp.vector_impurity();

        if (wgn_vertex_output == "mean")
        {
            // One "(mean stddev)" group per channel of the vertex track
            EST_SuffStats chan_stats;

            for (ch=0; ch<wgn_VertexTrack.num_channels(); ch++)
            {
                chan_stats.reset();

                // Members and their counts are walked in step
                m = imp.members.head();
                cnt = imp.member_counts.head();
                while (m != 0)
                {
                    chan_stats.cumulate(wgn_VertexTrack.a(imp.members.item(m),ch),
                                        imp.member_counts.item(cnt));
                    m = m->next();
                    cnt = cnt->next();
                }

                s << "(" << chan_stats.mean() << " ";
                if (isfinite(chan_stats.stddev()))
                    s << chan_stats.stddev() << ")";
                else
                    s << "0.001" << ")";      // stand-in for a non-finite stddev

                if (ch+1<wgn_VertexTrack.num_channels())
                    s << " ";
            }
        }
        else
        {
            // Print the member vector nearest the centre rather than the
            // average, paired with the per-channel standard deviations.
            double lowest = WGN_HUGE_VAL;
            double sqdist,diff;
            int centre_unit = 0;
            EST_SuffStats *per_chan =
                new EST_SuffStats [wgn_VertexTrack.num_channels()+1];

            // Statistics for each channel flagged (> 0.0) in row 0 of
            // the vertex features
            for (ch=0; ch<wgn_VertexFeats.num_channels(); ch++)
            {
                if (wgn_VertexFeats.a(0,ch) > 0.0)
                {
                    per_chan[ch].reset();
                    for (m=imp.members.head(); m != 0; m=m->next())
                        per_chan[ch] += wgn_VertexTrack.a(imp.members.item(m),ch);
                }
            }

            // Pick the member with the least squared distance from the
            // channel means over the flagged channels
            for (m=imp.members.head(); m != 0; m=m->next())
            {
                sqdist = 0.0;
                for (ch=0; ch<wgn_VertexFeats.num_channels(); ch++)
                {
                    if (wgn_VertexFeats.a(0,ch) > 0.0)
                    {
                        // Not divided by the stddev (seemed worse, 061218)
                        diff = wgn_VertexTrack.a(imp.members.item(m),ch)
                               - per_chan[ch].mean();
                        sqdist += diff*diff;
                    }
                }

                if (sqdist < lowest)
                {
                    centre_unit = imp.members.item(m);
                    lowest = sqdist;
                }
            }

            for (ch=0; ch<wgn_VertexTrack.num_channels(); ch++)
            {
                s << "( " << wgn_VertexTrack.a(centre_unit,ch) << " ";
                if (isfinite(per_chan[ch].stddev()))
                    s << per_chan[ch].stddev();
                else
                    s << "0";
                s << " ) ";

                if (ch+1<wgn_VertexTrack.num_channels())
                    s << " ";
            }

            delete [] per_chan;
        }

        s << ") " << imp.a.mean() << ")";
    }
    else if (imp.t == wnim_trajectory)
    {
        s << "((";
        // Called after the opening brackets have gone out, as before
        imp.trajectory_impurity();

        // One bracketed line per trajectory row, one group per channel
        for (row=0; row<imp.l; row++)
        {
            s << "(";
            for (ch=0; ch<wgn_VertexTrack.num_channels(); ch++)
            {
                s << "(" << imp.trajectory[row][ch].mean() << " "
                  << imp.trajectory[row][ch].stddev() << " " << ")";
            }
            s << ")\n";
        }

        // Closing bracket followed by the mean held in imp.a
        s << ") " << imp.a.mean() << ")";
    }
    else if (imp.t == wnim_cluster)
    {
        s << "((";
        for (EST_Litem *m=imp.members.head(); m != 0; m=m->next())
        {
            // Each member together with its mean distance to the others
            s << "(" << imp.members.item(m) << " "
              << imp.cluster_member_mean(imp.members.item(m)) << ")";
            if (m->next() != 0)
                s << " ";
        }

        // Closing bracket followed by the mean held in imp.a
        s << ") " << imp.a.mean() << ")";
    }
    else if (imp.t == wnim_ols)
    {
        // Feature names with their coefficients, then the correlation
        EST_FMatrix X,Y;
        EST_IVector included;
        EST_StrList feat_names;
        EST_FMatrix coeffsl;
        EST_FMatrix pred;
        float cor=0.0,rmse;

        s << "((";

        // Load the sample members into matrices for ols
        part_to_ols_data(X,Y,included,feat_names,imp.members,*(imp.data));

        if (robust_ols(X,Y,included,coeffsl))
        {
            ols_apply(X,coeffsl,pred);
            ols_test(Y,pred,cor,rmse);

            for (row=0; row<coeffsl.num_rows(); row++)
                s << "(" << feat_names.nth(row) << " " << coeffsl[row] << ") ";
        }
        else
        {
            // shouldn't happen
            printf("no robust ols\n");
        }

        // cor keeps its initial 0.0 when no model could be fitted
        s << ") " << cor << ")";
    }
    else if (imp.t == wnim_class)
    {
        EST_Litem *entry;
        EST_String name;
        double prob;

        s << "(";
        for (entry=imp.p.item_start();
             !imp.p.item_end(entry);
             entry=imp.p.item_next(entry))
        {
            imp.p.item_prob(entry,name,prob);
            s << "(" << name << " " << prob << ") ";
        }

        s << imp.p.most_probable(&prob) << ")";
    }
    else
    {
        s << "([WImpurity unset])";
    }

    return s;
}