/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Alan W Black                                */
/*                 Date   :  May 1996                                    */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_held_out = 0;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a file, one vector per line.
    // Numeric fields are read as numbers; non-numeric fields are
    // treated as categorical values.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)    /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)     /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that has   */
                /* an index into VertexTrack and a number of vertices.   */
                /* Thus if it's 15, UnitTrack.a(15,0) is the start frame */
                /* in VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit                                    */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains too many parameters, expected "
                 << dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}

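/* Example (illustrative only; the field names and values below are
   hypothetical): given a description file declaring three fields

       ((duration float) (stress 0 1) (phone a e i o u))

   a matching data file holds one vector per line, e.g.

       0.153 1 a
       0.270 0 e

   "duration" is parsed with atof() as a float field, while "stress"
   and "phone" are looked up in their discrete value sets. */
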
float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build the initial node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

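#if 0
/* Illustrative sketch only (not part of the build): one way the tree
   builder might be driven once the description and data have been
   loaded.  The function name example_build is hypothetical. */
static WNode *example_build(void)
{
    float score = 0.0;
    // builds, optionally held-out prunes, prunes, and scores the tree
    WNode *tree = wgn_build_tree(score);
    cout << "tree score " << score << endl;
    return tree;
}
#endif
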
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // If in is true, set the data omitting held_out percent of the
    // vectors; if in is false, include every vector (the stricter
    // complement selection is in the commented-out code below)
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//        else if ((!in) && ((j%100 < held_out)))
//            data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
//        if ((in) && (j < held_out))
//            data[i++] = ds(d);
//        else if ((!in) && (j >= held_out))
//            data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}

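/* Worked example (hypothetical numbers): with held_out = 10 and
   in = TRUE, vectors whose index j has j%100 in 0..9 are skipped, so
   roughly 90% of the data is kept for building; with in = FALSE every
   vector is included (the stricter complement selection is the
   commented-out code above). */
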
static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0,bpredicted=0,bactual=0;
    float precision=0,recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should go to output, not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Negated so that bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall
             << " B-Pred:" << bpredicted << " B-Actual:" << bactual
             << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}

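/* Worked example of the summary statistics above (hypothetical
   numbers): with total = 100 weighted samples and summed weighted log
   probability H = -120, the reported entropy is -H/total = 1.2 and the
   perplexity is pow(2.0,1.2), about 2.3.  For B_NB_F1, with
   bpredicted = 40, bactual = 50 and bcorrect = 30:
       precision = 30/40 = 0.75
       recall    = 30/50 = 0.60
       F1        = 2*0.75*0.60/(0.75+0.60), about 0.667 */
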
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict,actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

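/* The guarded computation above is Pearson's correlation
       r = (E[xy] - E[x]E[y]) / sqrt((E[xx] - E[x]^2) * (E[yy] - E[y]^2))
   with the two variances multiplied together before taking a single
   sqrt, so that a tiny (or numerically negative) variance yields r = 0
   rather than a floating point exception from sqrt of a negative. */
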
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict,actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)  // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                    << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
             << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//        (sqrt(xx.mean()-(x.mean()*x.mean())) *
//         sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, and thus cause the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // negated so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split the given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
           q.get_score(),
           node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // OK, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

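/* Worked example of the acceptance test above (hypothetical numbers):
   with node impurity 0.80, a best question scoring 0.65 is accepted
   and the node is split; a score of 0.85 (or WGN_HUGE_VAL when no
   valid question exists) makes the node a leaf and returns FALSE. */
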
void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    y.resize(q.get_yes());
    n.resize(q.get_no());

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);

}

static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the split of the data for each member of
    // the class, then order those splits, then go through finding
    // where the best split of that ordered list is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one, so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns a sorted list of (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score
    // Naively does this by partitioning the range into
    // wgn_float_range_split slots
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some initial value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the */
    /* increments, because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

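/* Worked example (hypothetical values): if feat ranges over [0.0,10.0]
   and wgn_float_range_split = 10, then incr = 1.0 and the candidate
   questions are "feat < 1.0", "feat < 2.0", ..., up to "feat < 10.0";
   the threshold giving the lowest combined impurity is returned in
   ques. */
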
static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // Construct a question.  Not sure about this in general: of course
    // continuous/categorical features will require a different rule,
    // and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // Score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if ((ignorenth < 2) ||
            (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    // printf("awb_debug score_question_set X %f Y %f\n",
    //        y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /* cout << q << endl;
       printf("test question y %f n %f b %f\n",
              ym, nm, bm); */

    return bm/2.0;
}

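/* Worked example of the min_cluster logic above (hypothetical
   numbers): with wgn_min_cluster_size = 50, wgn_balance = 10 and
   ds.n() = 1000, each side of a split must hold at least
   1000/10 = 100 samples; with ds.n() = 300, 300/10 = 30 is below 50,
   so the wgn_min_cluster_size floor of 50 applies instead. */
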
float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature, then incrementally add the features
    // that most improve the result, until no addition improves it.
    // This basically automates what Kurt was doing when building
    // trees by hand; he then automated it in Perl, and as it seemed
    // to work I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // Skip this round: not because this particular feature is
            // (user specified) ignored, but because it means there is
            // one less cycle that is necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}

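/* Worked example of the stopping test above (hypothetical numbers):
   with limit = 5 (percent), current best_score = 0.700 and a new
   bscore = 0.720, the test computes 0.720 - |0.720*0.05| = 0.684,
   which is <= 0.700, so the roughly 3% improvement is considered too
   small and the stepwise loop stops. */
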
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue;  // user wants me to ignore this completely
        else if (i == wgn_predictee)  // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//                fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
//                        i,
//                        (const char *)wgn_dataset.feat_name(i),
//                        best_score);
//                fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}