docs/speech_tools-2.4.0/EST__Ngrammar_8h_source.html

/*************************************************************************/

/*                                                                       */

/*                Centre for Speech Technology Research                  */

/*                     University of Edinburgh, UK                       */

/*                         Copyright (c) 1996                            */

/*                        All Rights Reserved.                           */

/*                                                                       */

/*  Permission is hereby granted, free of charge, to use and distribute  */

/*  this software and its documentation without restriction, including   */

/*  without limitation the rights to use, copy, modify, merge, publish,  */

/*  distribute, sublicense, and/or sell copies of this work, and to      */

/*  permit persons to whom this work is furnished to do so, subject to   */

/*  the following conditions:                                            */

/*   1. The code must retain the above copyright notice, this list of    */

/*      conditions and the following disclaimer.                         */

/*   2. Any modifications must be clearly marked as such.                */

/*   3. Original authors' names are not deleted.                         */

/*   4. The authors' names are not used to endorse or promote products   */

/*      derived from this software without specific prior written        */

/*      permission.                                                      */

/*                                                                       */

/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */

/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */

/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */

/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */

/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */

/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */

/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */

/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */

/*  THIS SOFTWARE.                                                       */

/*                                                                       */

/*************************************************************************/

/*                     Author :  Simon King & Alan W Black               */

/*                     Date   :  February 1997                           */

/*-----------------------------------------------------------------------*/

/*                                                                       */

/* A general class for ngrams (bi-gram, tri-gram etc)                    */

/*                                                                       */

/*=======================================================================*/

#ifndef __EST_NGRAMMAR_H__

#define __EST_NGRAMMAR_H__


#include <cstdarg>

#include <cstdlib>


using namespace std;


#include "EST_String.h"

#include "EST_Val.h"

#include "EST_rw_status.h"

#include "EST_types.h"

#include "EST_FMatrix.h"

#include "EST_TList.h"

#include "EST_StringTrie.h"

#include "EST_simplestats.h"

#include "EST_PST.h"

#include "EST_string_aux.h"

#include "EST_math.h"


// HTK style

#define SENTENCE_START_MARKER "!ENTER"

#define SENTENCE_END_MARKER "!EXIT"

#define OOV_MARKER "!OOV"


#define EST_NGRAMBIN_MAGIC 1315402337


// for compressed save/load

#define GZIP_FILENAME_EXTENSION "gz"

#define COMPRESS_FILENAME_EXTENSION "Z"


// Ultimate floor

#define TINY_FREQ 1.0e-10


// ngram state - represents the N-1 word history and contains

// the pdf of the next word


class EST_NgrammarState {


private:


protected:

    EST_DiscreteProbDistribution p_pdf;

    int p_id; // a 'name'


public:

    EST_NgrammarState() :


      p_pdf()


      {

    init();

      };

    EST_NgrammarState(int id,EST_Discrete *d){clear();init(id,d);};

    EST_NgrammarState(int id,const EST_DiscreteProbDistribution &pdf)

              {clear();init(id,pdf);};

    EST_NgrammarState(const EST_NgrammarState &s);

    EST_NgrammarState(const EST_NgrammarState *const s);

    ~EST_NgrammarState();


    EST_IVector path;  // how we got here


    // initialise

    void clear();

    void init();

    void init(int id, EST_Discrete *d);

    void init(int id, const EST_DiscreteProbDistribution &pdf);


    // build

    void cumulate(const int index, const double count=1)

                  {p_pdf.cumulate(index,count);};

    void cumulate(const EST_String &word, const double count=1)

                  {p_pdf.cumulate(word,count);};


    // access

    int id() const {return p_id; };

    const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };

    EST_DiscreteProbDistribution &pdf() {return p_pdf; };

    double probability(const EST_String &w) const

      {return p_pdf.probability(w);}

    double probability(int w) const {return p_pdf.probability(w);}

    double frequency(const EST_String &w) const

      {return p_pdf.frequency(w);}

    double frequency(int w) const {return p_pdf.frequency(w);}

    const EST_String &most_probable(double *prob = NULL) const

      {return p_pdf.most_probable(prob);}


friend ostream&  operator<<(ostream& s, const EST_NgrammarState &a);


};


class EST_BackoffNgrammarState {


private:


protected:

  int p_level; // = 0 for root node

  double backoff_weight;

  EST_DiscreteProbDistribution p_pdf;

  EST_StringTrie children;


  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,

                      const EST_StrVector &words);

  EST_BackoffNgrammarState* add_child(const EST_Discrete *d,

                      const EST_IVector &words);

public:

  EST_BackoffNgrammarState()

    { init(); };

  EST_BackoffNgrammarState(const EST_Discrete *d,int level)

    {clear();init(d,level);};

  EST_BackoffNgrammarState(const EST_DiscreteProbDistribution &pdf,int level)

    {clear();init(pdf,level);};

  EST_BackoffNgrammarState(const EST_BackoffNgrammarState &s);

  EST_BackoffNgrammarState(const EST_BackoffNgrammarState *const s);

  ~EST_BackoffNgrammarState();


  // initialise

  void clear();

  void init();

  void init(const EST_Discrete *d, int level);

  void init(const EST_DiscreteProbDistribution &pdf, int level);


  // build

  bool accumulate(const EST_StrVector &words,

          const double count=1);

  bool accumulate(const EST_IVector &words,

          const double count=1);

  // access

  const EST_DiscreteProbDistribution &pdf_const() const {return p_pdf; };

  EST_DiscreteProbDistribution &pdf() {return p_pdf; };

  double probability(const EST_String &w) const

    {return p_pdf.probability(w);}

  double frequency(const EST_String &w) const

    {return p_pdf.frequency(w);}

  const EST_String &most_probable(double *prob = NULL) const

    {return p_pdf.most_probable(prob);}


  const int level() const {return p_level;}


  EST_BackoffNgrammarState* get_child(const EST_String &word) const

    {

    return (EST_BackoffNgrammarState*)children.lookup(word);

    }

  EST_BackoffNgrammarState* get_child(const int word) const

    {

    return (EST_BackoffNgrammarState*)children.lookup(p_pdf.get_discrete()->name(word));

    }


  void remove_child(EST_BackoffNgrammarState *child,

            const EST_String &name);


  // recursive delete of contents and children

  void zap();


  const EST_BackoffNgrammarState *const get_state(const EST_StrVector &words) const;


  bool ngram_exists(const EST_StrVector &words,

            const double threshold) const;

  const double get_backoff_weight() const {return backoff_weight; }

  const double get_backoff_weight(const EST_StrVector &words) const;

  bool set_backoff_weight(const EST_StrVector &words, const double w);

  void frequency_of_frequencies(EST_DVector &ff);


  void print_freqs(ostream &os,const int order,EST_String followers="");


friend ostream&  operator<<(ostream& s, const EST_BackoffNgrammarState &a);


};


class EST_Ngrammar {


public:


    // 3 representations : sparse, dense and backed off. User specifies which.

    enum representation_t {sparse, dense, backoff};


    // now only keep frequencies (or log frequencies)

    // probabilities (or log probabilities) can be done

    // on the fly quickly enough

    enum entry_t {frequencies, log_frequencies};


protected:


    // each instance of an EST_Ngrammar is a grammar of fixed order

    // e.g. a bigram (order = 2)

    int p_order;

    int p_num_samples;


    double p_number_of_sentences; // which were used to build this grammar


    EST_String p_sentence_start_marker;

    EST_String p_sentence_end_marker;


    // only one representation in use at a time

    representation_t p_representation;

    entry_t p_entry_type;


    // sparse representation is a tree structure

    // holding only those N-grams which were seen

    EST_PredictionSuffixTree sparse_representation;

    bool init_sparse_representation();


    // dense representation is just an array of all states

    bool init_dense_representation();


    // backoff representation is also a tree structure

    // but the root state pdf is the most recent word in the

    // ngram and going down the tree is going back in time....

    // here is the root node :

    EST_BackoffNgrammarState *backoff_representation;


    double backoff_threshold;


    // need a non-zero unigram floor to enable backing off

    double backoff_unigram_floor_freq;


    // instead of simple discounting, we have a (possibly) different

    // discount per order and per frequency

    // e.g. backoff_discount[2](4) contains the discount to be

    // applied to a trigram frequency of 4

    // backoff_discount[0] is unused (we don't discount unigrams)

    EST_DVector *backoff_discount;

    const double get_backoff_discount(const int order, const double freq) const;


    bool init_backoff_representation();

    void prune_backoff_representation(EST_BackoffNgrammarState *start_state=NULL); // remove any zero frequency branches

    void backoff_restore_unigram_states();

    int p_num_states; // == p_vocab_size ^ (p_ord-1) if fully dense

    EST_NgrammarState *p_states; // state id is index into this array

    int find_dense_state_index(const EST_IVector &words, int index=0) const;


    // and the reverse

    const EST_StrVector &make_ngram_from_index(const int i) const;


    // vocabulary

    EST_Discrete *vocab;

    EST_Discrete *pred_vocab;  // may be different from state vocab

    bool init_vocab(const EST_StrList &wordlist);

    bool init_vocab(const EST_StrList &word_list,

            const EST_StrList &pred_list);


    // make sure vocab matches a given wordlist

    bool check_vocab(const EST_StrList &wordlist);


    EST_DiscreteProbDistribution vocab_pdf;  // overall pdf


    const EST_String &lastword(const EST_StrVector &words) const

        { return words(p_order-1); }

    const int lastword(const EST_IVector &words) const

        { return words(p_order-1); }

    // are we allowing out-of-vocabulary words, or is the vocabulary closed?

    bool allow_oov;


    bool sparse_to_dense();

    bool dense_to_sparse();


    // these aren't sorted yet ...

    void take_logs();

    void take_exps();

    void freqs_to_probs(); // just calls normalise


    bool build_sparse(const EST_String &filename,

              const EST_String &prev,

              const EST_String &prev_prev,

              const EST_String &last);

    // for dense and backoff

    bool build_ngram(const EST_String &filename,

             const EST_String &prev,

             const EST_String &prev_prev,

             const EST_String &last,

             const EST_String &input_format);


    // go through all matching ngrams ( *(ngram[i])="" matches anything )

    void iterate(EST_StrVector &words,

                 void (*function)(EST_Ngrammar *n,

                                  EST_StrVector &words,

                                  void *params),

         void *params);


    // same, but with a constant Ngrammar

    void const_iterate(EST_StrVector &words,

               void (*function)(const EST_Ngrammar *const n,

                    EST_StrVector &words,

                    void *params),

               void *params) const;


    bool p_init(int o, representation_t r);


    // new filename returned of we had to copy stdin to a

    // temporary file - must delete it later !

    bool oov_preprocess(const EST_String &filename,

            EST_String &new_filename,

            const EST_String &what);


    const EST_NgrammarState &find_state_const(const EST_StrVector &words)const;

    EST_NgrammarState &find_state(const EST_StrVector &words);

    const EST_NgrammarState &find_state_const(const EST_IVector &words) const;

    EST_NgrammarState &find_state(const EST_IVector &words);


    // special versions for backoff grammars

    const EST_DiscreteProbDistribution &backoff_prob_dist(const EST_StrVector &words) const;

    const double backoff_reverse_probability_sub(const EST_StrVector &words,

                    const EST_BackoffNgrammarState *root) const;

    const double backoff_probability(const EST_StrVector &words,

                     const bool trace=false) const;

    const double backoff_reverse_probability(const EST_StrVector &words) const;

    const EST_String & backoff_most_probable(const EST_StrVector &words,

                         double *prob = NULL) const;


    // backoff representation isn't a nice array of states

    // so use this to visit every node in the tree

    // and apply the function to that node

    void backoff_traverse(EST_BackoffNgrammarState *start_state,

              void (*function)(EST_BackoffNgrammarState *s,

                       void *params),

              void *params);


    // visit every node at a given level

    void backoff_traverse(EST_BackoffNgrammarState *start_state,

              void (*function)(EST_BackoffNgrammarState *s,

                       void *params),

              void *params, const int level);

public:


    EST_Ngrammar() {default_values();}


    EST_Ngrammar(int o, representation_t r,

         const EST_StrList &wordlist)

    {

    default_values(); init(o,r,wordlist);

    }


    // When state trans vocab differs from prediction vocab

    EST_Ngrammar(int o, representation_t r,

         const EST_StrList &wordlist,

         const EST_StrList &predlist)

    {

    default_values(); init(o,r,wordlist,predlist);

    }


    EST_Ngrammar(int o, representation_t r, EST_Discrete &v)

    {

    default_values(); init(o,r,v);

    }

    ~EST_Ngrammar();


    void default_values();

    void clear();

    bool init(int o, representation_t r,

          const EST_StrList &wordlist);

    bool init(int o, representation_t r,

          const EST_StrList &wordlist,

          const EST_StrList &predlist);

    bool init(int o, representation_t r, EST_Discrete &v);

    bool init(int o, representation_t r,

          EST_Discrete &v,EST_Discrete &pv);


    // access

    int num_states(void) const { return p_num_states;}

    double samples(void) const { return p_num_samples;}

    int order() const { return p_order; }

    int get_vocab_length() const { return vocab?vocab->length():0; }

    EST_String get_vocab_word(int i) const;

    int get_vocab_word(const EST_String &s) const;

    int get_pred_vocab_length() const { return pred_vocab->length(); }

    EST_String get_pred_vocab_word(int i) const { return pred_vocab->name(i); }

    int get_pred_vocab_word(const EST_String &s) const

       { return pred_vocab->name(s); }

    int closed_vocab() const {return !allow_oov; }

    entry_t entry_type() const {return p_entry_type;}

    representation_t representation() const

       { return p_representation;}


    // build

    bool build(const EST_StrList &filenames,

           const EST_String &prev = SENTENCE_START_MARKER,

           const EST_String &prev_prev = SENTENCE_END_MARKER,

           const EST_String &last = SENTENCE_END_MARKER,

           const EST_String &input_format = "",

           const EST_String &oov_mode = "",

           const int mincount=1,

           const int maxcount=10);


    // Accumulate ngrams

    void accumulate(const EST_StrVector &words,

            const double count=1);

    //const int index=0);

    void accumulate(const EST_IVector &words,

            const double count=1);

    //const int index=0);


    // hack - fix enter/exit probs s.t. P(...,!ENTER)=P(!EXIT,...)=0, for all x

    void make_htk_compatible();


    // I/O functions

    EST_read_status load(const EST_String &filename);

    EST_read_status load(const EST_String &filename, const EST_StrList &wordlist);

    EST_write_status save(const EST_String &filename,

              const EST_String type="cstr_ascii",

              const bool trace=false,

              double floor=0.0);


    int wordlist_index(const EST_String &word, const bool report=true) const;

    const EST_String &wordlist_index(int i) const;

    int predlist_index(const EST_String &word) const;

    const EST_String &predlist_index(int i) const;


    // set

    bool set_entry_type(entry_t new_type);

    bool set_representation(representation_t new_representation);


    // probability distributions

    // -------------------------

    // flag 'force' forces computation of probs on-the-fly if necessary

    double probability(const EST_StrVector &words, bool force=false,

               const bool trace=false) const;

    double frequency(const EST_StrVector &words, bool force=false,

             const bool trace=false) const;


    const EST_String &predict(const EST_StrVector &words,

                  double *prob,int *state) const;

    const EST_String &predict(const EST_StrVector &words) const

       {double p; int state; return predict(words,&p,&state); }

    const EST_String &predict(const EST_StrVector &words,double *prob) const

       {int state; return predict(words,prob,&state); }


    const EST_String &predict(const EST_IVector &words,double *prob,int *state) const;

    const EST_String &predict(const EST_IVector &words) const

       {double p; int state; return predict(words,&p,&state); }

    const EST_String &predict(const EST_IVector &words,double *prob) const

       {int state; return predict(words,prob,&state); }


    int find_state_id(const EST_StrVector &words) const;

    int find_state_id(const EST_IVector &words) const;

    int find_next_state_id(int state, int word) const;

    // fast versions for common N

    //const double probability(const EST_String w1);

    //const double probability(const EST_String w1,const EST_String w2);

    //const double probability(const EST_String w1,const EST_String w2,

    //const EST_String w2);


    // reverse - probability of words[0..order-2] given word[order-1]

    double reverse_probability(const EST_StrVector &words,

                   bool force=false) const;

    double reverse_probability(const EST_IVector &words,

                   bool force=false) const;


    // predict, where words has 'order' elements and the last one is "" or NULL

    const EST_DiscreteProbDistribution &prob_dist(const EST_StrVector &words) const;

    const EST_DiscreteProbDistribution &prob_dist(const EST_IVector &words) const;

    const EST_DiscreteProbDistribution &prob_dist(int state) const;


//    bool stats(const EST_String filename,

//         double &raw_entropy, double &count,

//         double &entropy, double &perplexity,

//         const EST_String &prev = SENTENCE_START_MARKER,

//         const EST_String &prev_prev = SENTENCE_END_MARKER,

//         const EST_String &last = SENTENCE_END_MARKER,

//               const EST_String &input_format = "") const;


    void fill_window_start(EST_IVector &window,

               const EST_String &prev,

               const EST_String &prev_prev) const;


    void fill_window_start(EST_StrVector &window,

               const EST_String &prev,

               const EST_String &prev_prev) const;


    // why anybody would want to do this ....

    //EST_Ngrammar &operator =(const EST_Ngrammar &a);


    bool ngram_exists(const EST_StrVector &words) const;

    bool ngram_exists(const EST_StrVector &words, const double threshold) const;

    const double get_backoff_weight(const EST_StrVector &words) const;

    bool set_backoff_weight(const EST_StrVector &words, const double w);


    void print_freqs(ostream &os,double floor=0.0);


    // i/o functions

    // -------------

    friend ostream& operator<<(ostream& s, EST_Ngrammar &n);

    friend EST_read_status load_ngram_htk_ascii(const EST_String filename,

                        EST_Ngrammar &n);

    friend EST_read_status load_ngram_htk_binary(const EST_String filename,

                         EST_Ngrammar &n);

    friend EST_read_status load_ngram_arpa(const EST_String filename,

                       EST_Ngrammar &n,

                       const EST_StrList &vocab);

    friend EST_read_status load_ngram_cstr_ascii(const EST_String filename,

                         EST_Ngrammar &n);

    friend EST_read_status load_ngram_cstr_bin(const EST_String filename,

                           EST_Ngrammar &n);


    friend EST_write_status save_ngram_htk_ascii_sub(const EST_String &word,

                             ostream *ost,

                             EST_Ngrammar &n,

                             double floor);

    friend EST_write_status save_ngram_htk_ascii(const EST_String filename,

                         EST_Ngrammar &n,

                         double floor);


    //friend EST_write_status save_ngram_htk_binary(const EST_String filename,

    //                    EST_Ngrammar &n);

    friend EST_write_status save_ngram_cstr_ascii(const EST_String filename,

                          EST_Ngrammar &n,

                          const bool trace,

                          double floor);

    friend EST_write_status save_ngram_cstr_bin(const EST_String filename,

                        EST_Ngrammar &n,

                        const bool trace,

                        double floor);

    friend EST_write_status save_ngram_arpa(const EST_String filename,

                        EST_Ngrammar &n);

    friend EST_write_status save_ngram_arpa_sub(ostream *ost,

                        EST_Ngrammar &n,

                        const EST_StrVector &words);

    friend EST_write_status save_ngram_wfst(const EST_String filename,

                        EST_Ngrammar &n);


    // Auxiliary functions


    // smoothing

friend void frequency_of_frequencies(EST_DVector &ff, EST_Ngrammar &n,int this_order);

friend void map_frequencies(EST_Ngrammar &n, const EST_DVector &map, const int this_order);

friend bool Good_Turing_smooth(EST_Ngrammar &n, int maxcount, int mincount);

friend void Good_Turing_discount(EST_Ngrammar &ngrammar, const int maxcount,

                 const double default_discount);


friend void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,

                    EST_Ngrammar &ngram);

friend int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,

                 EST_Ngrammar &ngram, int smooth_thresh);


    // frequencies below mincount get backed off

    // frequencies above maxcount are not smoothed(discounted)

    bool compute_backoff_weights(const int mincount=1,

                 const int maxcount=10);


    bool merge(EST_Ngrammar &n,float weight);


friend class EST_BackoffNgrammar;


};


void Ngram_freqsmooth(EST_Ngrammar &ngram,

              int smooth_thresh1,

              int smooth_thresh2);


// utils

void slide(EST_IVector &i, const int l);

void slide(EST_StrVector &i, const int l);


bool test_stats(EST_Ngrammar &ngram,

        const EST_String &filename,

        double &raw_entropy,

        double &count,

        double &entropy,

        double &perplexity,

        const EST_String &input_format,

        const EST_String &prev = SENTENCE_START_MARKER,

        const EST_String &prev_prev = SENTENCE_END_MARKER,

        const EST_String &last = SENTENCE_END_MARKER);


VAL_REGISTER_CLASS_DCLS(ngrammar,EST_Ngrammar)


#endif // __EST_NGRAMMAR_H__