Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
token_example.cc
1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /************************************************************************/
33 /* Author: Alan W Black */
34 /* Date: May 1997 */
35 /************************************************************************/
36 /* */
37 /* Example of reading a file using the tokenizer */
38 /* */
39 /************************************************************************/
40 
41 #include <cstdlib>
42 #include "EST_Token.h"
43 
#if defined(DATAC)
// DATAC is expected to be supplied by the build as a bare (unquoted)
// token sequence naming the data directory.  Turning it into a string
// literal needs two macro levels: the # operator suppresses expansion
// of its operand, so a single-level macro would yield the literal text
// "DATAC" instead of the directory DATAC stands for.
#    define EST_STRINGIZE_TOKENS(X) #X
#    define __STRINGIZE(X) EST_STRINGIZE_TOKENS(X)
#    define DATA __STRINGIZE(DATAC)
#endif
48 
49 int main(int argc,char **argv)
50 {
51  // Simple program to read all the tokens in the named file
52  // a print a summary of them
53  EST_TokenStream ts;
54  int tokens, alices, quotes;
55  EST_Token t;
56  EST_String fname;
57 
58  if (argc > 2)
59  {
60  cerr << argv[0] << ": wrong number of arguments\n";
61  exit(-1);
62  }
63  else if (argc == 2)
64  fname = argv[1];
65  else
66  fname = DATA "/alice";
67 
68  if (ts.open(fname) == -1)
69  {
70  cerr << argv[0] << ": can't open input file \"" << argv[1] <<
71  "\"\n";
72  exit(-1);
73  }
74 
75  // Control of whitespace characters, single character symbols,
76  // pre and post punctuation may be set here.
77 
78  // The defaults are standard whitespace, and nothing for the rest
79  // (this is like awk's basic tokenizer). For language analysis
80  // you'll probably want to modify the punctuation
81  // \173 is '{', it is inserted by number because of a doc++ problem.
82 
83  ts.set_PrePunctuationSymbols("\173[(\"'");
84  ts.set_PunctuationSymbols(EST_Token_Default_PunctuationSymbols);
85 
86  // Note you may set quotes so quoted tokens are read as single
87  // tokens (a la C)
88 
89  for (tokens=quotes=alices=0; !ts.eof(); tokens++)
90  {
91  t = ts.get();
92  if (t == "Alice")
93  alices++;
94  if (t.prepunctuation().contains("\""))
95  quotes++;
96  }
97 
98  printf("Input file contains:\n");
99  printf(" %5d tokens\n",tokens);
100  printf(" %5d tokens preceeded by double quotes\n",quotes);
101  printf(" %5d occurrences of Alice\n",alices);
102 
103  return 0;
104 }
105 
106