Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
sig2fv_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Paul Taylor and Simon King */
34 /* Date : April 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* Generate feature vectors */
37 /* */
38 /*=======================================================================*/
39 
40 #include <cstdlib>
41 #include "EST_speech_class.h"
42 #include "EST_string_aux.h"
43 #include "EST_cmd_line.h"
44 #include "EST_cmd_line_options.h"
45 #include "sigpr/EST_sigpr_utt.h"
46 #include "sigpr/EST_filter.h"
47 
// Small tolerance constant; not referenced by the code visible in this file.
#define EPSILON (0.0001)

// Built-in defaults installed by set_options() before any command line
// overrides are applied.
#define DEFAULT_FRAME_SIZE 0.01     // stored as "frame_shift" (frame spacing in seconds)
#define DEFAULT_FRAME_FACTOR 2.0    // stored as "frame_factor"
#define DEFAULT_LPC_ORDER 16        // stored as "lpc_order"
#define DEFAULT_REF_ORDER 16        // stored as "ref_order"
#define DEFAULT_CEP_ORDER 12        // stored as "cep_order"
#define DEFAULT_FBANK_ORDER 20      // stored as "fbank_order"
#define DEFAULT_MELCEP_ORDER 12     // stored as "melcep_order"
// String literal: also spliced directly into the usage text in main().
#define DEFAULT_WINDOW "hamming"
#define DEFAULT_PREEMPH 0           // stored as "preemph"
#define DEFAULT_LIFTER 0            // stored as "lifter"


// sane values for pitchmarks (in seconds)

#define MINIMUM_PITCH_PERIOD (0.0033) // 300 hz
#define MAXIMUM_PITCH_PERIOD (0.02) // 50 Hz
#define DEFAULT_PITCH_PERIOD (0.01) // 100 Hz

// Copies each integer in olist into op under the key "<name>_order", pairing
// it with the name at the same position in clist (defined later in this file).
void calculate_orders(EST_StrList &clist, EST_IList &olist,
 EST_Option &op);

// Called from main() once each for the base, delta and acceleration lists
// with order 0, 1 and 2 respectively. The definition is not in this file.
// NOTE(review): presumably appends the output channel names for each
// requested type to map - confirm against the definition.
void add_channels_to_map(EST_StrList &map, EST_StrList &types,
 EST_Features &op, int order);

// Fills op with the defaults above and then applies any overrides found in
// the parsed command line al (defined later in this file).
void set_options(EST_Features &op, EST_Option &al);
75 
76 EST_String sigpr_options_supported(void)
77 {
78  return
79  EST_String("")+
80  " lpc linear predictive coding\n"
81  " cep cepstrum coding from lpc coefficients\n"
82  " melcep Mel scale cepstrum coding via fbank\n"
83  " fbank Mel scale log filterbank analysis\n"
84  " lsf line spectral frequencies\n"
85  " ref Linear prediction reflection coefficients\n"
86  " power\n"
87  " f0\n"
88  " energy: root mean square energy\n";
89 };
90 
91 
92 
93 /** @name <command>sig2fv</command> <emphasis>Generate signal processing coefficients from waveforms</emphasis>
94  * @id sigfv-manual
95  * @toc
96  */
97 
98 //@{
99 
100 /**@name Synopsis
101  */
102 //@{
103 
104 //@synopsis
105 
106 /**
sig2fv performs signal processing feature vector analysis on speech
waveforms.
109 The following types of analysis are provided:
110 
111 <itemizedlist>
112 <listitem><para>Linear prediction (LPC)</para></listitem>
113 <listitem><para>Cepstrum coding from lpc coefficients</para></listitem>
114 <listitem><para>Mel scale cepstrum coding via fbank</para></listitem>
115 <listitem><para>Mel scale log filterbank analysis</para></listitem>
116 <listitem><para>Line spectral frequencies</para></listitem>
117 <listitem><para>Linear prediction reflection coefficients</para></listitem>
118 <listitem><para>Root mean square energy</para></listitem>
119 <listitem><para>Power</para></listitem>
120 <listitem><para>fundamental frequency (pitch)</para></listitem>
121 <listitem><para>calculation of delta and acceleration coefficients of all of the
122 above</para></listitem>
123 </itemizedlist>
124 
125 The -coefs option is used to specify a list of the names of what sort
126 of basic processing is required, and -delta and -acc are used for
127 delta and acceleration coefficients respectively.
128 
129 */
130 
131 //@}
132 
133 /**@name Options
134  */
135 //@{
136 
137 //@options
138 
139 //@}
140 
141 
142 int main(int argc, char *argv[])
143 {
144  EST_String out_file("-");
145  EST_StrList files;
146  EST_Option al;
147  EST_Features op;
148  EST_Wave sig;
149  EST_Track full;
150  EST_StrList coef_list, delta_list, acc_list, tlist, map;
151  EST_IList olist;
152 
153  parse_command_line
154  (argc, argv,
155  EST_String("[input file] -o [output file]\n")+
156  "Summary: generate acoustic feature vectors for a waveform file \n"
157  "use \"-\" to make input and output files stdin/out \n"
158  "-h Options help \n\n" +
159  options_wave_input() +
160  options_track_output() + " \n"
161  "-shift <float> frame spacing in seconds for fixed frame analysis. This \n"
162  " doesn't have to be the same as the output file spacing - the \n"
163  " S option can be used to resample the track before saving \n"
164  " default: "+ftoString(DEFAULT_FRAME_SIZE) +"\n\n"
165  "-factor <float> Frames lengths will be FACTOR times the \n"
166  " local pitch period. \n"
167  " default: "+ftoString(DEFAULT_FRAME_FACTOR) +"\n\n"
168  "-pm <ifile> Pitch mark file name. This is used to \n"
169  " specify the positions of the analysis frames for pitch \n"
170  " synchronous analysis. Pitchmark files are just standard \n"
171  " track files, but the channel information is ignored and \n"
172  " only the time positions are used\n"
173  "-size <float> If specified with pm, size is used as the \n"
174  " fixed window size (times factor) rather than size within \n"
175  " each the pms.\n\n"
176 
177  "-coefs <string> list of basic types of processing required. \n"
178  " Permissable types are: \n" + sigpr_options_supported()+" \n"
179  "-delta <string> list of delta types of processing required. Basic \n"
180  " processing does not need to be specified for this option to work. \n"
181  " Permissable types are: \n" + sigpr_options_supported()+" \n"
182  "-acc <string> list of acceleration (delta delta) processing \n"
183  " required. Basic processing does not need to be specified for \n"
184  " this option to work. \n"
185  " Permissable types are: \n"
186  + sigpr_options_supported()+"\n"
187  "-window_type <string> Type of window used on waveform. \n"
188  " Permissable types are: \n" +
190  " default: "DEFAULT_WINDOW"\n\n"
191  "-lpc_order <int> Order of lpc analysis. \n\n"
192  "-ref_order <int> Order of lpc reflection coefficient analysis. \n\n"
193  "-cep_order <int> Order of lpc cepstral analysis.\n\n"
194  "-melcep_order <int> Order of Mel cepstral analysis.\n\n"
195  "-fbank_order <int> Order of filter bank analysis.\n\n"
196  "-preemph <float> Perform pre-emphasis with this factor.\n\n"
197  "-lifter <float> lifter coefficient.\n\n"
198  "-usepower use power rather than energy in filter bank \n"
199  " analysis\n\n"+
200  "-include_c0 include cepstral coefficient 0\n\n"
201  "-order <string> order of analyses\n", files, al);
202 
203  out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
204  set_options(op, al);
205 
206  StringtoStrList(al.val("-coefs"), coef_list);
207  StringtoStrList(al.val("-delta"), delta_list);
208  StringtoStrList(al.val("-acc"), acc_list);
209 
210  StringtoStrList(al.val("-order"), tlist);
211  StrListtoIList(tlist, olist);
212 
213  if (read_wave(sig, files.first(), al) != read_ok)
214  exit(-1);
215 
216  // allocate and fill time axis
217  if (al.present("-pm"))
218  {
219  if (read_track(full, al.val("-pm"), al))
220  exit(1);
221  }
222  else
223  {
224  full.resize((int)ceil(sig.end() / op.F("frame_shift")), 0);
225  full.fill_time(op.F("frame_shift"));
226  }
227 
228  // allocate channels
229  add_channels_to_map(map, coef_list, op, 0);
230  add_channels_to_map(map, delta_list, op, 1);
231  add_channels_to_map(map, acc_list, op, 2);
232 
233  //cerr << "MAP " << map << endl;
234 
235  full.resize(EST_CURRENT, map);
236 
237  if (al.present("-preemph"))
238  pre_emphasis(sig, al.fval("-preemph"));
239 
240  if(al.present("-usepower"))
241  cerr << "sig2fv: -usepower currently not supported" << endl;
242 
243  sigpr_base(sig, full, op, coef_list);
244  sigpr_delta(sig, full, op, delta_list);
245  sigpr_acc(sig, full, op, acc_list);
246 
247  if (al.present("-S"))
248  {
249  cout << "-S " << al.fval("-S") << endl;
250  full.sample(al.fval("-S"));
251  }
252 
253  if (full.save(out_file, al.val("-otype", 0)) != write_ok)
254  {
255  cerr << "sig2fv: failed to write output to \"" << out_file
256  << "\"" << endl;
257  exit(-1);
258  }
259  return 0;
260 }
261 
262 
263 
264 void calculate_orders(EST_StrList &clist, EST_IList &olist,
265  EST_Option &op)
266 {
267  EST_Litem *c, *o;
268  EST_String k;
269  int v;
270 
271  for (c = clist.head(), o = olist.head(); c && o; c= c->next(), o = o->next())
272  {
273  k = clist(c) + "_order";
274  v = olist(o);
275  op.override_ival(k, v);
276  }
277 }
278 
279 void set_options(EST_Features &op, EST_Option &al)
280 {
281  op.set("frame_shift", DEFAULT_FRAME_SIZE);
282  op.set("frame_factor", DEFAULT_FRAME_FACTOR);
283  op.set("window_type", DEFAULT_WINDOW);
284 
285  op.set("preemph", DEFAULT_PREEMPH);
286  op.set("lifter", DEFAULT_LIFTER);
287 
288  op.set("lpc_order", DEFAULT_LPC_ORDER);
289  op.set("ref_order", DEFAULT_REF_ORDER);
290  op.set("cep_order", DEFAULT_CEP_ORDER);
291  op.set("fbank_order", DEFAULT_FBANK_ORDER);
292  op.set("melcep_order", DEFAULT_MELCEP_ORDER);
293 
294  op.set("max_period", MAXIMUM_PITCH_PERIOD);
295  op.set("min_period", MINIMUM_PITCH_PERIOD);
296  op.set("def_period", DEFAULT_PITCH_PERIOD);
297 
298  if (al.present("-max_period"))
299  op.set("max_period", al.fval("-max_period", 0));
300  if (al.present("-min_period"))
301  op.set("min_period", al.fval("-min_period", 0));
302  if (al.present("-def_period"))
303  op.set("def_period", al.fval("-def_period", 0));
304 
305  if (al.present("-window_type"))
306  op.set("window_type", al.sval("-window_type", 1));
307 
308  if (al.present("-shift"))
309  op.set("frame_shift", al.fval("-shift", 1));
310  if (al.present("-factor"))
311  op.set("frame_factor", al.fval("-factor", 1));
312  if (al.present("-size"))
313  op.set("frame_factor", op.F("frame_factor")*-1.0*al.fval("-size"));
314  if (al.present("-length"))
315  op.set("frame_factor",
316  al.fval("-length", est_errors_allowed)/op.F("frame_shift",est_errors_allowed));
317 
318  if (al.present("-preemph"))
319  op.set("preemph", al.fval("-preemph", 1));
320  if (al.present("-lifter"))
321  op.set("lifter", al.fval("-lifter", 1));
322 
323  if (al.present("-lpc_order"))
324  op.set("lpc_order", al.ival("-lpc_order", 1));
325  if (al.present("-ref_order"))
326  op.set("ref_order", al.ival("-ref_order", 1));
327  if (al.present("-cep_order"))
328  op.set("cep_order", al.ival("-cep_order", 1));
329  if (al.present("-fbank_order"))
330  op.set("fbank_order", al.ival("-fbank_order", 1));
331  if (al.present("-melcep_order"))
332  op.set("melcep_order", al.ival("-melcep_order", 1));
333 
334  if (al.present("-usepower"))
335  op.set("usepower", al.val("-usepower", 1));
336 
337  if (al.present("-include_c0"))
338  op.set("include_c0", al.val("-include_c0", 1));
339 
340 }
341 
342 /**@name Examples
343 
344 
345 Fixed frame basic linear prediction:
346 
347 To produce a set of linear prediction coefficients at every 10ms, using
348 pre-emphasis and saving in EST format:
349 
350 <para>
351 <screen>
352 $ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
353 </screen>
354 </para>
355 <formalpara><title>
Pitch Synchronous linear prediction</title><para>The following uses the set of pitchmarks
in kdt_010.pm as the centres of the analysis windows.
358 </para>
359 </formalpara>
360 
361 <para>
362 <screen>
363 $ sig2fv kdt_010.wav -pm kdt_010.pm -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
364 </screen>
365 </para>
366 
367 <para>
368 F0, Linear prediction and cepstral coefficients:
369 
370 <screen>
371 $ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -shift 0.01
372 </screen>
373 
374 Note that pitchtracking can also be done with the
375 <command>pda</command> program. Both use the same underlying
376 technique, but the pda program offers much finer control over the
377 pitch track specific processing parameters.
378 
379 </para>
380 
<para>Energy, Linear Prediction and Cepstral coefficients, with a 10ms frame shift
during analysis but a 5ms frame shift in the output file:
</para>
383 
384 <para>
385 <screen>
386 $ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -S 0.005
387  -shift 0.01
388 </screen>
389 </para>
390 
<para>Delta and acc coefficients can be calculated even if their base form is not
required. This produces normal energy coefficients and cepstral delta coefficients:
</para>
393 
394 <para>
395 <screen>
396 $ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "energy" -delta "cep" -otype est
397 </screen>
398 </para>
399 
<para>Mel-scaled cepstra, Delta and acc coefficients, as is common in speech
recognition:
</para>
<para>
<screen>
$ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "melcep" -delta "melcep" -acc "melcep" -otype est -preemph 0.96
</screen>
</para>
406 
407 */
408 //@{
409 //@}
410 
411 
412 
413 //@}