PocketSphinx  0.6
ngram_search_fwdflat.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2008 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
42 /* System headers. */
43 #include <string.h>
44 #include <assert.h>
45 
46 /* SphinxBase headers. */
47 #include <sphinxbase/ckd_alloc.h>
48 #include <sphinxbase/listelem_alloc.h>
49 #include <sphinxbase/err.h>
50 
51 /* Local headers. */
52 #include "ngram_search.h"
53 #include "ps_lattice_internal.h"
54 
55 /* Turn this on to dump channels for debugging */
56 #define __CHAN_DUMP__ 0
57 #if __CHAN_DUMP__
58 #define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr)
59 #else
60 #define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm)
61 #endif
62 
63 static void
64 ngram_fwdflat_expand_all(ngram_search_t *ngs)
65 {
66  int n_words, i;
67 
68  /* For all "real words" (not fillers or <s>/</s>) in the dictionary,
69  *
70  * 1) Add the ones which are in the LM to the fwdflat wordlist
71  * 2) And to the expansion list (since we are expanding all)
72  */
73  ngs->n_expand_words = 0;
74  n_words = ps_search_n_words(ngs);
75  bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
76  for (i = 0; i < n_words; ++i) {
77  if (!ngram_model_set_known_wid(ngs->lmset,
78  dict_basewid(ps_search_dict(ngs),i)))
79  continue;
80  ngs->fwdflat_wordlist[ngs->n_expand_words] = i;
81  ngs->expand_word_list[ngs->n_expand_words] = i;
82  bitvec_set(ngs->expand_word_flag, i);
83  ngs->n_expand_words++;
84  }
85  E_INFO("Utterance vocabulary contains %d words\n", ngs->n_expand_words);
86  ngs->expand_word_list[ngs->n_expand_words] = -1;
87  ngs->fwdflat_wordlist[ngs->n_expand_words] = -1;
88 }
89 
90 static void
91 ngram_fwdflat_allocate_1ph(ngram_search_t *ngs)
92 {
93  dict_t *dict = ps_search_dict(ngs);
94  int n_words = ps_search_n_words(ngs);
95  int i, w;
96 
97  /* Allocate single-phone words, since they won't have
98  * been allocated for us by fwdtree initialization. */
99  ngs->n_1ph_words = 0;
100  for (w = 0; w < n_words; w++) {
101  if (dict_is_single_phone(dict, w))
102  ++ngs->n_1ph_words;
103  }
104  ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words,
105  sizeof(*ngs->single_phone_wid));
106  ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph));
107  i = 0;
108  for (w = 0; w < n_words; w++) {
109  if (!dict_is_single_phone(dict, w))
110  continue;
111 
112  /* DICT2PID location */
113  ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w);
114  ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef);
115  hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE,
116  /* ssid */ bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef,
117  ngs->rhmm_1ph[i].ciphone),
118  /* tmatid */ bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef,
119  ngs->rhmm_1ph[i].ciphone));
120  ngs->rhmm_1ph[i].next = NULL;
121  ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]);
122  ngs->single_phone_wid[i] = w;
123  i++;
124  }
125 }
126 
127 static void
128 ngram_fwdflat_free_1ph(ngram_search_t *ngs)
129 {
130  int i, w;
131  int n_words = ps_search_n_words(ngs);
132 
133  for (i = w = 0; w < n_words; ++w) {
134  if (!dict_is_single_phone(ps_search_dict(ngs), w))
135  continue;
136  hmm_deinit(&ngs->rhmm_1ph[i].hmm);
137  ++i;
138  }
139  ckd_free(ngs->rhmm_1ph);
140  ngs->rhmm_1ph = NULL;
141  ckd_free(ngs->single_phone_wid);
142 }
143 
144 void
146 {
147  int n_words;
148 
149  n_words = ps_search_n_words(ngs);
150  ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
151  ngs->expand_word_flag = bitvec_alloc(n_words);
152  ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
153  ngs->frm_wordlist = ckd_calloc(ngs->n_frame_alloc, sizeof(*ngs->frm_wordlist));
154  ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatefwid");
155  ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatsfwin");
156  E_INFO("fwdflat: min_ef_width = %d, max_sf_win = %d\n",
157  ngs->min_ef_width, ngs->max_sf_win);
158 
159  /* No tree-search; pre-build the expansion list, including all LM words. */
160  if (!ngs->fwdtree) {
161  /* Build full expansion list from LM words. */
162  ngram_fwdflat_expand_all(ngs);
163  /* Allocate single phone words. */
164  ngram_fwdflat_allocate_1ph(ngs);
165  }
166 }
167 
168 void
170 {
171  double n_speech = (double)ngs->n_tot_frame
172  / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
173 
174  E_INFO("TOTAL fwdflat %.2f CPU %.3f xRT\n",
175  ngs->fwdflat_perf.t_tot_cpu,
176  ngs->fwdflat_perf.t_tot_cpu / n_speech);
177  E_INFO("TOTAL fwdflat %.2f wall %.3f xRT\n",
178  ngs->fwdflat_perf.t_tot_elapsed,
179  ngs->fwdflat_perf.t_tot_elapsed / n_speech);
180 
181  /* Free single-phone words if we allocated them. */
182  if (!ngs->fwdtree) {
183  ngram_fwdflat_free_1ph(ngs);
184  }
185  ckd_free(ngs->fwdflat_wordlist);
186  bitvec_free(ngs->expand_word_flag);
187  ckd_free(ngs->expand_word_list);
188  ckd_free(ngs->frm_wordlist);
189 }
190 
191 int
193 {
194  /* Reallocate things that depend on the number of words. */
195  int n_words;
196 
197  ckd_free(ngs->fwdflat_wordlist);
198  ckd_free(ngs->expand_word_list);
199  bitvec_free(ngs->expand_word_flag);
200  n_words = ps_search_n_words(ngs);
201  ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
202  ngs->expand_word_flag = bitvec_alloc(n_words);
203  ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
204 
205  /* No tree-search; take care of the expansion list and single phone words. */
206  if (!ngs->fwdtree) {
207  /* Free single-phone words. */
208  ngram_fwdflat_free_1ph(ngs);
209  /* Reallocate word_chan. */
210  ckd_free(ngs->word_chan);
211  ngs->word_chan = ckd_calloc(dict_size(ps_search_dict(ngs)),
212  sizeof(*ngs->word_chan));
213  /* Rebuild full expansion list from LM words. */
214  ngram_fwdflat_expand_all(ngs);
215  /* Allocate single phone words. */
216  ngram_fwdflat_allocate_1ph(ngs);
217  }
218  /* Otherwise there is nothing to do since the wordlist is
219  * generated anew every utterance. */
220  return 0;
221 }
222 
226 static void
227 build_fwdflat_wordlist(ngram_search_t *ngs)
228 {
229  int32 i, f, sf, ef, wid, nwd;
230  bptbl_t *bp;
231  ps_latnode_t *node, *prevnode, *nextnode;
232 
233  /* No tree-search, use statically allocated wordlist. */
234  if (!ngs->fwdtree)
235  return;
236 
237  memset(ngs->frm_wordlist, 0, ngs->n_frame_alloc * sizeof(*ngs->frm_wordlist));
238 
239  /* Scan the backpointer table for all active words and record
240  * their exit frames. */
241  for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) {
242  sf = (bp->bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1;
243  ef = bp->frame;
244  wid = bp->wid;
245 
246  /* Anything that can be transitioned to in the LM can go in
247  * the word list. */
248  if (!ngram_model_set_known_wid(ngs->lmset,
249  dict_basewid(ps_search_dict(ngs), wid)))
250  continue;
251 
252  /* Look for it in the wordlist. */
253  for (node = ngs->frm_wordlist[sf]; node && (node->wid != wid);
254  node = node->next);
255 
256  /* Update last end frame. */
257  if (node)
258  node->lef = ef;
259  else {
260  /* New node; link to head of list */
261  node = listelem_malloc(ngs->latnode_alloc);
262  node->wid = wid;
263  node->fef = node->lef = ef;
264 
265  node->next = ngs->frm_wordlist[sf];
266  ngs->frm_wordlist[sf] = node;
267  }
268  }
269 
270  /* Eliminate "unlikely" words, for which there are too few end points */
271  for (f = 0; f < ngs->n_frame; f++) {
272  prevnode = NULL;
273  for (node = ngs->frm_wordlist[f]; node; node = nextnode) {
274  nextnode = node->next;
275  /* Word has too few endpoints */
276  if ((node->lef - node->fef < ngs->min_ef_width) ||
277  /* Word is </s> and doesn't actually end in last frame */
278  ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->n_frame - 1))) {
279  if (!prevnode)
280  ngs->frm_wordlist[f] = nextnode;
281  else
282  prevnode->next = nextnode;
283  listelem_free(ngs->latnode_alloc, node);
284  }
285  else
286  prevnode = node;
287  }
288  }
289 
290  /* Form overall wordlist for 2nd pass */
291  nwd = 0;
292  bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
293  for (f = 0; f < ngs->n_frame; f++) {
294  for (node = ngs->frm_wordlist[f]; node; node = node->next) {
295  if (!bitvec_is_set(ngs->word_active, node->wid)) {
296  bitvec_set(ngs->word_active, node->wid);
297  ngs->fwdflat_wordlist[nwd++] = node->wid;
298  }
299  }
300  }
301  ngs->fwdflat_wordlist[nwd] = -1;
302  E_INFO("Utterance vocabulary contains %d words\n", nwd);
303 }
304 
308 static void
309 build_fwdflat_chan(ngram_search_t *ngs)
310 {
311  int32 i, wid, p;
312  root_chan_t *rhmm;
313  chan_t *hmm, *prevhmm;
314  dict_t *dict;
315  dict2pid_t *d2p;
316 
317  dict = ps_search_dict(ngs);
318  d2p = ps_search_dict2pid(ngs);
319 
320  /* Build word HMMs for each word in the lattice. */
321  for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
322  wid = ngs->fwdflat_wordlist[i];
323 
324  /* Single-phone words are permanently allocated */
325  if (dict_is_single_phone(dict, wid))
326  continue;
327 
328  assert(ngs->word_chan[wid] == NULL);
329 
330  /* Multiplex root HMM for first phone (one root per word, flat
331  * lexicon). diphone is irrelevant here, for the time being,
332  * at least. */
333  rhmm = listelem_malloc(ngs->root_chan_alloc);
334  rhmm->ci2phone = dict_second_phone(dict, wid);
335  rhmm->ciphone = dict_first_phone(dict, wid);
336  rhmm->next = NULL;
337  hmm_init(ngs->hmmctx, &rhmm->hmm, TRUE,
338  bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->ciphone),
339  bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->ciphone));
340 
341  /* HMMs for word-internal phones */
342  prevhmm = NULL;
343  for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) {
344  hmm = listelem_malloc(ngs->chan_alloc);
345  hmm->ciphone = dict_pron(dict, wid, p);
346  hmm->info.rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1;
347  hmm->next = NULL;
348  hmm_init(ngs->hmmctx, &hmm->hmm, FALSE,
349  dict2pid_internal(d2p,wid,p),
350  bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->ciphone));
351 
352  if (prevhmm)
353  prevhmm->next = hmm;
354  else
355  rhmm->next = hmm;
356 
357  prevhmm = hmm;
358  }
359 
360  /* Right-context phones */
361  ngram_search_alloc_all_rc(ngs, wid);
362 
363  /* Link in just allocated right-context phones */
364  if (prevhmm)
365  prevhmm->next = ngs->word_chan[wid];
366  else
367  rhmm->next = ngs->word_chan[wid];
368  ngs->word_chan[wid] = (chan_t *) rhmm;
369  }
370 
371 }
372 
373 void
375 {
376  root_chan_t *rhmm;
377  int i;
378 
379  ptmr_reset(&ngs->fwdflat_perf);
380  ptmr_start(&ngs->fwdflat_perf);
381  build_fwdflat_wordlist(ngs);
382  build_fwdflat_chan(ngs);
383 
384  ngs->bpidx = 0;
385  ngs->bss_head = 0;
386 
387  for (i = 0; i < ps_search_n_words(ngs); i++)
388  ngs->word_lat_idx[i] = NO_BP;
389 
390  /* Reset the permanently allocated single-phone words, since they
391  * may have junk left over in them from previous searches. */
392  for (i = 0; i < ngs->n_1ph_words; i++) {
393  int32 w = ngs->single_phone_wid[i];
394  rhmm = (root_chan_t *) ngs->word_chan[w];
395  hmm_clear(&rhmm->hmm);
396  }
397 
398  /* Start search with <s>; word_chan[<s>] is permanently allocated */
399  rhmm = (root_chan_t *) ngs->word_chan[ps_search_start_wid(ngs)];
400  hmm_enter(&rhmm->hmm, 0, NO_BP, 0);
401  ngs->active_word_list[0][0] = ps_search_start_wid(ngs);
402  ngs->n_active_word[0] = 1;
403 
404  ngs->best_score = 0;
405  ngs->renormalized = FALSE;
406 
407  for (i = 0; i < ps_search_n_words(ngs); i++)
408  ngs->last_ltrans[i].sf = -1;
409 
410  if (!ngs->fwdtree)
411  ngs->n_frame = 0;
412 
413  ngs->st.n_fwdflat_chan = 0;
414  ngs->st.n_fwdflat_words = 0;
415  ngs->st.n_fwdflat_word_transition = 0;
416  ngs->st.n_senone_active_utt = 0;
417 }
418 
419 static void
420 compute_fwdflat_sen_active(ngram_search_t *ngs, int frame_idx)
421 {
422  int32 i, w;
423  int32 *awl;
424  root_chan_t *rhmm;
425  chan_t *hmm;
426 
427  acmod_clear_active(ps_search_acmod(ngs));
428 
429  i = ngs->n_active_word[frame_idx & 0x1];
430  awl = ngs->active_word_list[frame_idx & 0x1];
431 
432  for (w = *(awl++); i > 0; --i, w = *(awl++)) {
433  rhmm = (root_chan_t *)ngs->word_chan[w];
434  if (hmm_frame(&rhmm->hmm) == frame_idx) {
435  acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm);
436  }
437 
438  for (hmm = rhmm->next; hmm; hmm = hmm->next) {
439  if (hmm_frame(&hmm->hmm) == frame_idx) {
440  acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm);
441  }
442  }
443  }
444 }
445 
446 static void
447 fwdflat_eval_chan(ngram_search_t *ngs, int frame_idx)
448 {
449  int32 i, w, bestscore;
450  int32 *awl;
451  root_chan_t *rhmm;
452  chan_t *hmm;
453 
454  i = ngs->n_active_word[frame_idx & 0x1];
455  awl = ngs->active_word_list[frame_idx & 0x1];
456  bestscore = WORST_SCORE;
457 
458  ngs->st.n_fwdflat_words += i;
459 
460  /* Scan all active words. */
461  for (w = *(awl++); i > 0; --i, w = *(awl++)) {
462  rhmm = (root_chan_t *) ngs->word_chan[w];
463  if (hmm_frame(&rhmm->hmm) == frame_idx) {
464  int32 score = chan_v_eval(rhmm);
465  if ((score BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs)))
466  bestscore = score;
467  ngs->st.n_fwdflat_chan++;
468  }
469 
470  for (hmm = rhmm->next; hmm; hmm = hmm->next) {
471  if (hmm_frame(&hmm->hmm) == frame_idx) {
472  int32 score = chan_v_eval(hmm);
473  if (score BETTER_THAN bestscore)
474  bestscore = score;
475  ngs->st.n_fwdflat_chan++;
476  }
477  }
478  }
479 
480  ngs->best_score = bestscore;
481 }
482 
483 static void
484 fwdflat_prune_chan(ngram_search_t *ngs, int frame_idx)
485 {
486  int32 i, cf, nf, w, pip, newscore, thresh, wordthresh;
487  int32 *awl;
488  root_chan_t *rhmm;
489  chan_t *hmm, *nexthmm;
490 
491  cf = frame_idx;
492  nf = cf + 1;
493  i = ngs->n_active_word[cf & 0x1];
494  awl = ngs->active_word_list[cf & 0x1];
495  bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
496 
497  thresh = ngs->best_score + ngs->fwdflatbeam;
498  wordthresh = ngs->best_score + ngs->fwdflatwbeam;
499  pip = ngs->pip;
500  E_DEBUG(3,("frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh));
501 
502  /* Scan all active words. */
503  for (w = *(awl++); i > 0; --i, w = *(awl++)) {
504  rhmm = (root_chan_t *) ngs->word_chan[w];
505  /* Propagate active root channels */
506  if (hmm_frame(&rhmm->hmm) == cf
507  && hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) {
508  hmm_frame(&rhmm->hmm) = nf;
509  bitvec_set(ngs->word_active, w);
510 
511  /* Transitions out of root channel */
512  newscore = hmm_out_score(&rhmm->hmm);
513  if (rhmm->next) {
514  assert(!dict_is_single_phone(ps_search_dict(ngs), w));
515 
516  newscore += pip;
517  if (newscore BETTER_THAN thresh) {
518  hmm = rhmm->next;
519  /* Enter all right context phones */
520  if (hmm->info.rc_id >= 0) {
521  for (; hmm; hmm = hmm->next) {
522  if ((hmm_frame(&hmm->hmm) < cf)
523  || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
524  hmm_enter(&hmm->hmm, newscore,
525  hmm_out_history(&rhmm->hmm), nf);
526  }
527  }
528  }
529  /* Just a normal word internal phone */
530  else {
531  if ((hmm_frame(&hmm->hmm) < cf)
532  || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
533  hmm_enter(&hmm->hmm, newscore,
534  hmm_out_history(&rhmm->hmm), nf);
535  }
536  }
537  }
538  }
539  else {
540  assert(dict_is_single_phone(ps_search_dict(ngs), w));
541 
542  /* Word exit for single-phone words (where did their
543  * whmms come from?) (either from
544  * ngram_search_fwdtree, or from
545  * ngram_fwdflat_allocate_1ph(), that's where) */
546  if (newscore BETTER_THAN wordthresh) {
547  ngram_search_save_bp(ngs, cf, w, newscore,
548  hmm_out_history(&rhmm->hmm), 0);
549  }
550  }
551  }
552 
553  /* Transitions out of non-root channels. */
554  for (hmm = rhmm->next; hmm; hmm = hmm->next) {
555  if (hmm_frame(&hmm->hmm) >= cf) {
556  /* Propagate forward HMMs inside the beam. */
557  if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) {
558  hmm_frame(&hmm->hmm) = nf;
559  bitvec_set(ngs->word_active, w);
560 
561  newscore = hmm_out_score(&hmm->hmm);
562  /* Word-internal phones */
563  if (hmm->info.rc_id < 0) {
564  newscore += pip;
565  if (newscore BETTER_THAN thresh) {
566  nexthmm = hmm->next;
567  /* Enter all right-context phones. */
568  if (nexthmm->info.rc_id >= 0) {
569  for (; nexthmm; nexthmm = nexthmm->next) {
570  if ((hmm_frame(&nexthmm->hmm) < cf)
571  || (newscore BETTER_THAN
572  hmm_in_score(&nexthmm->hmm))) {
573  hmm_enter(&nexthmm->hmm,
574  newscore,
575  hmm_out_history(&hmm->hmm),
576  nf);
577  }
578  }
579  }
580  /* Enter single word-internal phone. */
581  else {
582  if ((hmm_frame(&nexthmm->hmm) < cf)
583  || (newscore BETTER_THAN
584  hmm_in_score(&nexthmm->hmm))) {
585  hmm_enter(&nexthmm->hmm, newscore,
586  hmm_out_history(&hmm->hmm), nf);
587  }
588  }
589  }
590  }
591  /* Right-context phones - apply word beam and exit. */
592  else {
593  if (newscore BETTER_THAN wordthresh) {
594  ngram_search_save_bp(ngs, cf, w, newscore,
595  hmm_out_history(&hmm->hmm),
596  hmm->info.rc_id);
597  }
598  }
599  }
600  /* Zero out inactive HMMs. */
601  else if (hmm_frame(&hmm->hmm) != nf) {
602  hmm_clear_scores(&hmm->hmm);
603  }
604  }
605  }
606  }
607 }
608 
609 static void
610 get_expand_wordlist(ngram_search_t *ngs, int32 frm, int32 win)
611 {
612  int32 f, sf, ef;
613  ps_latnode_t *node;
614 
615  if (!ngs->fwdtree) {
616  ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
617  return;
618  }
619 
620  sf = frm - win;
621  if (sf < 0)
622  sf = 0;
623  ef = frm + win;
624  if (ef > ngs->n_frame)
625  ef = ngs->n_frame;
626 
627  bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
628  ngs->n_expand_words = 0;
629 
630  for (f = sf; f < ef; f++) {
631  for (node = ngs->frm_wordlist[f]; node; node = node->next) {
632  if (!bitvec_is_set(ngs->expand_word_flag, node->wid)) {
633  ngs->expand_word_list[ngs->n_expand_words++] = node->wid;
634  bitvec_set(ngs->expand_word_flag, node->wid);
635  }
636  }
637  }
638  ngs->expand_word_list[ngs->n_expand_words] = -1;
639  ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
640 }
641 
642 static void
643 fwdflat_word_transition(ngram_search_t *ngs, int frame_idx)
644 {
645  int32 cf, nf, b, thresh, pip, i, w, newscore;
646  int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */
647  bptbl_t *bp;
648  int32 *rcss;
649  root_chan_t *rhmm;
650  int32 *awl;
651  float32 lwf;
652  dict_t *dict = ps_search_dict(ngs);
653  dict2pid_t *d2p = ps_search_dict2pid(ngs);
654 
655  cf = frame_idx;
656  nf = cf + 1;
657  thresh = ngs->best_score + ngs->fwdflatbeam;
658  pip = ngs->pip;
659  best_silrc_score = WORST_SCORE;
660  lwf = ngs->fwdflat_fwdtree_lw_ratio;
661 
662  /* Search for all words starting within a window of this frame.
663  * These are the successors for words exiting now. */
664  get_expand_wordlist(ngs, cf, ngs->max_sf_win);
665 
666  /* Scan words exited in current frame */
667  for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) {
668  xwdssid_t *rssid;
669  int32 silscore;
670 
671  bp = ngs->bp_table + b;
672  ngs->word_lat_idx[bp->wid] = NO_BP;
673 
674  if (bp->wid == ps_search_finish_wid(ngs))
675  continue;
676 
677  /* DICT2PID location */
678  /* Get the mapping from right context phone ID to index in the
679  * right context table and the bscore_stack. */
680  rcss = ngs->bscore_stack + bp->s_idx;
681  if (bp->last2_phone == -1)
682  rssid = NULL;
683  else
684  rssid = dict2pid_rssid(d2p, bp->last_phone, bp->last2_phone);
685 
686  /* Transition to all successor words. */
687  for (i = 0; ngs->expand_word_list[i] >= 0; i++) {
688  int32 n_used;
689 
690  w = ngs->expand_word_list[i];
691 
692  /* Get the exit score we recorded in save_bwd_ptr(), or
693  * something approximating it. */
694  if (rssid)
695  newscore = rcss[rssid->cimap[dict_first_phone(dict, w)]];
696  else
697  newscore = bp->score;
698  if (newscore == WORST_SCORE)
699  continue;
700  /* FIXME: Floating point... */
701  newscore += lwf
702  * (ngram_tg_score(ngs->lmset,
703  dict_basewid(dict, w),
704  bp->real_wid,
705  bp->prev_real_wid,
706  &n_used) >> SENSCR_SHIFT);
707  newscore += pip;
708 
709  /* Enter the next word */
710  if (newscore BETTER_THAN thresh) {
711  rhmm = (root_chan_t *) ngs->word_chan[w];
712  if ((hmm_frame(&rhmm->hmm) < cf)
713  || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
714  hmm_enter(&rhmm->hmm, newscore, b, nf);
715  /* DICT2PID: This is where mpx ssids get introduced. */
716  /* Look up the ssid to use when entering this mpx triphone. */
717  hmm_mpx_ssid(&rhmm->hmm, 0) =
718  dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone,
719  dict_last_phone(dict, bp->wid));
720  assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->hmm, 0)));
721  E_DEBUG(6,("ssid %d(%d,%d) = %d\n",
722  rhmm->ciphone, dict_last_phone(dict, bp->wid), rhmm->ci2phone,
723  hmm_mpx_ssid(&rhmm->hmm, 0)));
724  bitvec_set(ngs->word_active, w);
725  }
726  }
727  }
728 
729  /* Get the best exit into silence. */
730  if (rssid)
731  silscore = rcss[rssid->cimap[ps_search_acmod(ngs)->mdef->sil]];
732  else
733  silscore = bp->score;
734  if (silscore BETTER_THAN best_silrc_score) {
735  best_silrc_score = silscore;
736  best_silrc_bp = b;
737  }
738  }
739 
740  /* Transition to <sil> */
741  newscore = best_silrc_score + ngs->silpen + pip;
742  if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
743  w = ps_search_silence_wid(ngs);
744  rhmm = (root_chan_t *) ngs->word_chan[w];
745  if ((hmm_frame(&rhmm->hmm) < cf)
746  || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
747  hmm_enter(&rhmm->hmm, newscore,
748  best_silrc_bp, nf);
749  bitvec_set(ngs->word_active, w);
750  }
751  }
752  /* Transition to noise words */
753  newscore = best_silrc_score + ngs->fillpen + pip;
754  if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
755  for (w = ps_search_silence_wid(ngs) + 1; w < ps_search_n_words(ngs); w++) {
756  rhmm = (root_chan_t *) ngs->word_chan[w];
757  /* Noise words that aren't a single phone will have NULL here. */
758  if (rhmm == NULL)
759  continue;
760  if ((hmm_frame(&rhmm->hmm) < cf)
761  || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
762  hmm_enter(&rhmm->hmm, newscore,
763  best_silrc_bp, nf);
764  bitvec_set(ngs->word_active, w);
765  }
766  }
767  }
768 
769  /* Reset initial channels of words that have become inactive even after word trans. */
770  i = ngs->n_active_word[cf & 0x1];
771  awl = ngs->active_word_list[cf & 0x1];
772  for (w = *(awl++); i > 0; --i, w = *(awl++)) {
773  rhmm = (root_chan_t *) ngs->word_chan[w];
774  if (hmm_frame(&rhmm->hmm) == cf) {
775  hmm_clear_scores(&rhmm->hmm);
776  }
777  }
778 }
779 
780 static void
781 fwdflat_renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm)
782 {
783  root_chan_t *rhmm;
784  chan_t *hmm;
785  int32 i, cf, w, *awl;
786 
787  cf = frame_idx;
788 
789  /* Renormalize individual word channels */
790  i = ngs->n_active_word[cf & 0x1];
791  awl = ngs->active_word_list[cf & 0x1];
792  for (w = *(awl++); i > 0; --i, w = *(awl++)) {
793  rhmm = (root_chan_t *) ngs->word_chan[w];
794  if (hmm_frame(&rhmm->hmm) == cf) {
795  hmm_normalize(&rhmm->hmm, norm);
796  }
797  for (hmm = rhmm->next; hmm; hmm = hmm->next) {
798  if (hmm_frame(&hmm->hmm) == cf) {
799  hmm_normalize(&hmm->hmm, norm);
800  }
801  }
802  }
803 
804  ngs->renormalized = TRUE;
805 }
806 
807 int
809 {
810  int16 const *senscr;
811  int32 nf, i, j;
812  int32 *nawl;
813 
814  /* Activate our HMMs for the current frame if need be. */
815  if (!ps_search_acmod(ngs)->compallsen)
816  compute_fwdflat_sen_active(ngs, frame_idx);
817 
818  /* Compute GMM scores for the current frame. */
819  senscr = acmod_score(ps_search_acmod(ngs), &frame_idx);
820  ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active;
821 
822  /* Mark backpointer table for current frame. */
823  ngram_search_mark_bptable(ngs, frame_idx);
824 
825  /* If the best score is equal to or worse than WORST_SCORE,
826  * recognition has failed, don't bother to keep trying. */
828  return 0;
829  /* Renormalize if necessary */
830  if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) {
831  E_INFO("Renormalizing Scores at frame %d, best score %d\n",
832  frame_idx, ngs->best_score);
833  fwdflat_renormalize_scores(ngs, frame_idx, ngs->best_score);
834  }
835 
836  ngs->best_score = WORST_SCORE;
837  hmm_context_set_senscore(ngs->hmmctx, senscr);
838 
839  /* Evaluate HMMs */
840  fwdflat_eval_chan(ngs, frame_idx);
841  /* Prune HMMs and do phone transitions. */
842  fwdflat_prune_chan(ngs, frame_idx);
843  /* Do word transitions. */
844  fwdflat_word_transition(ngs, frame_idx);
845 
846  /* Create next active word list */
847  nf = frame_idx + 1;
848  nawl = ngs->active_word_list[nf & 0x1];
849  for (i = 0, j = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
850  if (bitvec_is_set(ngs->word_active, ngs->fwdflat_wordlist[i])) {
851  *(nawl++) = ngs->fwdflat_wordlist[i];
852  j++;
853  }
854  }
855  for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) {
856  if (bitvec_is_set(ngs->word_active, i)) {
857  *(nawl++) = i;
858  j++;
859  }
860  }
861  if (!ngs->fwdtree)
862  ++ngs->n_frame;
863  ngs->n_active_word[nf & 0x1] = j;
864 
865  /* Return the number of frames processed. */
866  return 1;
867 }
868 
872 static void
873 destroy_fwdflat_wordlist(ngram_search_t *ngs)
874 {
875  ps_latnode_t *node, *tnode;
876  int32 f;
877 
878  if (!ngs->fwdtree)
879  return;
880 
881  for (f = 0; f < ngs->n_frame; f++) {
882  for (node = ngs->frm_wordlist[f]; node; node = tnode) {
883  tnode = node->next;
884  listelem_free(ngs->latnode_alloc, node);
885  }
886  }
887 }
888 
892 static void
893 destroy_fwdflat_chan(ngram_search_t *ngs)
894 {
895  int32 i, wid;
896 
897  for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
898  root_chan_t *rhmm;
899  chan_t *thmm;
900  wid = ngs->fwdflat_wordlist[i];
901  if (dict_is_single_phone(ps_search_dict(ngs),wid))
902  continue;
903  assert(ngs->word_chan[wid] != NULL);
904 
905  /* The first HMM in ngs->word_chan[wid] was allocated with
906  * ngs->root_chan_alloc, but this will attempt to free it
907  * using ngs->chan_alloc, which will not work. Therefore we
908  * free it manually and move the list forward before handing
909  * it off. */
910  rhmm = (root_chan_t *)ngs->word_chan[wid];
911  thmm = rhmm->next;
912  listelem_free(ngs->root_chan_alloc, rhmm);
913  ngs->word_chan[wid] = thmm;
914  ngram_search_free_all_rc(ngs, wid);
915  }
916 }
917 
918 void
920 {
921  int32 cf;
922 
923  destroy_fwdflat_chan(ngs);
924  destroy_fwdflat_wordlist(ngs);
925  bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
926 
927  /* This is the number of frames processed. */
928  cf = ps_search_acmod(ngs)->output_frame;
929  /* Add a mark in the backpointer table for one past the final frame. */
930  ngram_search_mark_bptable(ngs, cf);
931 
932  ptmr_stop(&ngs->fwdflat_perf);
933  /* Print out some statistics. */
934  if (cf > 0) {
935  double n_speech = (double)(cf + 1)
936  / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
937  E_INFO("%8d words recognized (%d/fr)\n",
938  ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1));
939  E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt,
940  (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1));
941  E_INFO("%8d channels searched (%d/fr)\n",
942  ngs->st.n_fwdflat_chan, ngs->st.n_fwdflat_chan / (cf + 1));
943  E_INFO("%8d words searched (%d/fr)\n",
944  ngs->st.n_fwdflat_words, ngs->st.n_fwdflat_words / (cf + 1));
945  E_INFO("%8d word transitions (%d/fr)\n",
946  ngs->st.n_fwdflat_word_transition,
947  ngs->st.n_fwdflat_word_transition / (cf + 1));
948  E_INFO("fwdflat %.2f CPU %.3f xRT\n",
949  ngs->fwdflat_perf.t_cpu,
950  ngs->fwdflat_perf.t_cpu / n_speech);
951  E_INFO("fwdflat %.2f wall %.3f xRT\n",
952  ngs->fwdflat_perf.t_elapsed,
953  ngs->fwdflat_perf.t_elapsed / n_speech);
954  }
955 }
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:65
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
Definition: ngram_search.h:307
int32 wid
Word index.
Definition: ngram_search.h:113
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:616
void hmm_init(hmm_context_t *ctx, hmm_t *hmm, int mpx, int ssid, int tmatid)
Populate a previously-allocated HMM structure, allocating internal data.
Definition: hmm.c:89
int32 lef
Last end frame.
chan_t * next
first descendant of this channel
Definition: ngram_search.h:94
listelem_alloc_t * chan_alloc
For chan_t.
Definition: ngram_search.h:211
frame_idx_t frame
start or end frame
Definition: ngram_search.h:110
hmm_context_t * hmmctx
HMM context.
Definition: ngram_search.h:200
void hmm_deinit(hmm_t *hmm)
Free an HMM structure, releasing internal data (but not the HMM structure itself).
Definition: hmm.c:111
int16 last2_phone
next-to-last phone of this word
Definition: ngram_search.h:120
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
Definition: acmod.c:1191
int ngram_fwdflat_reinit(ngram_search_t *ngs)
Rebuild search structures for updated language models.
Word graph search implementation.
bitvec_t * word_active
array of active flags for all words.
Definition: ngram_search.h:247
void ngram_fwdflat_finish(ngram_search_t *ngs)
Finish fwdflat decoding for an utterance.
int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx)
Search one frame forward in an utterance.
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone ...
Definition: ngram_search.h:100
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
Definition: ngram_search.h:287
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
Definition: ngram_search.h:68
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
Definition: ngram_search.c:398
int32 * single_phone_wid
list of single-phone word ids
Definition: ngram_search.h:264
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame&#39;s index in the backpointer table.
Definition: ngram_search.c:344
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
Definition: ngram_search.h:102
int32 bp
Back Pointer.
Definition: ngram_search.h:114
int32 n_active_word[2]
Number entries in active_word_list.
Definition: ngram_search.h:288
int32 rc_id
right-context id for last phone of words
Definition: ngram_search.h:79
#define dict2pid_rssid(d, ci, lc)
Access macros; not designed for arbitrary use.
Definition: dict2pid.h:179
void ngram_fwdflat_start(ngram_search_t *ngs)
Start fwdflat decoding for an utterance.
N-Gram search module structure.
Definition: ngram_search.h:197
void hmm_normalize(hmm_t *h, int32 bestscr)
Renormalize the scores in this HMM based on the given best score.
Definition: hmm.c:209
int32 real_wid
wid of this or latest predecessor real word
Definition: ngram_search.h:117
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
Definition: ngram_search.h:236
int32 prev_real_wid
wid of second-last real word
Definition: ngram_search.h:118
#define WORST_SCORE
Large "bad" score.
Definition: hmm.h:88
N-Gram based multi-pass search ("FBS")
listelem_alloc_t * latnode_alloc
For latnode_t.
Definition: ngram_search.h:213
ps_latnode_t ** frm_wordlist
List of active words in each frame.
Definition: ngram_search.h:316
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
Definition: ngram_search.h:90
void hmm_enter(hmm_t *h, int32 score, int32 histid, int frame)
Enter an HMM with the given path score and history ID.
Definition: hmm.c:201
Lexical tree node data type.
Definition: ngram_search.h:64
hmm_t hmm
Basic HMM structure.
Definition: ngram_search.h:91
void acmod_clear_active(acmod_t *acmod)
Clear set of active senones.
Definition: acmod.c:1175
int32 wid
Dictionary word id.
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
Definition: hmm.h:231
#define SENSCR_SHIFT
Shift count for senone scores.
Definition: hmm.h:77
a structure for a dictionary.
Definition: dict.h:79
#define WORSE_THAN
Is one score worse than another?
Definition: hmm.h:104
s3ssid_t dict2pid_internal(dict2pid_t *d2p, int32 wid, int pos)
Return the senone sequence ID for the given word position.
Definition: dict2pid.c:367
int32 best_score
Best Viterbi path score.
Definition: ngram_search.h:325
void hmm_clear(hmm_t *h)
Reset the states of the HMM to the invalid condition.
Definition: hmm.c:183
Back pointer table (forward pass lattice; actually a tree)
Definition: ngram_search.h:109
cross word triphone model structure
Definition: dict2pid.h:137
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
Definition: ngram_search.c:665
#define BETTER_THAN
Is one score better than another?
Definition: hmm.h:99
void ngram_fwdflat_deinit(ngram_search_t *ngs)
Release memory associated with fwdflat decoding.
int32 s_idx
Start of BScoreStack for various right contexts.
Definition: ngram_search.h:116
int32 fef
First end frame.
int32 n_frame
Number of frames actually present.
Definition: ngram_search.h:308
ngram_model_t * lmset
Set of language models.
Definition: ngram_search.h:199
int32 n_1ph_words
Number single phone words in dict (total)
Definition: ngram_search.h:265
listelem_alloc_t * root_chan_alloc
For root_chan_t.
Definition: ngram_search.h:212
void hmm_clear_scores(hmm_t *h)
Reset the scores of the HMM.
Definition: hmm.c:170
int32 ciphone
ciphone for this node
Definition: ngram_search.h:73
void ngram_fwdflat_init(ngram_search_t *ngs)
Initialize N-Gram search for fwdflat decoding.
ngram_search_stats_t st
Various statistics for profiling.
Definition: ngram_search.h:335
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
Definition: ngram_search.h:246
int32 * fwdflat_wordlist
List of active word IDs for utterance.
Definition: ngram_search.h:317
struct ps_latnode_s * next
Next node in DAG (no ordering implied)
int32 score
Score (best among all right contexts)
Definition: ngram_search.h:115
#define dict_size(d)
Packaged macro access to dictionary members.
Definition: dict.h:154
s3cipid_t * cimap
Index into ssid[] above for each ci phone.
Definition: dict2pid.h:139
#define dict_pron(d, w, p)
The CI phones of the word w at position p.
Definition: dict.h:168
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:148
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
Definition: acmod.c:1088
int16 last_phone
last phone of this word
Definition: ngram_search.h:119