1/*---------------------------------------------------------------------------*
2 *  srec.h  *
3 *                                                                           *
4 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
5 *                                                                           *
6 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7 *  you may not use this file except in compliance with the License.         *
8 *                                                                           *
9 *  You may obtain a copy of the License at                                  *
10 *      http://www.apache.org/licenses/LICENSE-2.0                           *
11 *                                                                           *
12 *  Unless required by applicable law or agreed to in writing, software      *
13 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 *  See the License for the specific language governing permissions and      *
16 *  limitations under the License.                                           *
17 *                                                                           *
18 *---------------------------------------------------------------------------*/
19
20/* this file contains defines needed by the srec search component*/
21
22#ifndef _h_srec_
23#define _h_srec_
24
25#include "swimodel.h"
26#include "hmm_desc.h"
27#include "utteranc.h"
28#include "hmmlib.h"
29#include "srec_sizes.h"
30#include "search_network.h"
31#include "srec_context.h"
32#include "srec_eosd.h"
33#include "astar.h"
34
/* Upper bound on the number of HMM states in one allophone. */
#define MAX_HMM 3
/* NOTE(review): not referenced in this header; presumably a feature switch
   read by the search sources - confirm. */
#define DO_ALLOW_MULTIPLE_MODELS 1
37
/* In order to keep data sizes as small as possible, most of the structures
   below use indices into one fsmarc_token array and one word_token array.
   This makes the code a bit confusing (compared to just keeping pointers to
   these structures around) and uses a bit more CPU, but it saves memory and
   gives us more flexibility in the sizes of these data types. */
43
44/**
45 * @todo document
46 */
47typedef struct altword_token_t
48{
49  costdata costdelta;        /* cost relative to path being propagated */
50  wordID word;               /* alternative word, diff from path b.p. */
51  wtokenID word_backtrace;   /* alternative backtrace, diff from path b.p.*/
52  struct altword_token_t* next_token; /* todo: change this to indices */
53  asr_int16_t refcount;
54  costdata costbasis;        /* cost of best fsmarc_token host */
55}
56altword_token;
57#define AWTNULL 0
58/* fsmarc_tokens and fsmnode_tokens point to a batch of altword_tokens
59   to save memory, many fsmarc_tokens can point to the same altword_token
60   and these are propagated by reference */
61
62/**
63 * @todo document
64 */
65typedef struct fsmarc_token_t
66{
67  frameID num_hmm_states;           /* number of hmm states */
68  costdata cost[MAX_HMM];           /* cost so far*/
69  wtokenID word_backtrace[MAX_HMM]; /* index into word tokens*/
70  wordID word[MAX_HMM];             /* when the path encounters an output
71             symbol, store it here*/
72  frameID duration[MAX_HMM];        /* frames observed for this hmm state, todo: pack into char! */
73  arcID FSMarc_index;               /* index into the FSM arc array */
74
75  stokenID next_token_index;        /* for maintaining linked lists of these
76             tokens, both in search and in freelist */
77  altword_token* aword_backtrace[MAX_HMM];
78}
79fsmarc_token;
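/* 30 bytes -- NOTE(review): this figure may predate the aword_backtrace[]
   pointers; confirm with sizeof(fsmarc_token) */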
81
82
83/**
84 * These are used while maximizing into FSM nodes.
85 */
86typedef struct fsmnode_token_t
87{
88  costdata cost;
89  wtokenID word_backtrace;  /* index into word tokens*/
90  wordID word;              /* when the path encounters an output*/
91  nodeID FSMnode_index;
92  ftokenID next_token_index;
93  altword_token* aword_backtrace;
94  frameID silence_duration;
95}
96fsmnode_token;
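/* 10 bytes -- NOTE(review): this figure may predate aword_backtrace and
   silence_duration; confirm with sizeof(fsmnode_token) */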
98
99/**
100 * @todo document
101 */
102typedef struct word_token_t
103{
104  wordID word;                /* the word just observed */
105  frameID end_time;           /* end time of the word just observed, includes trailing silence */
106  nodeID end_node;            /* for backtrace with word graph */
107  wtokenID backtrace;         /* for backtrace */
108  costdata cost;              /* cost for path up to this point*/
109  wtokenID next_token_index;  /* for maintaining linked lists of these tokens
110       (both in the search and in the freelist) */
111  frameID _word_end_time;     /* end time of the word just observed, excl trailing silence */
112  /* since frameID is 16 bit, and 15bits is plenty
113     (ie 32767 frames * 20ms/frame = 655 sec), we use the high-bit to store
114	 whether this word_token represents a homonym, this is used in confidence
115	 score fixing! */
116#define WORD_TOKEN_GET_HOMONYM(wT)     (wT->_word_end_time & 0x8000)  // 10000000
117#define WORD_TOKEN_SET_HOMONYM(wT,hM)  (wT->_word_end_time = (wT->_word_end_time&0x7fff)|(hM?0x8000:0))
118#define WORD_TOKEN_GET_WD_ETIME(wT)    (wT->_word_end_time & 0x7fff) // 01111111
119#define WORD_TOKEN_SET_WD_ETIME(wT,eT) (wT->_word_end_time = (wT->_word_end_time&0x8000)|(eT))
120}
121word_token;
/* 12 bytes -- NOTE(review): this figure may predate _word_end_time;
   confirm with sizeof(word_token) */
123
124/**
125 * Contains what we need for later backtrace, nbest, etc.
126 */
127typedef struct
128{
129  /* there are various arrays below which frame number long - this is the number allocated */
130  frameID max_frames;
131
132  /* for each frame, head of a linked list of word tokens for that frame */
133  wtokenID *words_for_frame;
134  asr_int16_t *whether_sorted;
135
136}
137srec_word_lattice;
138
139/*This is just implemented as a list so far - use Johan's fancy implementation later*/
140
141/**
142 * @todo document
143 */
144typedef struct priority_q_t
145{
146  wtokenID word_token_list;  /* index of head token in queue - keep worst at end
147      (so we can pop one off) */
148  costdata max_cost_in_q;
149  miscdata num_in_q;
150  miscdata max_in_q;
151}
152priority_q;
153
154/*------------------------------------------------------------------*
155 *                                                                  *
156 *------------------------------------------------------------------*/
157
158/* notes ... what needs to be acoustic model specific
159
160   (p)ool it
161   (1) single  .r but reset
162   (x) specific
163
164   1 context
165   1 word_priority_q
166   x word_lattice
167   1 prune_delta
168   1 current_search_frame
169
170   1.r best_token_for_arc[]  max_fsm_arcs
171   1.r best_token_for_node[]   max_fsm_nodes
172   1 cost_offset_for_frame MAX_FRAMES
173   1 accumulated_cost_offset_for_frame MAX_FRAMES
174
175   x active_fsmarc_tokens
176   num_new_states   ... num in active_fsmarc_tokens
177   max_new_states   ... same as fsmarc_token_array_size
178
179   x active_fsm_node_tokens
180
181   ? current_model_scores num_model_slots_allocated
182
183   p fsmarc_token_array _size _freelist
184   p fsmnode_token_array  _size _freelist
185   x word_token_array _size _freelist
186   x word_token_array_flags
187
188   ... not used! best_fsmarc_token
189   srec_ended
190   astar_stack
191*/
192
193struct srec_t
194{  /*contains everything needed to run the search*/
195  asr_int16_t id;                   /*contains an id for this recognizer*/
196  srec_context *context;      /*contains the recognition context (fst, info about models, etc)*/
197  priority_q *word_priority_q; /*used to keep track of new word in frame*/
198  srec_word_lattice *word_lattice;  /*used to keep track of word lattice in utterance*/
199
200  costdata prune_delta;        /* controls the amount of score-based pruning - should this go in the context instead?*/
201  costdata current_prune_delta; /* when the above changes in mid-frame */
202  costdata current_best_cost;   /* 0 if single recog */
203
204  frameID current_search_frame;
205  stokenID *best_token_for_arc;  /* non-owning ptr, see multi_srec below */
206
207  stokenID active_fsmarc_tokens; /*head of list of state tokens for the next frame.  Used during
208        the search to keep track of new states for new frame.  This
209        is to allow us to efficently do things like prune, free state arrays, etc*/
210
211
212  nodeID num_new_states;
213  nodeID max_new_states;  /*the num allocated in the new_states array - if the search is exceeding this,
214         we need to tighten the pruning*/
215
216  ftokenID *best_token_for_node;   /* non-owning ptr, see multi_srec below */
217
218  ftokenID active_fsmnode_tokens;  /* linked list of all fsmnode token (same as ones in
219           best_state_for_node, just kept as a list)*/
220
221  costdata *current_model_scores;  /* temporary array used by the search to contain model scores -
222           size is max number of models*/
223  modelID num_model_slots_allocated;  /*num allocated in above array - search will only
224       work with models with less than this number of models*/
225
226  /*the following arrays handle all the state and word tokens.  All of them
227    are allocated to a fixed size at startup time, and the search uses elements
228    from the first array in the search.  The pruning of the search is used to
229    make sure that the allocated number is not exceeded*/
230
231
232  fsmarc_token *fsmarc_token_array;  /*used for storage of all state tokens
233           - allocated once at startup time and kept
234           around.  It's fixed size and the search
235           pruning must ensure that it is never
236           exceeded*/
237  stokenID fsmarc_token_array_size; /*total number of tokens allocated in this array*/
238  stokenID fsmarc_token_freelist;   /*index to head of state token freelist*/
239
240  fsmnode_token *fsmnode_token_array;  /*used for storage of all fsmnode tokens
241           - allocated once at startup time and kept
242           around.  It's fixed size and the search
243           pruning must ensure that it is never
244           exceeded*/
245  ftokenID fsmnode_token_array_size; /*total number of tokens allocated in this array*/
246  ftokenID fsmnode_token_freelist;   /*index to head of fsmnode token freelist*/
247
248  word_token *word_token_array;    /* used for storage of all word tokens -
249            allocated once at startup time and kept
250            around.  It's fixed size and the search
251            pruning must ensure that it is never
252            exceeded*/
253  asr_int16_t* word_token_array_flags;   /* bitarray used for flagging */
254  wtokenID word_token_array_size;  /* total number of tokens allocated in
255            this array*/
256  wtokenID word_token_freelist;    /* index to head of word token freelist*/
257
258  altword_token* altword_token_array; /* used to store alternative words before a wb */
259  wtokenID altword_token_array_size;
260  altword_token* altword_token_freelist;
261  wtokenID altword_token_freelist_len;
262
263  frameID max_frames;
264  costdata* best_model_cost_for_frame;
265  costdata* cost_offset_for_frame;        /* see multi_srec, below */
266  bigcostdata* accumulated_cost_offset;   /* see multi_srec, below */
267
268  stokenID best_fsmarc_token;      /* ?? index of best scoring state token
269           this is used to lookup wtokens on the
270           top choice path, to make sure they're not
271           pruned via reprune_word_tokens() */
272  costdata current_best_ftoken_cost[NODE_INFO_NUMS];
273  ftokenID current_best_ftoken_index[NODE_INFO_NUMS];
274
275  /*the following elements are to keep track of how big various arrays are*/
276  nodeID max_fsm_nodes;           /* see multi_srec below */
277  arcID max_fsm_arcs;             /* see multi_srec below */
278  asr_int16_t srec_ended;
279  AstarStack *astar_stack;        /* for backwards word search */
280  const featdata* avg_state_durations;  /* average state durations (from AMs) */
281
282  srec_eos_detector_state eosd_state;
283};
284
/* Recognizer limit: generally, 1x for each acoustic model. */
#define MAX_RECOGNIZERS 2
/* Dimension of the swimodel[] array in multi_srec. */
#define MAX_ACOUSTIC_MODELS 2
287
288/**
289 * @todo document
290 */
291typedef struct
292{
293  asr_int32_t num_allocated_recs;
294  asr_int32_t num_activated_recs;
295  srec* rec;                       /* size num_allocated_recs, one for
296            each gender */
297
298  frameID max_frames;
299  costdata* cost_offset_for_frame; /* size max_frames, keeps track of
300            current_best_costs bookkeeping from
301            reset_current_best_costs_to_zero() */
302  bigcostdata *accumulated_cost_offset; /* same as above but cumulative */
303
304
305  ftokenID *best_token_for_node;  /* array (size max_fsm_nodes) best path into
306           fsmnode - kept as an fsmnode_token */
307  nodeID max_fsm_nodes;
308  stokenID *best_token_for_arc;   /* array (size max_fsm_arcs) best path into
309           fsmarc - kept as a fsmarc_token */
310  arcID max_fsm_arcs;
311
312  /* non owning pointer to compact acoustic models */
313  asr_int32_t num_swimodels;
314  const SWIModel    *swimodel[MAX_ACOUSTIC_MODELS];
315  EOSrc eos_status;
316}
317multi_srec;
318
319#ifdef __cplusplus
320extern "C"
321{
322#endif
323  priority_q* allocate_priority_q(int max_n);
324  void free_priority_q(priority_q* pq);
325  void clear_priority_q(priority_q *pq);
326  wtokenID get_word_token_list(priority_q *pq, word_token *word_token_array);
327  wtokenID add_word_token_to_priority_q(priority_q *pq, wtokenID token_index_to_add, word_token *word_token_array);
328  void remove_non_end_word_from_q(srec *rec, priority_q *pq, word_token *word_token_array, nodeID end_node);
329  costdata get_priority_q_threshold(priority_q *pq, word_token *word_token_array);
330
331  void free_word_token(srec *rec, wtokenID old_token_index);
332  int srec_begin(srec* rec, int begin_syn_node);
333  void srec_no_more_frames(srec* rec);
334  bigcostdata accumulated_cost_offset(costdata *cost_offsets, frameID frame);
335  void multi_srec_get_speech_bounds(multi_srec* rec, frameID* start_frame, frameID* end_frame);
336  int multi_srec_get_eos_status(multi_srec* rec);
337#ifdef __cplusplus
338}
339#endif
340
341/**
342 * For visualization in the debugger
343 */
344typedef struct
345{
346  asr_uint16_t data[50];
347}
348us50;
349
350/**
351 * @todo document
352 */
353typedef struct
354{
355  asr_uint16_t data[250];
356}
357us250;
358
359/**
360 * @todo document
361 */
362typedef struct
363{
364  asr_uint16_t data[1000];
365}
366us1000;
367
368#endif
369