1/*---------------------------------------------------------------------------* 2 * RecognizerImpl.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21#include "ESR_Session.h" 22#include "ESR_SessionTypeImpl.h" 23#include "IntArrayList.h" 24#include "LCHAR.h" 25#include "passert.h" 26#include "plog.h" 27#include "pstdio.h" 28#include "pmemory.h" 29#include "ptimestamp.h" 30#include "SR_AcousticModelsImpl.h" 31#include "SR_AcousticStateImpl.h" 32#include "SR_GrammarImpl.h" 33#include "SR_SemprocDefinitions.h" 34#include "SR_SemanticResult.h" 35#include "SR_SemanticResultImpl.h" 36#include "SR_Recognizer.h" 37#include "SR_RecognizerImpl.h" 38#include "SR_RecognizerResultImpl.h" 39#include "SR_SemanticResultImpl.h" 40#include "SR_EventLog.h" 41#include "srec.h" 42 43#define MTAG NULL 44#define FILTER_NBEST_BY_SEM_RESULT 1 45#define AUDIO_CIRC_BUFFER_SIZE 20000 46#define SEMPROC_ACTIVE 1 47#define SAMPLE_SIZE (16 / CHAR_BIT) /* 16-bits / sample */ 48 49/* milliseconds per FRAME = 1/FRAMERATE * 1000 */ 50/* We multiple by 2 because we skip even frames */ 51#define MSEC_PER_FRAME (2000/FRAMERATE) 52#define MAX_ENTRY_LENGTH 512 53#define PREFIX_WORD "-pau-" 54#define PREFIX_WORD_LEN 5 55#define SUFFIX_WORD "-pau2-" 56#define SUFFIX_WORD_LEN 6 57 58 59static ESR_ReturnCode SR_Recognizer_Reset_Buffers ( SR_RecognizerImpl *impl ); 60 61/** 62 * Initializes recognizer properties to default values. 63 * 64 * Replaces setup_recognition_parameters() 65 */ 66ESR_ReturnCode SR_RecognizerToSessionImpl() 67{ 68 ESR_ReturnCode rc; 69 70 /* Old comment: remember to keep "ca_rip.h" up to date with these parameters... */ 71 72 /* CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_acoustic_models", 2)); */ 73 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Recognizer.partial_results", ESR_FALSE)); 74 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.NBest", 1)); 75 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.eou_threshold", 100)); 76 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_altword_tokens", 400)); 77 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_frames", 1000)); 78 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_fsm_arcs", 3000)); 79 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_fsm_nodes", 3000)); 80 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_fsmnode_tokens", 1000)); 81 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_hmm_tokens", 1000)); 82 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_model_states", 1000)); 83 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_searches", 2)); 84 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.max_word_tokens", 1000)); 85 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.non_terminal_timeout", 50)); 86 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.num_wordends_per_frame", 10)); 87 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.often", 10)); 88 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.optional_terminal_timeout", 30)); 89 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.reject", 500)); 90 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.terminal_timeout", 10)); 91 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.viterbi_prune_thresh", 5000)); 92 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Recognizer.wordpen", 0)); 93 94 CHKLOG(rc, ESR_SessionSetSize_tIfEmpty("SREC.Recognizer.utterance_timeout", 400)); 95 96 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("enableGetWaveform", ESR_FALSE)); 97 98 return ESR_SUCCESS; 99CLEANUP: 100 return rc; 101} 102 103/** 104 * Initializes frontend properties to default values. 105 * 106 * Replaces load_up_parameter_list() 107 */ 108ESR_ReturnCode SR_RecognizerFrontendToSessionImpl() 109{ 110 IntArrayList* intList = NULL; 111 ESR_ReturnCode rc; 112 ESR_BOOL exists; 113 size_t i; 114 115 /* Old comment: Remember to keep "ca_pip.h" up to date with these parameters... */ 116 117 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.mel_dim", 12)); 118 CHKLOG(rc, ESR_SessionSetSize_tIfEmpty("CREC.Frontend.samplerate", 8000)); 119 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.premel", 0.98f)); 120 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.lowcut", 260)); /* Hz */ 121 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.highcut", 4000)); /* Hz */ 122 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.window_factor", 2.0)); /* times the frame size */ 123 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.do_skip_even_frames", ESR_FALSE)); /* 10/20 ms rate */ 124 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.offset", 0)); /* additional */ 125 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.ddmel", ESR_FALSE)); /* delta-delta mel pars */ 126 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.forgetfactor", 40)); 127 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.sv6_margin", 10)); 128 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.rasta", ESR_FALSE)); 129 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.rastac0", ESR_FALSE)); 130 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.spectral_subtraction", ESR_FALSE)); 131 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.spec_sub_dur", 0)); 132 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.spec_sub_scale", 1.0)); 133 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.do_filterbank_dump", ESR_FALSE)); /* Output is filterbank (30 floats) */ 134 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.do_filterbank_input", ESR_FALSE)); /* Input is filterbank (30 floats) in place of audio samples */ 135 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.do_smooth_c0", ESR_TRUE)); 136 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.plp", ESR_FALSE)); /* Do PLP instead of MEL */ 137 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.lpcorder", 12)); /* order of lpc analysis in plp processing */ 138 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.warp_scale", 1.0)); 139 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.piecewise_start", 1.0)); 140 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.peakdecayup", -1.0)); /* If +ve, decay factor on peakpicker (low to high) */ 141 CHKLOG(rc, ESR_SessionSetFloatIfEmpty("CREC.Frontend.peakdecaydown", -1.0)); /* If +ve, decay factor on peakpicker (high to low) */ 142 CHKLOG(rc, ESR_SessionSetBoolIfEmpty("CREC.Frontend.cuberoot", ESR_FALSE)); /* Use cube root instead of log */ 143 144 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.mel_offset", &exists)); 145 if (!exists) 146 { 147 CHKLOG(rc, IntArrayListCreate(&intList)); 148 for (i = 0; i < 32; ++i) 149 CHKLOG(rc, IntArrayListAdd(intList, 0)); 150 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.mel_offset", intList, TYPES_INTARRAYLIST)); 151 intList = NULL; 152 } 153 154 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.mel_loop", &exists)); 155 if (!exists) 156 { 157 CHKLOG(rc, IntArrayListCreate(&intList)); 158 for (i = 0; i < 32; ++i) 159 CHKLOG(rc, IntArrayListAdd(intList, 1)); 160 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.mel_loop", intList, TYPES_INTARRAYLIST)); 161 intList = NULL; 162 } 163 164 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.melA", &exists)); 165 if (!exists) 166 { 167 CHKLOG(rc, IntArrayListCreate(&intList)); 168 CHKLOG(rc, IntArrayListAdd(intList, (int) 13.2911)); 169 CHKLOG(rc, IntArrayListAdd(intList, (int) 47.2229)); 170 CHKLOG(rc, IntArrayListAdd(intList, (int) 79.2485)); 171 CHKLOG(rc, IntArrayListAdd(intList, (int) 92.1967)); 172 CHKLOG(rc, IntArrayListAdd(intList, (int) 136.3855)); 173 CHKLOG(rc, IntArrayListAdd(intList, (int) 152.2896)); 174 CHKLOG(rc, IntArrayListAdd(intList, (int) 183.3601)); 175 CHKLOG(rc, IntArrayListAdd(intList, (int) 197.4200)); 176 CHKLOG(rc, IntArrayListAdd(intList, (int) 217.8278)); 177 CHKLOG(rc, IntArrayListAdd(intList, (int) 225.6556)); 178 CHKLOG(rc, IntArrayListAdd(intList, (int) 263.3073)); 179 CHKLOG(rc, IntArrayListAdd(intList, (int) 277.193)); 180 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.melA", intList, TYPES_INTARRAYLIST)); 181 intList = NULL; 182 } 183 184 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.melB", &exists)); 185 if (!exists) 186 { 187 CHKLOG(rc, IntArrayListCreate(&intList)); 188 CHKLOG(rc, IntArrayListAdd(intList, (int) 37.0847)); 189 CHKLOG(rc, IntArrayListAdd(intList, (int) 91.3289)); 190 CHKLOG(rc, IntArrayListAdd(intList, (int) 113.9995)); 191 CHKLOG(rc, IntArrayListAdd(intList, (int) 123.0336)); 192 CHKLOG(rc, IntArrayListAdd(intList, (int) 131.2704)); 193 CHKLOG(rc, IntArrayListAdd(intList, (int) 128.9942)); 194 CHKLOG(rc, IntArrayListAdd(intList, (int) 120.5267)); 195 CHKLOG(rc, IntArrayListAdd(intList, (int) 132.0079)); 196 CHKLOG(rc, IntArrayListAdd(intList, (int) 129.8076)); 197 CHKLOG(rc, IntArrayListAdd(intList, (int) 126.5029)); 198 CHKLOG(rc, IntArrayListAdd(intList, (int) 121.8519)); 199 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.melB", intList, TYPES_INTARRAYLIST)); 200 intList = NULL; 201 } 202 203 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.dmelA", &exists)); 204 if (!exists) 205 { 206 CHKLOG(rc, IntArrayListCreate(&intList)); 207 CHKLOG(rc, IntArrayListAdd(intList, (int) 91.6305)); 208 CHKLOG(rc, IntArrayListAdd(intList, (int) 358.3790)); 209 CHKLOG(rc, IntArrayListAdd(intList, (int) 527.5946)); 210 CHKLOG(rc, IntArrayListAdd(intList, (int) 536.3163)); 211 CHKLOG(rc, IntArrayListAdd(intList, (int) 731.2385)); 212 CHKLOG(rc, IntArrayListAdd(intList, (int) 757.8382)); 213 CHKLOG(rc, IntArrayListAdd(intList, (int) 939.4460)); 214 CHKLOG(rc, IntArrayListAdd(intList, (int) 1028.4136)); 215 CHKLOG(rc, IntArrayListAdd(intList, (int) 1071.3193)); 216 CHKLOG(rc, IntArrayListAdd(intList, (int) 1183.7922)); 217 CHKLOG(rc, IntArrayListAdd(intList, (int) 1303.1014)); 218 CHKLOG(rc, IntArrayListAdd(intList, (int) 1447.7766)); 219 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.dmelA", intList, TYPES_INTARRAYLIST)); 220 intList = NULL; 221 } 222 223 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.dmelB", &exists)); 224 if (!exists) 225 { 226 CHKLOG(rc, IntArrayListCreate(&intList)); 227 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4785)); 228 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.3878)); 229 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4029)); 230 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.3182)); 231 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.3706)); 232 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5394)); 233 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5150)); 234 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4270)); 235 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4871)); 236 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4088)); 237 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4361)); 238 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5449)); 239 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.dmelB", intList, TYPES_INTARRAYLIST)); 240 intList = NULL; 241 } 242 243 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.ddmelA", &exists)); 244 if (!exists) 245 { 246 CHKLOG(rc, IntArrayListCreate(&intList)); 247 CHKLOG(rc, IntArrayListAdd(intList, (int) 10.7381)); 248 CHKLOG(rc, IntArrayListAdd(intList, (int) 32.6775)); 249 CHKLOG(rc, IntArrayListAdd(intList, (int) 46.2301)); 250 CHKLOG(rc, IntArrayListAdd(intList, (int) 51.5438)); 251 CHKLOG(rc, IntArrayListAdd(intList, (int) 57.6636)); 252 CHKLOG(rc, IntArrayListAdd(intList, (int) 57.0581)); 253 CHKLOG(rc, IntArrayListAdd(intList, (int) 65.3696)); 254 CHKLOG(rc, IntArrayListAdd(intList, (int) 70.1910)); 255 CHKLOG(rc, IntArrayListAdd(intList, (int) 71.6751)); 256 CHKLOG(rc, IntArrayListAdd(intList, (int) 78.2364)); 257 CHKLOG(rc, IntArrayListAdd(intList, (int) 83.2440)); 258 CHKLOG(rc, IntArrayListAdd(intList, (int) 89.6261)); 259 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.ddmelA", intList, TYPES_INTARRAYLIST)); 260 intList = NULL; 261 } 262 263 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.ddmelB", &exists)); 264 if (!exists) 265 { 266 CHKLOG(rc, IntArrayListCreate(&intList)); 267 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5274)); 268 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5098)); 269 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5333)); 270 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5963)); 271 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5132)); 272 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5282)); 273 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5530)); 274 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5682)); 275 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4662)); 276 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4342)); 277 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.5235)); 278 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.4061)); 279 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.ddmelB", intList, TYPES_INTARRAYLIST)); 280 intList = NULL; 281 } 282 283 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.rastaA", &exists)); 284 if (!exists) 285 { 286 CHKLOG(rc, IntArrayListCreate(&intList)); 287 CHKLOG(rc, IntArrayListAdd(intList, (int) 7.80)); 288 CHKLOG(rc, IntArrayListAdd(intList, (int) 37.0)); 289 CHKLOG(rc, IntArrayListAdd(intList, (int) 54.0)); 290 CHKLOG(rc, IntArrayListAdd(intList, (int) 57.0)); 291 CHKLOG(rc, IntArrayListAdd(intList, (int) 84.0)); 292 CHKLOG(rc, IntArrayListAdd(intList, (int) 86.5)); 293 CHKLOG(rc, IntArrayListAdd(intList, (int) 98.1)); 294 CHKLOG(rc, IntArrayListAdd(intList, (int) 127.0)); 295 CHKLOG(rc, IntArrayListAdd(intList, (int) 153.0)); 296 CHKLOG(rc, IntArrayListAdd(intList, (int) 160.0)); 297 CHKLOG(rc, IntArrayListAdd(intList, (int) 188.0)); 298 CHKLOG(rc, IntArrayListAdd(intList, (int) 199.0)); 299 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.rastaA", intList, TYPES_INTARRAYLIST)); 300 intList = NULL; 301 } 302 303 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.rastaB", &exists)); 304 if (!exists) 305 { 306 CHKLOG(rc, IntArrayListCreate(&intList)); 307 CHKLOG(rc, IntArrayListAdd(intList, 117)); 308 CHKLOG(rc, IntArrayListAdd(intList, 121)); 309 CHKLOG(rc, IntArrayListAdd(intList, 114)); 310 CHKLOG(rc, IntArrayListAdd(intList, 111)); 311 CHKLOG(rc, IntArrayListAdd(intList, 113)); 312 CHKLOG(rc, IntArrayListAdd(intList, 126)); 313 CHKLOG(rc, IntArrayListAdd(intList, 134)); 314 CHKLOG(rc, IntArrayListAdd(intList, 130)); 315 CHKLOG(rc, IntArrayListAdd(intList, 135)); 316 CHKLOG(rc, IntArrayListAdd(intList, 129)); 317 CHKLOG(rc, IntArrayListAdd(intList, 139)); 318 CHKLOG(rc, IntArrayListAdd(intList, 138)); 319 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.rastaB", intList, TYPES_INTARRAYLIST)); 320 intList = NULL; 321 } 322 323 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.speech_detect", 18)); 324 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.speech_above", 18)); 325 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.ambient_within", 12)); 326 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.start_windback", 50)); 327 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.utterance_allowance", 40)); 328 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.speech_duration", 6)); 329 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.quiet_duration", 20)); 330 331 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.high_clip", 32767)); 332 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.low_clip", -32768)); 333 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.max_per10000_clip", 10)); 334 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.max_dc_offset", 1000)); 335 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.high_noise_level_bit", 11)); 336 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.low_speech_level_bit", 11)); 337 CHKLOG(rc, ESR_SessionSetIntIfEmpty("CREC.Frontend.min_samples", 10000)); 338 339 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.spectrum_filter_freq", &exists)); 340 if (!exists) 341 { 342 CHKLOG(rc, IntArrayListCreate(&intList)); 343 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.spectrum_filter_freq", intList, TYPES_INTARRAYLIST)); 344 intList = NULL; 345 } 346 CHKLOG(rc, ESR_SessionContains("CREC.Frontend.spectrum_filter_spread", &exists)); 347 if (!exists) 348 { 349 CHKLOG(rc, IntArrayListCreate(&intList)); 350 CHKLOG(rc, ESR_SessionSetProperty("CREC.Frontend.spectrum_filter_spread", intList, TYPES_INTARRAYLIST)); 351 intList = NULL; 352 } 353 return ESR_SUCCESS; 354CLEANUP: 355 if (intList != NULL) 356 intList->destroy(intList); 357 return rc; 358} 359 360/** 361 * Generate legacy frontend parameter structure from ESR_Session. 362 * 363 * @param impl SR_RecognizerImpl handle 364 * @param params Resulting structure 365 */ 366ESR_ReturnCode SR_RecognizerGetFrontendLegacyParametersImpl(CA_FrontendInputParams* params) 367{ 368 ESR_ReturnCode rc; 369 IntArrayList* intList; 370 size_t size, i, size_tValue; 371 int iValue; 372 373 passert(params != NULL); 374 params->is_loaded = ESR_FALSE; 375 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.mel_dim", ¶ms->mel_dim)); 376 CHKLOG(rc, ESR_SessionGetSize_t("CREC.Frontend.samplerate", &size_tValue)); 377 params->samplerate = (int) size_tValue; 378 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.premel", ¶ms->pre_mel)); 379 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.lowcut", ¶ms->low_cut)); 380 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.highcut", ¶ms->high_cut)); 381 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.window_factor", ¶ms->window_factor)); 382 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.do_skip_even_frames", ¶ms->do_skip_even_frames)); 383 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.offset", ¶ms->offset)); 384 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.ddmel", ¶ms->do_dd_mel)); 385 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.forgetfactor", ¶ms->forget_factor)); 386 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.sv6_margin", ¶ms->sv6_margin)); 387 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.rastac0", ¶ms->do_rastac0)); 388 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.spectral_subtraction", ¶ms->do_spectral_sub)); 389 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.spec_sub_dur", ¶ms->spectral_sub_frame_dur)); 390 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.spec_sub_scale", ¶ms->spec_sub_scale)); 391 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.do_filterbank_dump", ¶ms->do_filterbank_input)); 392 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.do_filterbank_input", ¶ms->do_filterbank_input)); 393 CHKLOG(rc, ESR_SessionGetBool("CREC.Frontend.do_smooth_c0", ¶ms->do_smooth_c0)); 394 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.lpcorder", ¶ms->lpc_order)); 395 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.warp_scale", ¶ms->warp_scale)); 396 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.piecewise_start", ¶ms->piecewise_start)); 397 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.peakdecayup", ¶ms->peakpickup)); 398 CHKLOG(rc, ESR_SessionGetFloat("CREC.Frontend.peakdecaydown", ¶ms->peakpickdown)); 399 400 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.mel_offset", (void **)&intList, TYPES_INTARRAYLIST)); 401 if (intList == NULL) 402 { 403 PLogError(L("ESR_INVALID_STATE")); 404 return ESR_INVALID_STATE; 405 } 406 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 407 for (i = 0; i < size; ++i) 408 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->mel_offset[i])); 409 410 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.mel_loop", (void **)&intList, TYPES_INTARRAYLIST)); 411 if (intList == NULL) 412 { 413 PLogError(L("ESR_INVALID_STATE")); 414 return ESR_INVALID_STATE; 415 } 416 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 417 for (i = 0; i < size; ++i) 418 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->mel_loop[i])); 419 420 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.melA", (void **)&intList, TYPES_INTARRAYLIST)); 421 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 422 for (i = 0; i < size; ++i) 423 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->melA_scale[i])); 424 425 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.melB", (void **)&intList, TYPES_INTARRAYLIST)); 426 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 427 for (i = 0; i < size; ++i) 428 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->melB_scale[i])); 429 430 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.dmelA", (void **)&intList, TYPES_INTARRAYLIST)); 431 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 432 for (i = 0; i < size; ++i) 433 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->dmelA_scale[i])); 434 435 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.dmelB", (void **)&intList, TYPES_INTARRAYLIST)); 436 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 437 for (i = 0; i < size; ++i) 438 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->dmelB_scale[i])); 439 440 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.ddmelA", (void **)&intList, TYPES_INTARRAYLIST)); 441 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 442 for (i = 0; i < size; ++i) 443 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->ddmelA_scale[i])); 444 445 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.ddmelB", (void **)&intList, TYPES_INTARRAYLIST)); 446 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 447 for (i = 0; i < size; ++i) 448 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->ddmelB_scale[i])); 449 450 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.rastaA", (void **)&intList, TYPES_INTARRAYLIST)); 451 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 452 for (i = 0; i < size; ++i) 453 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->rastaA_scale[i])); 454 455 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.rastaB", (void **)&intList, TYPES_INTARRAYLIST)); 456 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 457 for (i = 0; i < size; ++i) 458 CHKLOG(rc, IntArrayListGet(intList, i, ¶ms->rastaB_scale[i])); 459 460 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.speech_detect", ¶ms->voice_margin)); 461 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.speech_above", ¶ms->fast_voice_margin)); 462 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.ambient_within", ¶ms->tracker_margin)); 463 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.start_windback", ¶ms->start_windback)); 464 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.utterance_allowance", ¶ms->unsure_duration)); 465 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.speech_duration", ¶ms->voice_duration)); 466 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.quiet_duration", ¶ms->quiet_duration)); 467 468 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.high_clip", ¶ms->high_clip)); 469 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.low_clip", ¶ms->low_clip)); 470 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.max_per10000_clip", ¶ms->max_per10000_clip)); 471 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.max_dc_offset", ¶ms->max_dc_offset)); 472 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.high_noise_level_bit", ¶ms->high_noise_level_bit)); 473 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.low_speech_level_bit", ¶ms->low_speech_level_bit)); 474 CHKLOG(rc, ESR_SessionGetInt("CREC.Frontend.min_samples", ¶ms->min_samples)); 475 476 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.spectrum_filter_freq", (void **)&intList, TYPES_INTARRAYLIST)); 477 if (intList == NULL) 478 { 479 PLogError(L("ESR_INVALID_STATE")); 480 return ESR_INVALID_STATE; 481 } 482 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 483 for (i = 0; i < size; ++i) 484 { 485 CHKLOG(rc, IntArrayListGet(intList, i, &iValue)); 486 params->spectrum_filter_freq[i] = iValue; 487 } 488 489 CHKLOG(rc, ESR_SessionGetProperty("CREC.Frontend.spectrum_filter_spread", (void **)&intList, TYPES_INTARRAYLIST)); 490 if (intList == NULL) 491 { 492 PLogError(L("ESR_INVALID_STATE")); 493 return ESR_INVALID_STATE; 494 } 495 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 496 for (i = 0; i < size; ++i) 497 { 498 CHKLOG(rc, IntArrayListGet(intList, i, &iValue)); 499 params->spectrum_filter_spread[i] = iValue; 500 } 501 params->is_loaded = ESR_TRUE; 502 return ESR_SUCCESS; 503CLEANUP: 504 return rc; 505} 506 507/** 508 * Creates frontend components of SR_Recognizer. 509 * 510 * @param impl SR_RecognizerImpl handle 511 */ 512ESR_ReturnCode SR_RecognizerCreateFrontendImpl(SR_RecognizerImpl* impl) 513{ 514 ESR_ReturnCode rc; 515 CA_FrontendInputParams* frontendParams; 516 517 /* Create a frontend object */ 518 impl->frontend = CA_AllocateFrontend(1, 0, 1); 519 frontendParams = CA_AllocateFrontendParameters(); 520 CHKLOG(rc, SR_RecognizerGetFrontendLegacyParametersImpl(frontendParams)); 521 522 CA_ConfigureFrontend(impl->frontend, frontendParams); 523 524 /* Create a wave object */ 525 impl->wavein = CA_AllocateWave('N'); 526 if (impl->wavein == NULL) 527 { 528 rc = ESR_OUT_OF_MEMORY; 529 PLogError(ESR_rc2str(rc)); 530 goto CLEANUP; 531 } 532 CA_ConfigureWave(impl->wavein, impl->frontend); 533 CA_ConfigureVoicingAnalysis(impl->wavein, frontendParams); 534 535 CA_LoadCMSParameters(impl->wavein, NULL, frontendParams); 536 537 /* Create an utterance object */ 538 impl->utterance = CA_AllocateUtterance(); 539 if (impl->utterance == NULL) 540 { 541 rc = ESR_OUT_OF_MEMORY; 542 PLogError(ESR_rc2str(rc)); 543 goto CLEANUP; 544 } 545 CA_InitUtteranceForFrontend(impl->utterance, frontendParams); 546 CA_AttachCMStoUtterance(impl->wavein, impl->utterance); 547 CA_FreeFrontendParameters(frontendParams); 548 return ESR_SUCCESS; 549 550CLEANUP: 551 if (impl->frontend != NULL) 552 { 553 CA_UnconfigureFrontend(impl->frontend); 554 CA_FreeFrontend(impl->frontend); 555 impl->frontend = NULL; 556 } 557 if (impl->wavein != NULL) 558 { 559 CA_UnconfigureWave(impl->wavein); 560 CA_FreeWave(impl->wavein); 561 impl->wavein = NULL; 562 } 563 if (impl->utterance != NULL) 564 { 565 CA_ClearUtterance(impl->utterance); 566 CA_FreeUtterance(impl->utterance); 567 impl->utterance = NULL; 568 } 569 if (frontendParams != NULL) 570 CA_FreeFrontendParameters(frontendParams); 571 return rc; 572} 573 574/** 575 * Populates legacy recognizer parameters from the session. 576 * 577 * Replaces setup_pattern_parameters() 578 */ 579ESR_ReturnCode SR_AcousticModels_LoadLegacyRecognizerParameters(CA_RecInputParams* params) 580{ 581 ESR_ReturnCode rc; 582 583 passert(params != NULL); 584 params->is_loaded = ESR_FALSE; 585 CHKLOG(rc, ESR_SessionGetBool("CREC.Recognizer.partial_results", ¶ms->do_partial)); 586 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.NBest", ¶ms->top_choices)); 587 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.eou_threshold", ¶ms->eou_threshold)); 588 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_altword_tokens", ¶ms->max_altword_tokens)); 589 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_frames", ¶ms->max_frames)); 590 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_fsm_arcs", ¶ms->max_fsm_arcs)); 591 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_fsm_nodes", ¶ms->max_fsm_nodes)); 592 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_fsmnode_tokens", ¶ms->max_fsmnode_tokens)); 593 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_hmm_tokens", ¶ms->max_hmm_tokens)); 594 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_model_states", ¶ms->max_model_states)); 595 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_searches", ¶ms->max_searches)); 596 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.max_word_tokens", ¶ms->max_word_tokens)); 597 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.non_terminal_timeout", ¶ms->non_terminal_timeout)); 598 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.num_wordends_per_frame", ¶ms->num_wordends_per_frame)); 599 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.often", ¶ms->traceback_freq)); 600 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.optional_terminal_timeout", ¶ms->optional_terminal_timeout)); 601 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.reject", ¶ms->reject_score)); 602 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.terminal_timeout", ¶ms->terminal_timeout)); 603 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.viterbi_prune_thresh", ¶ms->viterbi_prune_thresh)); 604 CHKLOG(rc, ESR_SessionGetInt("CREC.Recognizer.wordpen", ¶ms->word_penalty)); 605 params->is_loaded = ESR_TRUE; 606 607 return ESR_SUCCESS; 608CLEANUP: 609 return rc; 610} 611 612ESR_ReturnCode SR_RecognizerCreate(SR_Recognizer** self) 613{ 614 SR_RecognizerImpl* impl; 615 CA_RecInputParams* recogParams = NULL; 616 ESR_ReturnCode rc; 617 LCHAR recHandle[12]; 618 619 if (self == NULL) 620 { 621 PLogError(L("ESR_INVALID_ARGUMENT")); 622 return ESR_INVALID_ARGUMENT; 623 } 624 impl = NEW(SR_RecognizerImpl, MTAG); 625 if (impl == NULL) 626 { 627 PLogError(L("ESR_OUT_OF_MEMORY")); 628 return ESR_OUT_OF_MEMORY; 629 } 630 631 impl->Interface.start = &SR_RecognizerStartImpl; 632 impl->Interface.stop = &SR_RecognizerStopImpl; 633 impl->Interface.destroy = &SR_RecognizerDestroyImpl; 634 impl->Interface.setup = &SR_RecognizerSetupImpl; 635 impl->Interface.unsetup = &SR_RecognizerUnsetupImpl; 636 impl->Interface.isSetup = &SR_RecognizerIsSetupImpl; 637 impl->Interface.getParameter = &SR_RecognizerGetParameterImpl; 638 impl->Interface.getSize_tParameter = &SR_RecognizerGetSize_tParameterImpl; 639 impl->Interface.getBoolParameter = &SR_RecognizerGetBoolParameterImpl; 640 impl->Interface.setParameter = &SR_RecognizerSetParameterImpl; 641 impl->Interface.setSize_tParameter = &SR_RecognizerSetSize_tParameterImpl; 642 impl->Interface.setBoolParameter = &SR_RecognizerSetBoolParameterImpl; 643 impl->Interface.setLockFunction = &SR_RecognizerSetLockFunctionImpl; 644 impl->Interface.hasSetupRules = &SR_RecognizerHasSetupRulesImpl; 645 impl->Interface.activateRule = &SR_RecognizerActivateRuleImpl; 646 impl->Interface.deactivateRule = &SR_RecognizerDeactivateRuleImpl; 647 impl->Interface.deactivateAllRules = &SR_RecognizerDeactivateAllRulesImpl; 648 impl->Interface.isActiveRule = &SR_RecognizerIsActiveRuleImpl; 649 impl->Interface.setWordAdditionCeiling = &SR_RecognizerSetWordAdditionCeilingImpl; 650 impl->Interface.checkGrammarConsistency = &SR_RecognizerCheckGrammarConsistencyImpl; 651 impl->Interface.getModels = &SR_RecognizerGetModelsImpl; 652 impl->Interface.putAudio = &SR_RecognizerPutAudioImpl; 653 impl->Interface.advance = &SR_RecognizerAdvanceImpl; 654 impl->Interface.loadUtterance = &SR_RecognizerLoadUtteranceImpl; 655 impl->Interface.loadWaveFile = &SR_RecognizerLoadWaveFileImpl; 656 impl->Interface.logEvent = &SR_RecognizerLogEventImpl; 657 impl->Interface.logToken = &SR_RecognizerLogTokenImpl; 658 impl->Interface.logTokenInt = &SR_RecognizerLogTokenIntImpl; 659 impl->Interface.logSessionStart = &SR_RecognizerLogSessionStartImpl; 660 impl->Interface.logSessionEnd = &SR_RecognizerLogSessionEndImpl; 661 impl->Interface.logWaveformData = &SR_RecognizerLogWaveformDataImpl; 662 impl->Interface.isSignalClipping = &SR_RecognizerIsSignalClippingImpl; 663 impl->Interface.isSignalDCOffset = &SR_RecognizerIsSignalDCOffsetImpl; 664 impl->Interface.isSignalNoisy = &SR_RecognizerIsSignalNoisyImpl; 665 impl->Interface.isSignalTooFewSamples = &SR_RecognizerIsSignalTooFewSamplesImpl; 666 impl->Interface.isSignalTooManySamples = &SR_RecognizerIsSignalTooManySamplesImpl; 667 impl->Interface.isSignalTooQuiet = &SR_RecognizerIsSignalTooQuietImpl; 668 669 impl->frontend = NULL; 670 impl->wavein = NULL; 671 impl->utterance = NULL; 672 impl->confidenceScorer = NULL; 673 impl->recognizer = NULL; 674 impl->models = NULL; 675 impl->grammars = NULL; 676 impl->result = NULL; 677 impl->parameters = NULL; 678 impl->acousticState = NULL; 679 impl->audioBuffer = NULL; 680 impl->buffer = NULL; 681 impl->frames = impl->processed; 682 impl->internalState = SR_RECOGNIZER_INTERNAL_BEGIN; 683 impl->isStarted = ESR_FALSE; 684 impl->isRecognizing = ESR_FALSE; 685 impl->gotLastFrame = ESR_FALSE; 686 impl->sampleRate = 0; 687 impl->lockFunction = NULL; 688 impl->lockData = NULL; 689 impl->eventLog = NULL; 690 impl->osi_log_level = 0; 691 impl->waveformBuffer = NULL; 692 impl->isSignalQualityInitialized = ESR_FALSE; 693 impl->beginningOfSpeechOffset = 0; 694 impl->gatedMode = ESR_TRUE; 695 impl->bgsniff = 0; 696 impl->isSignalClipping = ESR_FALSE; 697 impl->isSignalDCOffset = ESR_FALSE; 698 impl->isSignalNoisy = ESR_FALSE; 699 impl->isSignalTooFewSamples = ESR_FALSE; 700 impl->isSignalTooManySamples = ESR_FALSE; 701 impl->isSignalTooQuiet = ESR_FALSE; 702 703 CHKLOG(rc, ESR_SessionTypeCreate(&impl->parameters)); 704 CHKLOG(rc, SR_RecognizerToSessionImpl()); 705 CHKLOG(rc, ESR_SessionGetSize_t(L("SREC.Recognizer.osi_log_level"), &impl->osi_log_level)); 706 707 /* create the event log */ 708 if (impl->osi_log_level) /* do some logging if non-zero val */ 709 CHKLOG(rc, ESR_SessionGetProperty(L("eventlog"), (void **)&impl->eventLog, TYPES_SR_EVENTLOG)); 710 711 /* Record the OSI log event */ 712 psprintf(recHandle, L("%p"), impl); 713 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("REC"), recHandle)); 714 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIcrst"))); 715 716 CHKLOG(rc, SR_RecognizerFrontendToSessionImpl()); 717 CHKLOG(rc, SR_RecognizerCreateFrontendImpl(impl)); 718 rc = ESR_SessionGetProperty("recognizer.confidenceScorer", (void **)&impl->confidenceScorer, TYPES_CONFIDENCESCORER); 719 if (rc == ESR_NO_MATCH_ERROR) 720 { 721 impl->confidenceScorer = CA_AllocateConfidenceScorer(); 722 723 if (!CA_LoadConfidenceScorer(impl->confidenceScorer)) { 724 rc = ESR_INVALID_STATE; 725 PLogError(ESR_rc2str(rc)); 726 goto CLEANUP; 727 } 728 CHKLOG(rc, ESR_SessionSetProperty("recognizer.confidenceScorer", impl->confidenceScorer, TYPES_CONFIDENCESCORER)); 729 } 730 else if (rc != ESR_SUCCESS) 731 { 732 PLogError(ESR_rc2str(rc)); 733 goto CLEANUP; 734 } 735 736 recogParams = CA_AllocateRecognitionParameters(); 737 if (recogParams == NULL) 738 { 739 rc = ESR_OUT_OF_MEMORY; 740 PLogError(ESR_rc2str(rc)); 741 goto CLEANUP; 742 } 743 CHKLOG(rc, SR_AcousticModels_LoadLegacyRecognizerParameters(recogParams)); 744 impl->recognizer = CA_AllocateRecognition(); 745 if (impl->recognizer == NULL) 746 { 747 PLogError(ESR_rc2str(rc)); 748 goto CLEANUP; 749 } 750 CA_ConfigureRecognition(impl->recognizer, recogParams); 751 CA_FreeRecognitionParameters(recogParams); 752 CHKLOG(rc, HashMapCreate(&impl->grammars)); 753 CHKLOG(rc, CircularBufferCreate(sizeof(asr_int16_t) * AUDIO_CIRC_BUFFER_SIZE, MTAG, &impl->buffer)); 754 CHKLOG(rc, ESR_SessionGetSize_t("CREC.Frontend.samplerate", &impl->sampleRate)); 755 756 impl->FRAME_SIZE = impl->sampleRate / FRAMERATE * SAMPLE_SIZE; 757 758 if ((impl->audioBuffer = MALLOC(impl->FRAME_SIZE, MTAG)) == NULL) 759 { 760 rc = ESR_OUT_OF_MEMORY; 761 goto CLEANUP; 762 } 763 764 /* create the waveform buffer */ 765 CHKLOG(rc, WaveformBuffer_Create(&impl->waveformBuffer, impl->FRAME_SIZE)); 766 767 CHKLOG(rc, ESR_SessionGetSize_t("SREC.Recognizer.utterance_timeout", &impl->utterance_timeout)); 768 769 /* OSI logging (SUCCESS) */ 770 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("REC"), recHandle)); 771 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("SUCCESS"), L("ESR_SUCCESS"))); 772 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIcrnd"))); 773 774 CHKLOG(rc, SR_AcousticStateCreateImpl(&impl->Interface)); 775 776 CHKLOG(rc, ESR_SessionGetSize_t(L("cmdline.bgsniff"), &impl->bgsniff)); 777 /* gated mode == beginning of speech detection */ 778 CHKLOG(rc, ESR_SessionGetBool(L("cmdline.gatedmode"), &impl->gatedMode)); 779 780 *self = (SR_Recognizer*) impl; 781 return ESR_SUCCESS; 782CLEANUP: 783 /* OSI logging (FAILURE) */ 784 if (impl->eventLog != NULL) 785 { 786 SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("REC"), recHandle); 787 SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("FAILURE"), ESR_rc2str(rc)); 788 SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIcrnd")); 789 } 790 791 if (recogParams != NULL) 792 CA_FreeRecognitionParameters(recogParams); 793 impl->Interface.destroy(&impl->Interface); 794 return rc; 795} 796 797ESR_ReturnCode SR_RecognizerDestroyImpl(SR_Recognizer* self) 798{ 799 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 800 ESR_BOOL exists; // isSetup; 801 ESR_ReturnCode rc; 802 LCHAR recHandle[12]; 803 804 if (impl->result != NULL) 805 { 806 SR_RecognizerResult_Destroy(impl->result); 807 impl->result = NULL; 808 } 809 810 if (impl->eventLog != NULL) 811 { 812 /* Record the OSI log event */ 813 psprintf(recHandle, L("%p"), impl); 814 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("REC"), recHandle)); 815 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIdesst"))); 816 } 817 818 /* Clean session */ 819 CHKLOG(rc, ESR_SessionContains("recognizer.confidenceScorer", &exists)); 820 if (exists) 821 CHKLOG(rc, ESR_SessionRemoveProperty("recognizer.confidenceScorer")); 822 823 if (impl->confidenceScorer != NULL) 824 { 825 CA_FreeConfidenceScorer(impl->confidenceScorer); 826 impl->confidenceScorer = NULL; 827 } 828 829 /* Clear CMS, CRS_RecognizerClose() */ 830 if (impl->wavein != NULL) 831 { 832 ESR_BOOL isAttached, isConfigured; 833 834 CHKLOG(rc, CA_IsCMSAttachedtoUtterance(impl->wavein, &isAttached)); 835 if (isAttached) 836 CA_DetachCMSfromUtterance(impl->wavein, impl->utterance); 837 838 CHKLOG(rc, CA_IsConfiguredForAgc(impl->wavein, &isConfigured)); 839 if (isConfigured) 840 CA_ClearCMSParameters(impl->wavein); 841 } 842 843 /* Free Utterance */ 844 if (impl->utterance != NULL) 845 { 846 CA_ClearUtterance(impl->utterance); 847 CA_FreeUtterance(impl->utterance); 848 impl->utterance = NULL; 849 } 850 851 /* Free WaveformBuffer */ 852 if (impl->waveformBuffer != NULL) 853 { 854 WaveformBuffer_Destroy(impl->waveformBuffer); 855 impl->waveformBuffer = NULL; 856 } 857 858 /* Free recognizer */ 859/* CHKLOG(rc, self->isSetup(self, &isSetup)); 860 if (isSetup) 861 CHKLOG(rc, self->unsetup(self));*/ 862 if (impl->grammars != NULL) 863 CHKLOG(rc, self->deactivateAllRules(self)); 864 if (impl->recognizer != NULL) 865 { 866 CA_UnloadRecognitionModels(impl->recognizer); 867 CA_UnconfigureRecognition(impl->recognizer); 868 CA_FreeRecognition(impl->recognizer); 869 impl->recognizer = NULL; 870 } 871 872 if (impl->grammars != NULL) 873 { 874 CHKLOG(rc, HashMapDestroy(impl->grammars)); 875 impl->grammars = NULL; 876 } 877 878 if (impl->buffer != NULL) 879 { 880 FREE(impl->buffer); 881 impl->buffer = NULL; 882 } 883 884 if (impl->audioBuffer != NULL) 885 { 886 FREE(impl->audioBuffer); 887 impl->audioBuffer = NULL; 888 } 889 890 /* Free frontend */ 891 if (impl->frontend) 892 { 893 CA_UnconfigureFrontend(impl->frontend); 894 CA_FreeFrontend(impl->frontend); 895 impl->frontend = NULL; 896 } 897 898 /* Free wave */ 899 if (impl->wavein) 900 { 901 CA_UnconfigureWave(impl->wavein); 902 CA_FreeWave(impl->wavein); 903 impl->wavein = NULL; 904 } 905 906 if (impl->parameters != NULL) 907 CHKLOG(rc, impl->parameters->destroy(impl->parameters)); 908 909 if (impl->eventLog != NULL) 910 { 911 /* OSI logging (SUCCESS) */ 912 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("REC"), recHandle)); 913 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("SUCCESS"), L("ESR_SUCCESS"))); 914 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIdesnd"))); 915 impl->eventLog = NULL; 916 } 917 918 if (impl->acousticState != NULL) 919 { 920 impl->acousticState->destroy(self); 921 impl->acousticState = NULL; 922 } 923 FREE(impl); 924 return ESR_SUCCESS; 925CLEANUP: 926 return rc; 927} 928 929ESR_ReturnCode beginRecognizing(SR_RecognizerImpl* impl) 930{ 931 CA_RecInputParams* recogParams; 932 LCHAR tok[80]; 933 LCHAR* val; 934 PTimeStamp BORT; 935 size_t i, grammarSize; 936 ESR_ReturnCode rc; 937 938 /* Setup recognizer for new utterance */ 939 recogParams = CA_AllocateRecognitionParameters(); 940 if (recogParams == NULL) 941 { 942 rc = ESR_OUT_OF_MEMORY; 943 PLogError(ESR_rc2str(rc)); 944 goto CLEANUP; 945 } 946 SR_AcousticModels_LoadLegacyRecognizerParameters(recogParams); 947 CA_BeginRecognition(impl->recognizer, NULL, 1, recogParams); 948 CA_FreeRecognitionParameters(recogParams); 949 impl->isRecognizing = ESR_TRUE; 950 951 /* OSI log the grammars */ 952 CHKLOG(rc, HashMapGetSize(impl->grammars, &grammarSize)); 953 for (i = 0; i < grammarSize; ++i) 954 { 955 psprintf(tok, L("GURI%d"), i); 956 /* use the key as the grammar URI */ 957 CHKLOG(rc, HashMapGetKeyAtIndex(impl->grammars, i, &val)); 958 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, tok, val)); 959 } 960 /* OSI ACST acoustic state reset */ 961 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("ACST"), 0)); 962 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("LANG"), L("en-us"))); 963 964 /* OSI log the start of recognition */ 965 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIrcst"))); 966 967 /* save the BORT timing (begin of recog) */ 968 PTimeStampSet(&BORT); 969 impl->recogLogTimings.BORT = PTimeStampDiff(&BORT, &impl->timestamp); 970 971 return ESR_SUCCESS; 972CLEANUP: 973 if (recogParams != NULL) 974 CA_FreeRecognitionParameters(recogParams); 975 return rc; 976} 977 978ESR_ReturnCode SR_RecognizerStartImpl(SR_Recognizer* self) 979{ 980 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 981 size_t silence_duration_in_frames; 982 size_t end_of_utterance_hold_off_in_frames; 983 size_t grammarCount; 984 ESR_ReturnCode rc; 985 ESR_BOOL enableGetWaveform = ESR_FALSE; 986 987 CHKLOG(rc, impl->grammars->getSize(impl->grammars, &grammarCount)); 988 if (impl->models == NULL) 989 { 990 PLogError("ESR_INVALID_STATE: No rule has been set up"); 991 return ESR_INVALID_STATE; 992 } 993 if (grammarCount < 1) 994 { 995 PLogError("ESR_INVALID_STATE: No rule has been activated"); 996 return ESR_INVALID_STATE; 997 } 998 999 if (!CA_OpenWaveFromDevice(impl->wavein, DEVICE_RAW_PCM, impl->frontend->samplerate, 0, WAVE_DEVICE_RAW)) 1000 { 1001 rc = ESR_INVALID_STATE; 1002 PLogError(ESR_rc2str(rc)); 1003 goto CLEANUP; 1004 } 1005 1006 /* Setup utterance */ 1007 CA_UnlockUtteranceForInput(impl->utterance); 1008 1009 /* Setup utterance */ 1010 CHKLOG(rc, ESR_SessionGetSize_t(L("cmdline.silence_duration_in_frames"), &silence_duration_in_frames)); 1011 CHKLOG(rc, ESR_SessionGetSize_t(L("cmdline.end_of_utterance_hold_off_in_frames"), &end_of_utterance_hold_off_in_frames)); 1012 CA_SetEndOfUtteranceByLevelTimeout(impl->utterance, silence_duration_in_frames, end_of_utterance_hold_off_in_frames); 1013 1014 CA_ResetVoicing(impl->utterance); 1015 1016 /* 1017 * NOTE: We don't actually begin the recognizer here, the beginning of speech 1018 * detector will do that. 1019 */ 1020 1021 impl->gotLastFrame = ESR_FALSE; 1022 impl->isStarted = ESR_TRUE; 1023 impl->isRecognizing = ESR_FALSE; 1024 impl->isSignalQualityInitialized = ESR_FALSE; 1025 impl->internalState = SR_RECOGNIZER_INTERNAL_BEGIN; 1026 PTimeStampSet(&impl->timestamp); 1027 1028 /* reset waveform buffer at start of every recognition */ 1029 CHKLOG(rc, WaveformBuffer_Reset(impl->waveformBuffer)); 1030 1031 /* is waveform buffering active? */ 1032 rc = ESR_SessionGetBool(L("enableGetWaveform"), &enableGetWaveform); 1033 // rc = impl->parameters->getBool(impl->parameters, L("enableGetWaveform"), &enableGetWaveform); 1034 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) 1035 { 1036 PLogError(L("%s: could determine whether VoiceEnrollment active or not"), ESR_rc2str(rc)); 1037 goto CLEANUP; 1038 } 1039 if (enableGetWaveform) 1040 CHKLOG(rc, WaveformBuffer_SetBufferingState(impl->waveformBuffer, WAVEFORM_BUFFERING_ON_CIRCULAR)); 1041 else 1042 CHKLOG(rc, WaveformBuffer_SetBufferingState(impl->waveformBuffer, WAVEFORM_BUFFERING_OFF)); 1043 1044 /* I am going to try to open the audio waveform file here */ 1045 if (impl->osi_log_level & OSI_LOG_LEVEL_AUDIO) 1046 { 1047 /* open a new audio waveform file */ 1048 rc = SR_EventLogAudioOpen(impl->eventLog, L("audio/L16"), impl->sampleRate, SAMPLE_SIZE); 1049 if (rc != ESR_SUCCESS) 1050 { 1051 PLogError(L("%s: could not open the RIFF audio file"), ESR_rc2str(rc)); 1052 goto CLEANUP; 1053 } 1054 } 1055 impl->frames = impl->processed = 0; 1056 return ESR_SUCCESS; 1057CLEANUP: 1058/* self->stop(self);*/ 1059 return rc; 1060} 1061 1062ESR_ReturnCode SR_RecognizerStopImpl(SR_Recognizer* self) 1063{ 1064 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1065 SR_AcousticModelsImpl* modelsImpl; 1066 ESR_ReturnCode rc; 1067 1068 PLOG_DBG_API_ENTER(); 1069 if (!impl->isStarted) 1070 { 1071 /* In case the user calls stop() twice */ 1072 return ESR_SUCCESS; 1073 } 1074 modelsImpl = (SR_AcousticModelsImpl*) impl->models; 1075 1076 /* Clean-up recognizer and utterance */ 1077 switch (impl->internalState) 1078 { 1079 case SR_RECOGNIZER_INTERNAL_BEGIN: 1080 /* Recognizer was never started */ 1081 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("BEGIN"))); 1082 CA_LockUtteranceFromInput(impl->utterance); 1083 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1084 if (impl->eventLog != NULL) 1085 { 1086 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_BEGIN -> SR_RECOGNIZER_INTERNAL_END"))); 1087 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1088 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1089 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1090 } 1091 break; 1092 1093 case SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT: 1094 /* Recognizer was never started */ 1095 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("BOS_TIMEOUT"))); 1096 CA_LockUtteranceFromInput(impl->utterance); 1097 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1098 if (impl->eventLog != NULL) 1099 { 1100 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT -> SR_RECOGNIZER_INTERNAL_END"))); 1101 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1102 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1103 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1104 } 1105 break; 1106 1107 case SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH: 1108 /* Recognizer was never started */ 1109 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("BOS_NO_MATCH"))); 1110 CA_LockUtteranceFromInput(impl->utterance); 1111 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1112 if (impl->eventLog != NULL) 1113 { 1114 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH -> SR_RECOGNIZER_INTERNAL_END"))); 1115 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1116 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1117 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1118 } 1119 break; 1120 1121 case SR_RECOGNIZER_INTERNAL_BOS_DETECTION: 1122 /* Recognizer was never started */ 1123 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("BOS_DETECTION"))); 1124 CA_LockUtteranceFromInput(impl->utterance); 1125 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1126 if (impl->eventLog != NULL) 1127 { 1128 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_BOS_DETECTION -> SR_RECOGNIZER_INTERNAL_END"))); 1129 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1130 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1131 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1132 } 1133 break; 1134 1135 case SR_RECOGNIZER_INTERNAL_EOS_DETECTION: 1136 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("EOS_DETECTION"))); 1137 CA_LockUtteranceFromInput(impl->utterance); 1138 if (!CA_EndRecognition(impl->recognizer, modelsImpl->pattern, impl->utterance)) 1139 { 1140 rc = ESR_INVALID_STATE; 1141 PLogError(ESR_rc2str(rc)); 1142 goto CLEANUP; 1143 } 1144 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1145 if (impl->eventLog != NULL) 1146 { 1147 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_EOS_DETECTION -> SR_RECOGNIZER_INTERNAL_END"))); 1148 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1149 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1150 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1151 } 1152 break; 1153 1154 case SR_RECOGNIZER_INTERNAL_EOI: 1155 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("EOI"))); 1156 CA_LockUtteranceFromInput(impl->utterance); 1157 if (!CA_EndRecognition(impl->recognizer, modelsImpl->pattern, impl->utterance)) 1158 { 1159 rc = ESR_INVALID_STATE; 1160 PLogError(ESR_rc2str(rc)); 1161 goto CLEANUP; 1162 } 1163 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1164 if (impl->eventLog != NULL) 1165 { 1166 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_EOI -> SR_RECOGNIZER_INTERNAL_END"))); 1167 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1168 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1169 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1170 } 1171 break; 1172 1173 case SR_RECOGNIZER_INTERNAL_EOS: 1174 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("EOS"))); 1175 CA_LockUtteranceFromInput(impl->utterance); 1176 if (!CA_EndRecognition(impl->recognizer, modelsImpl->pattern, impl->utterance)) 1177 { 1178 rc = ESR_INVALID_STATE; 1179 PLogError(ESR_rc2str(rc)); 1180 goto CLEANUP; 1181 } 1182 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1183 if (impl->eventLog != NULL) 1184 { 1185 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RECOGNIZER_INTERNAL_EOS -> SR_RECOGNIZER_INTERNAL_END"))); 1186 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1187 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1188 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1189 } 1190 break; 1191 1192 case SR_RECOGNIZER_INTERNAL_END: 1193 /* Recognizer already shut down */ 1194 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), L("END"))); 1195 break; 1196 1197 default: 1198 /* Shut down recognizer */ 1199 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("MODE"), impl->internalState)); 1200 if (impl->eventLog != NULL) 1201 { 1202 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("unknown state -> SR_RECOGNIZER_INTERNAL_END"))); 1203 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 1204 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 1205 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 1206 } 1207 CA_LockUtteranceFromInput(impl->utterance); 1208 if (impl->isRecognizing) 1209 { 1210 if (!CA_EndRecognition(impl->recognizer, modelsImpl->pattern, impl->utterance)) 1211 { 1212 rc = ESR_INVALID_STATE; 1213 PLogError(ESR_rc2str(rc)); 1214 goto CLEANUP; 1215 } 1216 } 1217 rc = ESR_INVALID_STATE; 1218 PLogError(L("%s: %d"), ESR_rc2str(rc), impl->internalState); 1219 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 1220 goto CLEANUP; 1221 } 1222 if (impl->eventLog != NULL) 1223 { 1224 int n; 1225 LCHAR result[MAX_ENTRY_LENGTH]; 1226 result[0] = L('\0'); 1227 1228 n = CA_GetUnprocessedFramesInUtterance(impl->utterance); 1229 CHKLOG(rc, SR_EventLogTokenInt(impl->eventLog, L("CA_GetUnprocessedFramesInUtterance() (x10ms)"), n)); 1230 CA_FullResultLabel(impl->recognizer, result, MAX_ENTRY_LENGTH - 1); 1231 CHKLOG(rc, SR_EventLogToken(impl->eventLog, L("CA_FullResultLabel() (x20ms)"), result)); 1232 n = CircularBufferGetSize(impl->buffer); 1233 CHKLOG(rc, SR_EventLogTokenInt(impl->eventLog, L("CircularBufferGetSize() (samples)"), n / SAMPLE_SIZE)); 1234 } 1235 if (impl->lockFunction) 1236 impl->lockFunction(ESR_LOCK, impl->lockData); 1237 CircularBufferReset(impl->buffer); 1238 if (impl->lockFunction) 1239 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1240 if (CA_RecognitionHasResults(impl->recognizer)) 1241 CA_ClearResults(impl->recognizer); 1242 CA_FlushUtteranceFrames(impl->utterance); 1243 CA_CalculateCMSParameters(impl->wavein); 1244 CA_CloseDevice(impl->wavein); 1245 1246 /* record the OSI event */ 1247 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIstop"))); 1248 1249 if (impl->result != NULL) 1250 { 1251 CHKLOG(rc, SR_RecognizerResult_Destroy(impl->result)); 1252 impl->result = NULL; 1253 } 1254 1255 if (impl->lockFunction) 1256 impl->lockFunction(ESR_LOCK, impl->lockData); 1257 impl->gotLastFrame = ESR_TRUE; 1258 PLOG_DBG_TRACE((L("SR_Recognizer shutdown occured"))); 1259 impl->isStarted = ESR_FALSE; 1260 impl->isRecognizing = ESR_FALSE; 1261 if (impl->osi_log_level & OSI_LOG_LEVEL_AUDIO) 1262 SR_EventLogAudioClose(impl->eventLog); 1263 1264 impl->recogLogTimings.BORT = 0; 1265 impl->recogLogTimings.DURS = 0; 1266 impl->recogLogTimings.EORT = 0; 1267 impl->recogLogTimings.EOSD = 0; 1268 impl->recogLogTimings.EOSS = 0; 1269 impl->recogLogTimings.BOSS = 0; 1270 impl->recogLogTimings.EOST = 0; 1271 impl->eos_reason = L("undefined"); 1272 1273 if (impl->lockFunction) 1274 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1275 PLOG_DBG_API_EXIT(rc); 1276 return rc; 1277CLEANUP: 1278 PLOG_DBG_API_EXIT(rc); 1279 return rc; 1280} 1281 1282ESR_ReturnCode SR_RecognizerSetupImpl(SR_Recognizer* self) 1283{ 1284 ESR_ReturnCode rc; 1285 CA_AcoustInputParams* acousticParams = NULL; 1286 SR_AcousticModelsImpl* modelsImpl; 1287 SR_AcousticModels* models; 1288 SR_RecognizerImpl* recogImpl = NULL; 1289 CA_Acoustic* acoustic; 1290 size_t size, i; 1291 LCHAR filenames[P_PATH_MAX]; 1292 size_t len; 1293 1294 len = P_PATH_MAX; 1295 CHKLOG(rc, ESR_SessionGetLCHAR ( L("cmdline.modelfiles"), filenames, &len )); 1296 1297 CHKLOG(rc, SR_AcousticModelsLoad ( filenames, &models )); 1298 1299 if (models == NULL) 1300 { 1301 PLogError(L("ESR_INVALID_STATE while finding cmdline.modelfiles")); 1302 return ESR_INVALID_STATE; 1303 } 1304 modelsImpl = (SR_AcousticModelsImpl*) models; 1305 recogImpl = (SR_RecognizerImpl*) self; 1306 acousticParams = NULL; 1307 1308 CHKLOG(rc, SR_AcousticModelsGetCount(models, &size)); 1309 acousticParams = CA_AllocateAcousticParameters(); 1310 if (acousticParams == NULL) 1311 { 1312 rc = ESR_OUT_OF_MEMORY; 1313 PLogError(ESR_rc2str(rc)); 1314 goto CLEANUP; 1315 } 1316 CHKLOG(rc, modelsImpl->getLegacyParameters(acousticParams)); 1317 CHKLOG(rc, ArrayListGetSize(modelsImpl->acoustic, &size)); 1318 for (i = 0; i < size; ++i) 1319 { 1320 CHKLOG(rc, ArrayListGet(modelsImpl->acoustic, i, (void **)&acoustic)); 1321 CA_LoadModelsInAcoustic(recogImpl->recognizer, acoustic, acousticParams); 1322 } 1323 CA_FreeAcousticParameters(acousticParams); 1324 1325 recogImpl->models = models; 1326 CHKLOG(rc, modelsImpl->setupPattern(recogImpl->models, self)); 1327 return ESR_SUCCESS; 1328 CLEANUP: 1329 if (acousticParams != NULL) 1330 CA_FreeAcousticParameters(acousticParams); 1331 if (recogImpl != NULL) 1332 CA_UnloadRecognitionModels(recogImpl->recognizer); 1333 return rc; 1334} 1335 1336ESR_ReturnCode SR_RecognizerUnsetupImpl(SR_Recognizer* self) 1337{ 1338 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1339 SR_AcousticModelsImpl* modelsImpl = (SR_AcousticModelsImpl*) impl->models; 1340 ESR_ReturnCode rc; 1341 1342 CHKLOG(rc, modelsImpl->unsetupPattern(impl->models)); 1343 CA_UnloadRecognitionModels(impl->recognizer); 1344 CHKLOG(rc, SR_AcousticModelsDestroy ( impl->models )); 1345 impl->models = NULL; 1346 return ESR_SUCCESS; 1347 CLEANUP: 1348 return rc; 1349} 1350 1351ESR_ReturnCode SR_RecognizerIsSetupImpl(SR_Recognizer* self, ESR_BOOL* isSetup) 1352{ 1353 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1354 1355 if (isSetup == NULL) 1356 { 1357 PLogError(L("ESR_INVALID_ARGUMENT")); 1358 return ESR_INVALID_ARGUMENT; 1359 } 1360 *isSetup = impl->models != NULL; 1361 return ESR_SUCCESS; 1362} 1363 1364ESR_ReturnCode SR_RecognizerGetParameterImpl(SR_Recognizer* self, const LCHAR* key, 1365 LCHAR* value, size_t* len) 1366{ 1367 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1368 ESR_ReturnCode rc; 1369 1370 rc = impl->parameters->getLCHAR(impl->parameters, key, value, len); 1371 if (rc == ESR_NO_MATCH_ERROR) 1372 { 1373 CHKLOG(rc, ESR_SessionGetLCHAR(key, value, len)); 1374 return ESR_SUCCESS; 1375 } 1376 else if (rc != ESR_SUCCESS) 1377 { 1378 PLogError(ESR_rc2str(rc)); 1379 goto CLEANUP; 1380 } 1381 return ESR_SUCCESS; 1382CLEANUP: 1383 return rc; 1384} 1385 1386/* 1387 * The get / set code is a mess. Since we only use size_t parameters, that's all 1388 * that I am going to make work. The impl->parameters don't work so you always 1389 * have to get them from the session. The impl always logs an error. SteveR 1390 */ 1391 1392ESR_ReturnCode SR_RecognizerGetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, 1393 size_t* value) 1394{ 1395 ESR_ReturnCode rc; 1396 1397 CHKLOG(rc, ESR_SessionGetSize_t(key, value)); 1398 return ESR_SUCCESS; 1399CLEANUP: 1400 return rc; 1401} 1402 1403ESR_ReturnCode SR_RecognizerGetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL* value) 1404{ 1405 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1406 ESR_ReturnCode rc; 1407 1408 rc = impl->parameters->getBool(impl->parameters, key, value); 1409 if (rc == ESR_NO_MATCH_ERROR) 1410 { 1411 CHKLOG(rc, ESR_SessionGetBool(key, value)); 1412 return ESR_SUCCESS; 1413 } 1414 else if (rc != ESR_SUCCESS) 1415 { 1416 PLogError(ESR_rc2str(rc)); 1417 goto CLEANUP; 1418 } 1419 return ESR_SUCCESS; 1420CLEANUP: 1421 return rc; 1422} 1423 1424ESR_ReturnCode SR_RecognizerSetParameterImpl(SR_Recognizer* self, const LCHAR* key, 1425 LCHAR* value) 1426{ 1427 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1428 LCHAR temp[256]; 1429 ESR_ReturnCode rc; 1430 size_t len = 256; 1431 1432 rc = impl->parameters->getLCHAR(impl->parameters, key, temp, &len); 1433 if (rc == ESR_SUCCESS) 1434 { 1435 if (LSTRCMP(temp, value) == 0) 1436 return ESR_SUCCESS; 1437 CHKLOG(rc, impl->parameters->removeAndFreeProperty(impl->parameters, key)); 1438 } 1439 else if (rc != ESR_NO_MATCH_ERROR && rc != ESR_INVALID_RESULT_TYPE) 1440 { 1441 PLogError(ESR_rc2str(rc)); 1442 goto CLEANUP; 1443 } 1444 1445 CHKLOG(rc, impl->parameters->setLCHAR(impl->parameters, key, value)); 1446 return ESR_SUCCESS; 1447CLEANUP: 1448 return rc; 1449} 1450/* 1451 * The only set param function that is working is for the size_t parameters; and not 1452 * all of them are working, only the ones specified in the function itself. There are 1453 * two reasons for this: first most of the set functions just put the value in an unused 1454 * table that has no effect; second many of the changes need to be propogated to a specific 1455 * part of the code. This needs to be evaluated on a per parameter basis. SteveR 1456 */ 1457 1458/* 1459 * This function will be used to set parameters in the session. We need to go through 1460 * the recognizer so as to propogate the values into the recognizer. We will rely on 1461 * the session to do the right thing. SteveR 1462 */ 1463 1464ESR_ReturnCode SR_RecognizerSetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, 1465 size_t value) 1466{ 1467 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1468 ESR_ReturnCode rc; 1469 1470 rc = ESR_SessionSetSize_t ( key, value ); 1471 1472 if (rc == ESR_SUCCESS) 1473 { 1474 if ( LSTRCMP ( L("SREC.Recognizer.utterance_timeout"), key ) == 0 ) 1475 { 1476 impl->utterance_timeout = value; 1477 } 1478 else if ( LSTRCMP ( L("CREC.Recognizer.terminal_timeout"), key ) == 0 ) 1479 { 1480 impl->recognizer->eosd_parms->endnode_timeout = value; 1481 } 1482 else if ( LSTRCMP ( L("CREC.Recognizer.optional_terminal_timeout"), key ) == 0 ) 1483 { 1484 impl->recognizer->eosd_parms->optendnode_timeout = value; 1485 } 1486 else if ( LSTRCMP ( L("CREC.Recognizer.non_terminal_timeout"), key ) == 0 ) 1487 { 1488 impl->recognizer->eosd_parms->internalnode_timeout = value; 1489 } 1490 else if ( LSTRCMP ( L("CREC.Recognizer.eou_threshold"), key ) == 0 ) 1491 { 1492 impl->recognizer->eosd_parms->eos_costdelta = (frameID)value; 1493 impl->recognizer->eosd_parms->opt_eos_costdelta = (frameID)value; 1494 } 1495 else 1496 { 1497 PLogError(L("ESR_INVALID_ARGUMENT")); 1498 rc = ESR_INVALID_ARGUMENT; 1499 } 1500 } 1501 return rc; 1502} 1503 1504 1505ESR_ReturnCode SR_RecognizerSetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL value) 1506{ 1507 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1508 ESR_BOOL temp; 1509 ESR_ReturnCode rc; 1510 1511 rc = impl->parameters->getBool(impl->parameters, key, &temp); 1512 if (rc == ESR_SUCCESS) 1513 { 1514 if (temp == value) 1515 return ESR_SUCCESS; 1516 CHKLOG(rc, impl->parameters->removeAndFreeProperty(impl->parameters, key)); 1517 } 1518 else if (rc != ESR_NO_MATCH_ERROR && rc != ESR_INVALID_RESULT_TYPE) 1519 return rc; 1520 1521 CHKLOG(rc, impl->parameters->setBool(impl->parameters, key, value)); 1522 return ESR_SUCCESS; 1523CLEANUP: 1524 return rc; 1525} 1526 1527ESR_ReturnCode SR_RecognizerHasSetupRulesImpl(SR_Recognizer* self, ESR_BOOL* hasSetupRules) 1528{ 1529 SR_RecognizerImpl* recogImpl = (SR_RecognizerImpl*) self; 1530 size_t size; 1531 ESR_ReturnCode rc; 1532 1533 if (hasSetupRules == NULL) 1534 { 1535 PLogError(L("ESR_INVALID_ARGUMENT")); 1536 return ESR_INVALID_ARGUMENT; 1537 } 1538 CHKLOG(rc, HashMapGetSize(recogImpl->grammars, &size)); 1539 *hasSetupRules = size > 0; 1540 return ESR_SUCCESS; 1541CLEANUP: 1542 return rc; 1543} 1544 1545ESR_ReturnCode SR_RecognizerActivateRuleImpl(SR_Recognizer* self, SR_Grammar* grammar, 1546 const LCHAR* ruleName, unsigned int weight) 1547{ 1548 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1549 SR_GrammarImpl* grammarImpl = (SR_GrammarImpl*) grammar; 1550 SR_AcousticModelsImpl* modelsImpl; 1551 LCHAR grammarID[80]; 1552 ESR_ReturnCode rc; 1553 char *failure_reason = NULL; 1554 1555 if (grammar == NULL) 1556 { 1557 if (impl->eventLog) 1558 failure_reason = "badinput"; 1559 rc = ESR_INVALID_ARGUMENT; 1560 PLogError(L("ESR_INVALID_ARGUMENT")); 1561 goto CLEANUP; 1562 } 1563 1564 if (impl->models == NULL) 1565 { 1566 failure_reason = "nomodels"; 1567 rc = ESR_INVALID_STATE; 1568 PLogError(L("acoustic models must be configured")); 1569 goto CLEANUP; 1570 } 1571 1572 modelsImpl = (SR_AcousticModelsImpl*) impl->models; 1573 1574 if (ruleName == NULL) 1575 psprintf(grammarID, L("%p"), grammar); 1576 else 1577 { 1578 if (LSTRLEN(ruleName) > 80) 1579 { 1580 rc = ESR_BUFFER_OVERFLOW; 1581 PLogError(ESR_rc2str(rc)); 1582 goto CLEANUP; 1583 } 1584 LSTRCPY(grammarID, ruleName); 1585 } 1586 1587 CHKLOG(rc, HashMapPut(impl->grammars, grammarID, grammar)); 1588 if (CA_SetupSyntaxForRecognizer(grammarImpl->syntax, impl->recognizer)) 1589 { 1590 failure_reason = "cafailed"; 1591 rc = ESR_INVALID_STATE; 1592 PLogError(L("ESR_INVALID_STATE")); 1593 goto CLEANUP; 1594 } 1595 1596 CHKLOG(rc, SR_Grammar_SetupRecognizer(grammar, self)); 1597 grammarImpl->isActivated = ESR_TRUE; 1598 1599 /* 1600 * If we want to log dynamically added words, then we must give the grammar a reference 1601 * to our event log. The grammar logs word additions if and only if its reference to 1602 * eventLog is non-null. 1603 */ 1604 if (impl->osi_log_level & OSI_LOG_LEVEL_ADDWD) 1605 grammarImpl->eventLog = impl->eventLog; 1606 else 1607 grammarImpl->eventLog = NULL; 1608 1609 rc = ESR_SUCCESS; 1610 1611CLEANUP: 1612 if (impl->eventLog) 1613 { 1614 if (failure_reason) 1615 { 1616 SR_EventLogTokenInt(impl->eventLog, L("igrm"), (int) grammar); 1617 SR_EventLogToken(impl->eventLog, L("rule"), ruleName); 1618 SR_EventLogToken(impl->eventLog, L("rslt"), "fail"); 1619 SR_EventLogToken(impl->eventLog, L("reason"), failure_reason); 1620 SR_EventLogEvent(impl->eventLog, L("ESRacGrm")); 1621 } 1622 else 1623 { 1624 SR_EventLogTokenInt(impl->eventLog, L("igrm"), (int) grammar); 1625 SR_EventLogToken(impl->eventLog, L("rule"), ruleName); 1626 SR_EventLogToken(impl->eventLog, L("rslt"), "ok"); 1627 SR_EventLogEvent(impl->eventLog, L("ESRacGrm")); 1628 } 1629 } 1630 return rc; 1631} 1632 1633ESR_ReturnCode SR_RecognizerDeactivateRuleImpl(SR_Recognizer* self, SR_Grammar* grammar, 1634 const LCHAR* ruleName) 1635{ 1636 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1637 SR_GrammarImpl* grammarImpl = (SR_GrammarImpl*) grammar; 1638 LCHAR grammarID[MAX_INT_DIGITS+1]; 1639 ESR_ReturnCode rc; 1640 1641 if (ruleName == NULL) 1642 { 1643 psprintf(grammarID, L("%p"), grammar); 1644 CHKLOG(rc, HashMapRemove(impl->grammars, grammarID)); 1645 } 1646 else 1647 CHKLOG(rc, HashMapRemove(impl->grammars, ruleName)); 1648 grammarImpl->isActivated = ESR_FALSE; 1649 return ESR_SUCCESS; 1650CLEANUP: 1651 return rc; 1652} 1653 1654ESR_ReturnCode SR_RecognizerDeactivateAllRulesImpl(SR_Recognizer* self) 1655{ 1656 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1657 ESR_ReturnCode rc; 1658 1659 CHKLOG(rc, HashMapRemoveAll(impl->grammars)); 1660 CA_ClearSyntaxForRecognizer(0, impl->recognizer); 1661 return ESR_SUCCESS; 1662CLEANUP: 1663 return rc; 1664} 1665 1666ESR_ReturnCode SR_RecognizerIsActiveRuleImpl(SR_Recognizer* self, SR_Grammar* grammar, 1667 const LCHAR* ruleName, ESR_BOOL* isActiveRule) 1668{ 1669 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1670 LCHAR grammarID[MAX_INT_DIGITS+1]; 1671 ESR_ReturnCode rc; 1672 1673 psprintf(grammarID, L("%p"), grammar); 1674 CHKLOG(rc, HashMapContainsKey(impl->grammars, (LCHAR*) &grammarID, isActiveRule)); 1675 return ESR_SUCCESS; 1676CLEANUP: 1677 return rc; 1678} 1679 1680ESR_ReturnCode SR_RecognizerSetWordAdditionCeilingImpl(SR_Recognizer* self, SR_Grammar* grammar) 1681{ 1682 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1683 SR_GrammarImpl* grammarImpl = (SR_GrammarImpl*)grammar; 1684 int iRc; 1685 1686 if(!impl || !grammarImpl) 1687 return ESR_INVALID_ARGUMENT; 1688 iRc = CA_CeilingSyntaxForRecognizer( grammarImpl->syntax, impl->recognizer); 1689 if(iRc) return ESR_INVALID_STATE; 1690 1691 return ESR_SUCCESS; 1692} 1693 1694ESR_ReturnCode SR_RecognizerCheckGrammarConsistencyImpl(SR_Recognizer* self, SR_Grammar* grammar, 1695 ESR_BOOL* isConsistent) 1696{ 1697 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1698 SR_GrammarImpl* grammarImpl; 1699 SR_RecognizerImpl* impl2; 1700 1701 1702 grammarImpl = (SR_GrammarImpl*) grammar; 1703 impl2 = (SR_RecognizerImpl*)grammarImpl->recognizer; 1704 // *isConsistent = grammarImpl->models == impl->models; 1705 *isConsistent = (impl2->models == impl->models); 1706 return ESR_SUCCESS; 1707} 1708 1709ESR_ReturnCode SR_RecognizerGetModelsImpl(SR_Recognizer* self, SR_AcousticModels** pmodels) 1710{ 1711 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1712 *pmodels = impl->models; 1713 return ESR_SUCCESS; 1714} 1715 1716ESR_ReturnCode SR_RecognizerPutAudioImpl(SR_Recognizer* self, asr_int16_t* buffer, size_t* bufferSize, 1717 ESR_BOOL isLast) 1718{ 1719 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1720 ESR_ReturnCode rc; 1721 int rcBufWrite; 1722 size_t nbWritten; 1723 1724 if (isLast == ESR_FALSE && (buffer == NULL || bufferSize == NULL)) 1725 { 1726 PLogError(L("ESR_INVALID_ARGUMENT")); 1727 return ESR_INVALID_ARGUMENT; 1728 } 1729 1730 if (impl->lockFunction) 1731 impl->lockFunction(ESR_LOCK, impl->lockData); 1732 if (!impl->isStarted) 1733 { 1734 if (impl->lockFunction) 1735 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1736 PLogMessage(L("ESR_INVALID_STATE: Tried pushing audio while recognizer was offline")); 1737 return ESR_INVALID_STATE; 1738 } 1739 if (impl->gotLastFrame) 1740 { 1741 if (impl->lockFunction) 1742 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1743 PLogMessage(L("ESR_INVALID_STATE: isLast=TRUE")); 1744 return ESR_INVALID_STATE; 1745 } 1746 if (buffer == NULL && isLast == ESR_FALSE) 1747 { 1748 if (impl->lockFunction) 1749 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1750 PLogError(L("ESR_INVALID_ARGUMENT: got NULL buffer on non-terminal frame")); 1751 return ESR_INVALID_ARGUMENT; 1752 } 1753 1754 rcBufWrite = CircularBufferWrite(impl->buffer, buffer, *bufferSize * SAMPLE_SIZE); 1755 if (rcBufWrite < 0) 1756 { 1757 rc = ESR_INVALID_STATE; 1758 PLogError(L("%s: error writing to buffer (buffer=%d, available=%u)"), ESR_rc2str(rc), (int) impl->buffer, CircularBufferGetAvailable(impl->buffer)); 1759 goto CLEANUP; 1760 } 1761 1762 nbWritten = (size_t)rcBufWrite; 1763 if (nbWritten % SAMPLE_SIZE != 0) 1764 { 1765 size_t amountUnwritten; 1766 1767 /* The buffer is byte-based while we're sample based. Make sure we write entire samples or not at all */ 1768 amountUnwritten = CircularBufferUnwrite(impl->buffer, nbWritten % SAMPLE_SIZE); 1769 passert(amountUnwritten == nbWritten % SAMPLE_SIZE); 1770 nbWritten -= amountUnwritten; 1771 } 1772 passert(nbWritten % 2 == 0); /* make sure CircularBufferSize is divisible by 2 */ 1773 1774 if (nbWritten < *bufferSize * SAMPLE_SIZE) 1775 { 1776 rc = ESR_BUFFER_OVERFLOW; 1777#ifndef NDEBUG 1778 PLOG_DBG_TRACE((L("%s: writing to circular buffer"), ESR_rc2str(rc))); 1779#endif 1780 *bufferSize = nbWritten / SAMPLE_SIZE; 1781 if (impl->lockFunction) 1782 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1783 goto CLEANUP; 1784 } 1785 if (impl->lockFunction) 1786 impl->lockFunction(ESR_UNLOCK, impl->lockData); 1787 1788 if (isLast) 1789 impl->gotLastFrame = ESR_TRUE; 1790 return ESR_SUCCESS; 1791CLEANUP: 1792 return rc; 1793} 1794 1795/* utility function to sort the ArrayList of nbest list results by the score of the first 1796 semantic result */ 1797ESR_ReturnCode SemanticResults_SortByScore(ArrayList *results, size_t nbestSize) 1798{ 1799 ESR_ReturnCode rc; 1800 ArrayList* semanticResultList; 1801 ArrayList* semanticResultList_swap; 1802 SR_SemanticResult* semanticResult_i; 1803 SR_SemanticResult* semanticResult_j; 1804 size_t i, j; 1805 LCHAR scoreStr[MAX_ENTRY_LENGTH] ; 1806 size_t scoreStrLen = MAX_ENTRY_LENGTH ; 1807 int score_i, score_j; 1808 1809 /* bubble sort */ 1810 for (i = 0; i < (size_t)nbestSize; ++i) 1811 { 1812 for (j = i + 1; j < (size_t)nbestSize; ++j) 1813 { 1814 /* get for i */ 1815 CHKLOG(rc, ArrayListGet(results, i, (void **)&semanticResultList)); /* nbest index */ 1816 CHKLOG(rc, ArrayListGet(semanticResultList, 0, (void **)&semanticResult_i)); /* semresult 0 */ 1817 1818 /* get for j */ 1819 CHKLOG(rc, ArrayListGet(results, j, (void **)&semanticResultList)); /* nbest index */ 1820 CHKLOG(rc, ArrayListGet(semanticResultList, 0, (void **)&semanticResult_j)); /* semresult 0 */ 1821 1822 scoreStrLen = MAX_ENTRY_LENGTH ; 1823 CHKLOG(rc, semanticResult_i->getValue(semanticResult_i, "raws", scoreStr, &scoreStrLen)); 1824 CHKLOG(rc, lstrtoi(scoreStr, &score_i, 10)); 1825 scoreStrLen = MAX_ENTRY_LENGTH ; 1826 CHKLOG(rc, semanticResult_j->getValue(semanticResult_j, "raws", scoreStr, &scoreStrLen)); 1827 CHKLOG(rc, lstrtoi(scoreStr, &score_j, 10)); 1828 1829 if (score_j < score_i) 1830 { 1831 /* need to swap */ 1832 CHKLOG(rc, ArrayListGet(results, i, (void **)&semanticResultList_swap)); /* put i in swap */ 1833 CHKLOG(rc, ArrayListSet(results, i, semanticResultList)); /* put j in i */ 1834 CHKLOG(rc, ArrayListSet(results, j, semanticResultList_swap)); /* put swap in j */ 1835 } 1836 } 1837 } 1838 return ESR_SUCCESS; 1839CLEANUP: 1840 return rc; 1841} 1842 1843ESR_ReturnCode filter_CA_FullResultLabel(const LCHAR* label, LCHAR *filtered_label, size_t* boss, size_t* eoss) 1844{ 1845 ESR_ReturnCode rc; 1846 enum 1847 { 1848 NO_COPY, 1849 FRAME, 1850 WORD, 1851 } filter_state = WORD; 1852 LCHAR *dst = filtered_label; 1853 LCHAR eosBuf[16]; /* max 9999 + '\0' */ 1854 LCHAR bosBuf[16]; /* max 9999 + '\0' */ 1855 LCHAR* pBuf = NULL; 1856 1857 /** 1858 * example: you want to filter this: 1859 * 1860 * "-pau-@23 clock@97 twenty_four@125 hour@145 " 1861 * ^boss = 23 ^ eoss = 145 1862 * and get this: 1863 * 1864 * "clock twenty_four hour" 1865 */ 1866 1867 passert(LSTRLEN(label) > 0); 1868 while (*label) 1869 { 1870 switch (filter_state) 1871 { 1872 case NO_COPY: 1873 if (*label == L(' ')) 1874 filter_state = WORD; 1875 else if (*label == L('@')) 1876 { 1877 filter_state = FRAME; 1878 if (pBuf == NULL) 1879 pBuf = bosBuf; 1880 else 1881 { 1882 *pBuf = 0; 1883 pBuf = eosBuf; 1884 } 1885 } 1886 break; 1887 case WORD: 1888 if (*label == L('@')) 1889 { 1890 *dst = L(' '); /* insert space */ 1891 dst++; 1892 filter_state = FRAME; 1893 if (pBuf == NULL) 1894 pBuf = bosBuf; 1895 else 1896 { 1897 *pBuf = 0; 1898 pBuf = eosBuf; 1899 } 1900 } 1901 else 1902 { 1903 *dst = *label; 1904 dst++; 1905 } 1906 break; 1907 case FRAME: 1908 if (*label == L(' ')) 1909 filter_state = WORD; 1910 else 1911 { 1912 *pBuf = *label; 1913 pBuf++; 1914 } 1915 break; 1916 } 1917 label++; 1918 } 1919 *dst = 0; /* term the string */ 1920 *pBuf = 0; /* term the string */ 1921 1922 /* trim the end spaces */ 1923 dst--; 1924 while (*dst == ' ') 1925 *dst-- = '\0'; 1926 1927 /* set the eos signal indicated by the end pointed data */ 1928 if (eosBuf[0] != 0) 1929 CHKLOG(rc, lstrtoui(eosBuf, eoss, 10)); 1930 else 1931 eoss = 0; 1932 1933 if (bosBuf[0] != 0) 1934 CHKLOG(rc, lstrtoui(bosBuf, boss, 10)); 1935 else 1936 boss = 0; 1937 1938 return ESR_SUCCESS; 1939CLEANUP: 1940 return rc; 1941} 1942 1943/** 1944 * Populates the recognizer result if it can, otherwise it returns NO MATCH cuz no results exist 1945 * 1946 * INPUT STATE: SR_RECOGNIZER_INTERNAL_EOS 1947 * 1948 * @param self SR_Recognizer handle 1949 * @todo break up into smaller functions 1950 */ 1951ESR_ReturnCode SR_RecognizerCreateResultImpl(SR_Recognizer* self, SR_RecognizerStatus* status, 1952 SR_RecognizerResultType* type) 1953{ 1954 LCHAR label[MAX_ENTRY_LENGTH * 2]; /* run out of buffer */ 1955#define WORDID_COUNT 48 /* can be quite high for voice enrollment! */ 1956 wordID wordIDs[WORDID_COUNT]; 1957 LCHAR tok[80]; 1958 LCHAR waveformFilename[P_PATH_MAX]; 1959 LCHAR* pkey; 1960 SR_GrammarImpl* pgrammar; 1961 asr_int32_t raws; /* raw score */ 1962 size_t iBest, nbestSize, jBest, k, grammarSize, semanticResultsSize, grammarIndex_for_iBest; 1963 LCHAR* lValue; 1964 LCHAR* lValue2; 1965 int confValue; 1966 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 1967 SR_RecognizerResultImpl* resultImpl = (SR_RecognizerResultImpl*) impl->result; 1968 ESR_BOOL containsKey; 1969 int valid, score, recogID; 1970 LCHAR result[MAX_ENTRY_LENGTH]; 1971 size_t len, size; 1972 size_t locale; 1973 int current_choice; 1974 1975 /** 1976 * Semantic result stuff 1977 */ 1978 /* a temp buffer to hold semantic results of a parse (there may be several results) */ 1979 SR_SemanticResult* semanticResults[MAX_SEM_RESULTS]; 1980 ArrayList* semanticList; 1981 ArrayList* semanticList2; 1982 SR_SemanticResultImpl* semanticImpl; 1983 SR_SemanticResultImpl* semanticImpl2; 1984 SR_SemanticResult* semanticResult; 1985 SR_SemanticResult* semanticResult2; 1986 waveform_buffering_state_t buffering_state; 1987 1988 SR_AcousticModelsImpl* modelsImpl = (SR_AcousticModelsImpl*) impl->models; 1989 ESR_ReturnCode rc; 1990 PTimeStamp EORT; 1991 1992 CA_LockUtteranceFromInput(impl->utterance); 1993 if (!CA_EndRecognition(impl->recognizer, modelsImpl->pattern, impl->utterance)) 1994 { 1995 PLogError(L("ESR_INVALID_STATE")); 1996 return ESR_INVALID_STATE; 1997 } 1998 1999 /* check if the forward search was successful */ 2000 valid = CA_FullResultLabel(impl->recognizer, result, MAX_ENTRY_LENGTH - 1); 2001 CA_GetRecogID(impl->recognizer, &recogID); 2002 CA_FullResultScore(impl->recognizer, &score, 1); 2003#ifdef SREC_ENGINE_VERBOSE_LOGGING 2004 PLogMessage(L("R: %s type %d score %d from recognizer%d"), result, type, score, valid, recogID); 2005 PLogMessage(L("R: %s score %d from recognizer%d"), result, score, valid, recogID); 2006#endif 2007#ifdef _WIN32 2008 //pfprintf(PSTDOUT, ("R: %s type %d score %d from recognizer%d\n"), result, type, score, valid, recogID); 2009#endif 2010 2011 2012 switch (valid) 2013 { 2014 case FULL_RESULT: 2015 CHKLOG(rc, filter_CA_FullResultLabel(result, label, &impl->recogLogTimings.BOSS, &impl->recogLogTimings.EOSS)); 2016#ifdef SREC_ENGINE_VERBOSE_LOGGING 2017 PLogMessage("R: %s", result); 2018#endif 2019 CA_FullResultScore(impl->recognizer, (int*) &raws, 0); 2020#ifdef SREC_ENGINE_VERBOSE_LOGGING 2021 PLogMessage("S: %d", raws); 2022#endif 2023 2024 /* now that we have an endpointed result, we can parse the result transcription 2025 to see where speech started and ended. Then we can trim off excess parts of the 2026 recorded audio waveform (if exists) so that nametags are just the right amount of 2027 audio 2028 */ 2029 CHKLOG(rc, WaveformBuffer_GetBufferingState(impl->waveformBuffer, &buffering_state)); 2030 if (buffering_state != WAVEFORM_BUFFERING_OFF) 2031 { 2032 CHKLOG(rc, WaveformBuffer_GetSize(impl->waveformBuffer, &size)); 2033 if (size > 0) 2034 { 2035 rc = WaveformBuffer_ParseEndPointedResultAndTrim(impl->waveformBuffer, result, impl->FRAME_SIZE); 2036 if (rc == ESR_BUFFER_OVERFLOW) 2037 { 2038 /* Nametag EOS occured beyond end of buffer */ 2039 } 2040 else if (rc != ESR_SUCCESS) 2041 { 2042 PLogError(ESR_rc2str(rc)); 2043 goto CLEANUP; 2044 } 2045 } 2046 } 2047 break; 2048 2049 case REJECT_RESULT: 2050#ifdef SREC_ENGINE_VERBOSE_LOGGING 2051 PLogMessage(L("R: <REJECTED>")); 2052#endif 2053 break; 2054 default: 2055#ifdef SREC_ENGINE_VERBOSE_LOGGING 2056 PLogMessage(L("E: No results available")); 2057 PLogMessage(L("R: <FAILED>")); 2058#endif 2059 break; 2060 } 2061 2062 2063 if (valid == FULL_RESULT) 2064 { 2065 /* Populate SR_RecognizerResult */ 2066 resultImpl->nbestList = CA_PrepareNBestList(impl->recognizer, 10, &raws); 2067 if (resultImpl->nbestList == NULL) 2068 { 2069 /* 2070 * This is not a failure. It simply means that I have not advanced far 2071 * enough in recognition in order to obtain results (no paths in 2072 * graph). This occurs, for instance, when a eof is reached (no more data) 2073 * and I have not even created any paths in my graph. 2074 */ 2075 2076 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2077 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2078 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2079 if (impl->eventLog != NULL) 2080 { 2081 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RecognizerCreateResultImpl() -> SR_RECOGNIZER_INTERNAL_END"))); 2082 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2083 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2084 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2085 } 2086 passert(0); 2087 return ESR_SUCCESS; 2088 } 2089 2090 nbestSize = CA_NBestListCount(resultImpl->nbestList); 2091 } 2092 else 2093 nbestSize = 0; 2094 2095 if (resultImpl->results != NULL) 2096 ArrayListRemoveAll(resultImpl->results); 2097 else 2098 CHKLOG(rc, ArrayListCreate(&resultImpl->results)); 2099 if (nbestSize == 0) 2100 { 2101 /* 2102 * Got empty n-best list even though the recognition was successful. 2103 * We handle this in the same way that recog_startpt does... we consider it a no match. 2104 * We could adjust the CREC.Recognizer.viterbi_prune_thresh to a higher level, but that 2105 * may not fix the problem completely. We need to fix the bug in the astar search!!! 2106 */ 2107 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2108 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2109 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2110 if (impl->eventLog != NULL) 2111 { 2112 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RecognizerCreateResultImpl() -> SR_RECOGNIZER_INTERNAL_END"))); 2113 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2114 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2115 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2116 } 2117#ifdef SREC_ENGINE_VERBOSE_LOGGING 2118 PLogMessage(L("ESR_INVALID_STATE: got empty n-best list even though the recognition was successful")); 2119#endif 2120 return ESR_SUCCESS; /* we do not want to halt the app in this case */ 2121 } 2122 else 2123 { 2124 *status = SR_RECOGNIZER_EVENT_RECOGNITION_RESULT; 2125 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2126 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2127 if (impl->eventLog != NULL) 2128 { 2129 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RecognizerCreateResultImpl() -> SR_RECOGNIZER_INTERNAL_END"))); 2130 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2131 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2132 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2133 } 2134 } 2135 2136 /** 2137 * All grammars associated with the recognizer are considered to be active 2138 * and therefore, I do a semantic parse on each. On the first grammar that 2139 * gives one or more semantic results, I stop parsing the other grammars. 2140 */ 2141 CHKLOG(rc, impl->grammars->getSize(impl->grammars, &grammarSize)); 2142 ASSERT( grammarSize == 1); 2143 2144 for (iBest = 0; iBest < nbestSize; ++iBest) 2145 { 2146 len = WORDID_COUNT; 2147 if (CA_NBestListGetResultWordIDs(resultImpl->nbestList, iBest, wordIDs, &len, &raws) != ESR_SUCCESS) 2148 { 2149 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2150 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2151 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2152 if (impl->eventLog != NULL) 2153 { 2154 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("SR_RecognizerCreateResultImpl() -> SR_RECOGNIZER_INTERNAL_END"))); 2155 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2156 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2157 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2158 } 2159 PLogError(L("ESR_INVALID_STATE: got bad n-best list entry %d"), iBest); 2160 return ESR_INVALID_STATE; 2161 } 2162 2163 CHKLOG(rc, ArrayListCreate(&semanticList)); 2164 CHKLOG(rc, resultImpl->results->add(resultImpl->results, semanticList)); 2165 2166 grammarIndex_for_iBest = 0; 2167 CHKLOG(rc, impl->grammars->getKeyAtIndex(impl->grammars, grammarIndex_for_iBest, &pkey)); 2168 CHKLOG(rc, impl->grammars->get(impl->grammars, pkey, (void **)&pgrammar)); 2169 2170 CHKLOG(rc, SR_GrammarGetSize_tParameter((SR_Grammar*) pgrammar, L("locale"), &locale)); 2171 resultImpl->locale = locale; 2172 2173 /* I need to manage my semantic results external to the check parse function */ 2174 for (k = 0; k < MAX_SEM_RESULTS; ++k) 2175 SR_SemanticResultCreate(&semanticResults[k]); 2176 2177 /* 2178 The code here tries to make the voice-enrollment more effective. 2179 The VE grammar decodes a sequence of best phonemes, but the nbest 2180 processing may find a better score for an alternative choice than 2181 the score of the viterbi best choice. The reason for this is that 2182 alternative choices don't honor cross-word context-dependency quite 2183 accurately. If we choose an alternative choice then the sequence of 2184 phoneme decoded does not correspond to the sequence of models decoded. 2185 To counter this, we FORCIBLY make sure the top choice here is the 2186 VITERBI top choice. 2187 */ 2188 2189 if (iBest == 0) 2190 { 2191 if (CA_IsEnrollmentSyntax( pgrammar->syntax)) { 2192 /* this was voice enrollment, so let's try to replace */ 2193 // char* word1 = CA_NBestListGetResultWord(resultImpl->nbestList,wordIDs[0]); 2194 // char* word2 = CA_NBestListGetResultWord(resultImpl->nbestList,wordIDs[1]); 2195 // if (!strncmp(word1,voice_enroll_word_prefix,VEWPLEN)&&!strncmp(word2,voice_enroll_word_prefix,VEWPLEN)) 2196 len = WORDID_COUNT; 2197 rc = CA_FullResultWordIDs(impl->recognizer, wordIDs, &len); 2198 if (rc != ESR_SUCCESS) 2199 { 2200 /* in case of problem with viterbi path choice, we revert back */ 2201 len = WORDID_COUNT; 2202 rc = CA_NBestListGetResultWordIDs(resultImpl->nbestList, iBest, wordIDs, &len, &raws) ; 2203 } 2204 } 2205 } 2206 2207 LSTRCPY(label, L("")); 2208 for (k = 0; wordIDs[k] != MAXwordID; ++k) 2209 { 2210 LCHAR* wordk = NULL; 2211 wordk = CA_NBestListGetResultWord(resultImpl->nbestList,wordIDs[k]); 2212 LSTRCAT(label, wordk); 2213 LSTRCAT(label, L(" ")); 2214 } 2215 CHKLOG(rc, CA_ResultStripSlotMarkers(label)); 2216 passert(LSTRCMP(label, L("")) != 0); 2217 2218 /* strip the trailing blank */ 2219 k = LSTRLEN(label) - 1; 2220 if (k > 0 && label[k] == L(' ')) 2221 label[k] = 0; 2222 2223 semanticResultsSize = MAX_SEM_RESULTS; 2224 2225#if SEMPROC_ACTIVE 2226 2227 /* set the literal prior to processing so that semproc can read the value 2228 during processing */ 2229 CHKLOG(rc, pgrammar->semproc->flush(pgrammar->semproc)); 2230 CHKLOG(rc, pgrammar->semproc->setParam(pgrammar->semproc, L("literal"), label)); 2231 2232 rc = pgrammar->semproc->checkParseByWordID(pgrammar->semproc, pgrammar->semgraph, 2233 wordIDs, semanticResults, &semanticResultsSize); 2234 2235 /* rc = pgrammar->semproc->checkParse(pgrammar->semproc, pgrammar->semgraph, 2236 label, semanticResults, &semanticResultsSize); */ 2237 2238 if (rc != ESR_SUCCESS) 2239 { 2240 for (k = 0; k < MAX_SEM_RESULTS; ++k) 2241 { 2242 semanticResults[k]->destroy(semanticResults[k]); 2243 semanticResults[k] = NULL; 2244 } 2245 goto CLEANUP; 2246 } 2247#else 2248 semanticResultsSize = 0; 2249#endif 2250 /* cleanup the empty ones */ 2251 for (k = semanticResultsSize; k < MAX_SEM_RESULTS; ++k) 2252 { 2253 CHKLOG(rc, semanticResults[k]->destroy(semanticResults[k])); 2254 semanticResults[k] = NULL; 2255 } 2256 2257 /* save the good ones */ 2258 for (k = 0; k < semanticResultsSize; ++k) 2259 { 2260 /* 2261 * Save the pointer to the semantic result that was created. 2262 * Remember that the semantic result array only holds pointers 2263 * and for each time that the function is called, new semantic results 2264 * are created, and the pointers overwrite old values in the array 2265 */ 2266 CHKLOG(rc, semanticList->add(semanticList, semanticResults[k])); 2267 } 2268 2269#if SEMPROC_ACTIVE 2270 if (semanticResultsSize > 0) 2271 { 2272 /* OSI log the grammar(s) that was used in recognizing */ 2273 psprintf(tok, L("GURI%d"), grammarIndex_for_iBest); 2274 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("GRMR"), tok)); 2275 } 2276#else 2277 /* OSI log the grammar(s) that was used in recognizing */ 2278 psprintf(tok, L("GURI%d"), grammarIndex_for_iBest); 2279 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("GRMR"), tok)); 2280#endif 2281 2282 /* Populate semantic results for each nbest list entry */ 2283 CHKLOG(rc, semanticList->getSize(semanticList, &semanticResultsSize)); 2284 if (semanticResultsSize == 0) 2285 { 2286 /* 2287 * If there was no semantic result... then I need to create one so that I can store 2288 * literal, conf, meaning which are default keys that must ALWAYS exist 2289 */ 2290 CHKLOG(rc, SR_SemanticResultCreate(&semanticResult)); 2291 CHKLOG(rc, semanticList->add(semanticList, semanticResult)); 2292 semanticResultsSize = 1; 2293 } 2294 2295 for (k = 0; k < semanticResultsSize;++k) 2296 { 2297 CHKLOG(rc, semanticList->get(semanticList, k, (void **)&semanticResult)); 2298 if (semanticResult == NULL) 2299 { 2300 PLogError(L("nbest entry contained NULL semanticResult"), ESR_INVALID_STATE); 2301 return ESR_INVALID_STATE; 2302 } 2303 2304 semanticImpl = (SR_SemanticResultImpl*) semanticResult; 2305 2306 /* put in the literal */ 2307 lValue = MALLOC(sizeof(LCHAR) * (LSTRLEN(label) + 1), MTAG); 2308 if (lValue == NULL) 2309 { 2310 PLogError(L("ESR_OUT_OF_MEMORY")); 2311 return ESR_OUT_OF_MEMORY; 2312 } 2313 LSTRCPY(lValue, label); 2314 CHKLOG(rc, semanticImpl->results->put(semanticImpl->results, L("literal"), lValue)); 2315 2316 /* if the meaning is not set, then put in the meaning which will be the literal */ 2317 CHKLOG(rc, semanticImpl->results->containsKey(semanticImpl->results, L("meaning"), &containsKey)); 2318 if (!containsKey) 2319 { 2320 lValue = MALLOC(sizeof(LCHAR) * (LSTRLEN(label) + 1), MTAG); 2321 if (lValue == NULL) 2322 { 2323 PLogError(L("ESR_OUT_OF_MEMORY")); 2324 return ESR_OUT_OF_MEMORY; 2325 } 2326 LSTRCPY(lValue, label); 2327 CHKLOG(rc, semanticImpl->results->put(semanticImpl->results, L("meaning"), lValue)); 2328 } 2329 2330 /* put in the raw score */ 2331 psprintf(label, L("%d"), raws); 2332 lValue = MALLOC(sizeof(LCHAR) * (LSTRLEN(label) + 1), MTAG); 2333 if (lValue == NULL) 2334 { 2335 PLogError(L("ESR_OUT_OF_MEMORY")); 2336 return ESR_OUT_OF_MEMORY; 2337 } 2338 LSTRCPY(lValue, label); 2339 CHKLOG(rc, semanticImpl->results->put(semanticImpl->results, L("raws"), lValue)); 2340 } 2341 } 2342 2343 /* Now I have an nBest list where each entry has at least one semantic result */ 2344 /* What I need to do is filter out the nBest list entries which have matching 2345 semantic results for 'meaning' */ 2346 /* Once I have filtered out the nBest list based on this criteria, I can calculate the confidence 2347 score and populate the result of the first entry with the raw score */ 2348 2349#if FILTER_NBEST_BY_SEM_RESULT 2350 2351 for (iBest = nbestSize-1; iBest>0; iBest--) /* do not filter out nBest entry 0 */ 2352 { 2353 /** 2354 * This is the entry (indexed by i) targeted for removal 2355 * 2356 */ 2357 2358 /* get the nBest entry which you wish to remove (if duplicate found) */ 2359 CHKLOG(rc, ArrayListGet(resultImpl->results, iBest, (void **)&semanticList)); 2360 2361 /* get the first sem_result for the entry */ 2362 CHKLOG(rc, ArrayListGet(semanticList, 0, (void **)&semanticResult)); 2363 semanticImpl = (SR_SemanticResultImpl*) semanticResult; 2364 2365 /* get the meaning */ 2366 CHKLOG(rc, semanticImpl->results->get(semanticImpl->results, L("meaning"), (void **)&lValue)); 2367 2368 /* get the other entries to check against (start with 0, end on the current i entry) */ 2369 for (jBest = 0; jBest < iBest; ++jBest) 2370 { 2371 /* 2372 * This is the entry (indexed by jBest) that we will compare with 2373 */ 2374 2375 /* get the nBest entry which you wish to compare with */ 2376 CHKLOG(rc, ArrayListGet(resultImpl->results, jBest, (void **)&semanticList2)); 2377 2378 CHKLOG(rc, ArrayListGet(semanticList2, 0, (void **)&semanticResult2)); 2379 semanticImpl2 = (SR_SemanticResultImpl*) semanticResult2; 2380 2381 CHKLOG(rc, semanticImpl2->results->get(semanticImpl2->results, L("meaning"), (void **)&lValue2)); 2382 if (LSTRCMP(lValue, lValue2) == 0) 2383 { 2384 /* pfprintf(PSTDOUT,"duplicate sem result found %d == %d\n", iBest, jBest); 2385 pfprintf(PSTDOUT,"removing %d\n", iBest); */ 2386 2387 /* removing from the list indexed by iBest */ 2388 CHKLOG(rc, semanticList->remove(semanticList, semanticResult)); 2389 CHKLOG(rc, semanticResult->destroy(semanticResult)); 2390 2391 CHKLOG(rc, resultImpl->results->remove(resultImpl->results, semanticList)); 2392 CHKLOG(rc, semanticList->destroy(semanticList)); 2393 2394 if (!CA_NBestListRemoveResult(resultImpl->nbestList, iBest)) 2395 return ESR_ARGUMENT_OUT_OF_BOUNDS; 2396 break; 2397 } 2398 } 2399 } 2400 nbestSize = CA_NBestListCount(resultImpl->nbestList); 2401#endif 2402 2403 CHKLOG(rc, ArrayListGetSize(resultImpl->results, &nbestSize)); 2404 2405 if (nbestSize) 2406 { 2407 if(CA_ComputeConfidenceValues(impl->confidenceScorer, impl->recognizer, resultImpl->nbestList)) 2408 return ESR_INVALID_STATE; 2409 2410 for(current_choice=nbestSize-1;current_choice>=0;current_choice--) 2411 { 2412 /* get the nBest entry you want to deal with */ 2413 CHKLOG(rc, ArrayListGet(resultImpl->results, current_choice, (void **)&semanticList)); 2414 /* get the first sem_result for that entry */ 2415 CHKLOG(rc, ArrayListGet(semanticList, 0, (void **)&semanticResult)); 2416 semanticImpl = (SR_SemanticResultImpl*) semanticResult; 2417 2418 /* put in the conf value for that nBest entry */ 2419 if(!CA_NBestListGetResultConfidenceValue( resultImpl->nbestList, current_choice, &confValue)) 2420 return ESR_ARGUMENT_OUT_OF_BOUNDS; 2421 2422 psprintf(label, L("%d"), confValue); 2423 lValue = MALLOC(sizeof(LCHAR) * (LSTRLEN(label) + 1), MTAG); 2424 if (lValue == NULL) 2425 { 2426 PLogError(L("ESR_OUT_OF_MEMORY")); 2427 return ESR_OUT_OF_MEMORY; 2428 } 2429 LSTRCPY(lValue, label); 2430 CHKLOG(rc, semanticImpl->results->put(semanticImpl->results, L("conf"),lValue)); 2431 } 2432 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("CMPT"), 0)); 2433 } 2434 2435 /* OSI log the end of recognition and all bufferred tokens */ 2436 2437 /* OSI log end of recognition time */ 2438 PTimeStampSet(&EORT); 2439 impl->recogLogTimings.EORT = PTimeStampDiff(&EORT, &impl->timestamp); 2440 impl->recogLogTimings.DURS = impl->processed * MSEC_PER_FRAME; 2441 2442 /*****************************************/ 2443 /* OSI Logging stuff */ 2444 /*****************************************/ 2445if( impl->osi_log_level != 0) 2446 { 2447 /* get the nBest size (this size may have changed since previous set cuz of nbest list filtering) */ 2448 CHKLOG(rc, ArrayListGetSize(resultImpl->results, &nbestSize)); 2449 /* OSI log the nBest list size */ 2450 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("NBST"), nbestSize)); 2451 2452 2453 for (iBest = 0; iBest < nbestSize; iBest++) /* loop */ 2454 { 2455 /* get the nBest entry */ 2456 CHKLOG(rc, ArrayListGet(resultImpl->results, iBest, (void**)&semanticList)); 2457 2458 /* get the first sem_result for the entry (ther emay be many, but ignore others) */ 2459 CHKLOG(rc, ArrayListGet(semanticList, 0, (void **)&semanticResult)); 2460 semanticImpl = (SR_SemanticResultImpl*) semanticResult; 2461 2462 /* get the meaning and OSI log it */ 2463 CHKLOG(rc, semanticImpl->results->get(semanticImpl->results, L("meaning"), (void **)&lValue)); 2464 /* OSI log RSLT (meaning) for nbest item */ 2465 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("RSLT"), lValue)); 2466 2467 /* get the literal and OSI log it */ 2468 CHKLOG(rc, semanticImpl->results->get(semanticImpl->results, L("literal"), (void **)&lValue)); 2469 /* OSI log RAWT SPOK (literal) for nbest item */ 2470 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("RAWT"), lValue)); 2471 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("SPOK"), lValue)); 2472 2473 /* get the score and OSI log it */ 2474 CHKLOG(rc, semanticImpl->results->get(semanticImpl->results, L("raws"), (void **)&lValue)); 2475 /* OSI log RAWS (score) for nbest item */ 2476 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("RAWS"), lValue)); 2477 /* get the confidence value and OSI log it */ 2478 CHKLOG(rc, semanticImpl->results->get(semanticImpl->results, L("conf"), (void **)&lValue)); 2479 /* OSI log CONF (values) for nbest item */ 2480 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("CONF"), lValue)); 2481 } 2482 2483 /* log the values */ 2484 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("BORT"), impl->recogLogTimings.BORT)); 2485 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("DURS"), impl->recogLogTimings.DURS)); 2486 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("EORT"), impl->recogLogTimings.EORT)); 2487 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("EOSD"), impl->recogLogTimings.EOSD)); 2488 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("EOSS"), impl->recogLogTimings.EOSS)); 2489 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("EOST"), impl->recogLogTimings.EOST)); 2490 if (impl->osi_log_level & OSI_LOG_LEVEL_AUDIO) 2491 { 2492 len = P_PATH_MAX; 2493 CHKLOG(rc, SR_EventLogAudioGetFilename(impl->eventLog, waveformFilename, &len)); 2494 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("WVNM"), waveformFilename)); 2495 } 2496 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("RSTT"), L("ok"))); 2497 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("RENR"), L("ok"))); 2498 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("ENDR"), impl->eos_reason)); 2499 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIrcnd"))); 2500 2501 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("BOSS"), impl->recogLogTimings.BOSS)); /* extra not in OSI spec */ 2502 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("ESRboss"))); 2503 2504 /* 2505 * Record which recognizer was the successful one (male or female) 2506 * this index refers to the order in the swimdllist file. 2507 */ 2508 CHKLOG(rc, CA_GetRecogID(impl->recognizer, &recogID)); 2509 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("RECOG"), recogID)); 2510 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("ESRrcid"))); 2511 2512 /* Record semantic results returned by top nbestlist entry */ 2513 if (1) 2514 { 2515#define MAX_SEMANTIC_KEYS 50 2516 LCHAR* semanticKeys[MAX_SEMANTIC_KEYS]; 2517#define SEMANTIC_VALUE_SIZE 512 2518 LCHAR semanticValue[SEMANTIC_VALUE_SIZE]; 2519 size_t num_semanticKeys; 2520 2521 rc = resultImpl->results->getSize(resultImpl->results, &nbestSize); 2522 if (rc != ESR_SUCCESS) 2523 { 2524 PLogError(ESR_rc2str(rc)); 2525 goto DONE; 2526 } 2527 for (iBest = 0; iBest < nbestSize; ++iBest) /* loop2 */ 2528 { 2529 rc = resultImpl->results->get(resultImpl->results, iBest, (void **)&semanticList); 2530 if (rc != ESR_SUCCESS) 2531 { 2532 PLogError(ESR_rc2str(rc)); 2533 goto DONE; 2534 } 2535 2536 /* semanticResultsSize is the number of semantic meanings, although 2537 ambiguous parses are not entirely supported 2538 num_semanticKeys is associated to a particular parse */ 2539 2540 rc = semanticList->getSize(semanticList, &semanticResultsSize); 2541 if (rc != ESR_SUCCESS) 2542 { 2543 PLogError(ESR_rc2str(rc)); 2544 goto DONE; 2545 } 2546 for (k = 0; k < semanticResultsSize; ++k) 2547 { 2548 size_t iKey; 2549 rc = semanticList->get(semanticList, k, (void **)&semanticResult); 2550 if (rc != ESR_SUCCESS) 2551 { 2552 PLogError(ESR_rc2str(rc)); 2553 goto DONE; 2554 } 2555 num_semanticKeys = MAX_SEMANTIC_KEYS; 2556 rc = semanticResult->getKeyList(semanticResult, (LCHAR**) & semanticKeys, &num_semanticKeys); 2557 if (rc != ESR_SUCCESS) 2558 { 2559 PLogError(ESR_rc2str(rc)); 2560 goto DONE; 2561 } 2562 2563 for (iKey=0; iKey<num_semanticKeys; ++iKey) 2564 { 2565 len = SEMANTIC_VALUE_SIZE; 2566 rc = semanticResult->getValue(semanticResult, semanticKeys[iKey], (LCHAR*) &semanticValue, &len); 2567 if (rc != ESR_SUCCESS) 2568 { 2569 PLogError(ESR_rc2str(rc)); 2570 goto DONE; 2571 } 2572 2573 rc = SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, semanticKeys[iKey], semanticValue); 2574 if (rc != ESR_SUCCESS) 2575 { 2576 PLogError(ESR_rc2str(rc)); 2577 goto DONE; 2578 } 2579 } 2580 } 2581 } 2582 rc = SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("ESR_SemanticResult[0]")); 2583 if (rc != ESR_SUCCESS) 2584 { 2585 PLogError(ESR_rc2str(rc)); 2586 goto DONE; 2587 } 2588 } 2589} 2590DONE: 2591 return ESR_SUCCESS; 2592CLEANUP: 2593 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2594 return rc; 2595} 2596 2597/** 2598 * Indicates if it is possible to push data from SREC into the internal recognizer. 2599 * If data can be pushed, ESR_CONTINUE_PROCESSING is returned. 2600 * 2601 * INPUT STATES: SR_RECOGNIZER_INTERNAL_BOS_DETECTION, SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2602 * OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOI 2603 */ 2604PINLINE ESR_ReturnCode canPushAudioIntoRecognizer(SR_RecognizerImpl* impl) 2605{ 2606 ESR_ReturnCode rc; 2607 2608 if (impl->lockFunction) 2609 impl->lockFunction(ESR_LOCK, impl->lockData); 2610 2611 /* do I have enough to make a frame ? */ 2612 if (CircularBufferGetSize(impl->buffer) < impl->FRAME_SIZE) 2613 { 2614 /* Not enough data */ 2615 if (!impl->gotLastFrame) 2616 { 2617 /* not last frame, so ask for more audio */ 2618 if (impl->lockFunction) 2619 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2620 return ESR_SUCCESS; 2621 } 2622 else 2623 { 2624 /* last frame, make do with what you have */ 2625 if (impl->lockFunction) 2626 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2627#ifdef SREC_ENGINE_VERBOSE_LOGGING 2628 PLogMessage("L: Voicing END (EOI) at %d frames (%d processed)", impl->frames, impl->processed); 2629#endif 2630 impl->isRecognizing = ESR_FALSE; 2631 impl->recogLogTimings.EOSD = impl->frames; 2632 impl->eos_reason = L("EOI"); 2633 impl->internalState = SR_RECOGNIZER_INTERNAL_EOI; 2634 if (impl->eventLog != NULL) 2635 { 2636 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("canPushAudioIntoRecognizer() -> SR_RECOGNIZER_INTERNAL_EOI"))); 2637 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2638 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2639 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2640 } 2641 return ESR_CONTINUE_PROCESSING; 2642 } 2643 } 2644 if (impl->lockFunction) 2645 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2646 return ESR_CONTINUE_PROCESSING; 2647CLEANUP: 2648 return rc; 2649} 2650 2651/** 2652 * Pushes data from SREC into the internal recognizer. 2653 * 2654 * INPUT STATES: SR_RECOGNIZER_INTERNAL_BOS_DETECTION, SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2655 * OUTPUT STATES: same 2656 */ 2657PINLINE ESR_ReturnCode pushAudioIntoRecognizer(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2658 SR_RecognizerResultType* type, 2659 SR_RecognizerResult* result) 2660{ 2661 size_t count; 2662 ESR_ReturnCode rc; 2663 2664 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) > 0 && impl->frames >= impl->bgsniff) 2665 { 2666 /* Don't push frames unless they're needed */ 2667 2668 /* Check for leaked state */ 2669 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2670 return ESR_CONTINUE_PROCESSING; 2671 } 2672 if (impl->lockFunction) 2673 impl->lockFunction(ESR_LOCK, impl->lockData); 2674 count = CircularBufferRead(impl->buffer, impl->audioBuffer, impl->FRAME_SIZE); 2675 if (impl->lockFunction) 2676 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2677 2678 WaveformBuffer_Write(impl->waveformBuffer, impl->audioBuffer, count); 2679 if (impl->osi_log_level & OSI_LOG_LEVEL_AUDIO) 2680 { 2681 rc = SR_EventLogAudioWrite(impl->eventLog, impl->audioBuffer, count); 2682 if (rc == ESR_BUFFER_OVERFLOW) 2683 rc = ESR_INVALID_STATE; 2684 if (rc != ESR_SUCCESS) 2685 { 2686 PLogError(ESR_rc2str(rc)); 2687 if (impl->lockFunction) 2688 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2689 goto CLEANUP; 2690 } 2691 } 2692 if (count < impl->FRAME_SIZE) 2693 { 2694 rc = ESR_INVALID_STATE; 2695 PLogError(L("%s: error reading buffer data (count=%d, frameSize=%d)"), ESR_rc2str(rc), count, impl->FRAME_SIZE); 2696 goto CLEANUP; 2697 } 2698 if (!CA_LoadSamples(impl->wavein, impl->audioBuffer, impl->sampleRate / FRAMERATE)) 2699 { 2700 PLogError(L("ESR_INVALID_STATE")); 2701 rc = ESR_INVALID_STATE; 2702 goto CLEANUP; 2703 } 2704 2705 CA_ConditionSamples(impl->wavein); 2706 /* Check for leaked state */ 2707 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2708 return ESR_CONTINUE_PROCESSING; 2709CLEANUP: 2710 return rc; 2711} 2712 2713/** 2714 * INPUT STATES: SR_RECOGNIZER_INTERNAL_BOS_DETECTION, SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2715 * OUTPUT STATES: same 2716 */ 2717PINLINE ESR_ReturnCode generateFrameFromAudio(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2718 SR_RecognizerResultType* type, 2719 SR_RecognizerResult* result) 2720{ 2721 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) > 0 && impl->frames >= impl->bgsniff) 2722 { 2723 /* Don't create frames unless they're needed */ 2724 2725 /* Check for leaked state */ 2726 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2727 return ESR_CONTINUE_PROCESSING; 2728 } 2729 2730 /* Try processing one frame */ 2731 if (!CA_MakeFrame(impl->frontend, impl->utterance, impl->wavein)) 2732 { 2733 /* 2734 * One of three cases occured: 2735 * 2736 * - We don't have enough samples to process one frame. This should be impossible because 2737 * pushAudioIntoRecognizer() is always called before us and will not continue if we don't 2738 * have enough samples. 2739 * 2740 * - The internal recognizer needs a minimum amount of audio before it'll begin generating 2741 * frames. This is normal and we return with a success value. 2742 * 2743 * - The recognizer skips every even frame number (for performance reasons). This is normal 2744 * and we return with a success value. 2745 */ 2746 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 2747 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 2748 return ESR_SUCCESS; 2749 } 2750 ++impl->frames; 2751 /* Check for leaked state */ 2752 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2753 return ESR_CONTINUE_PROCESSING; 2754} 2755 2756/** 2757 * INPUT STATES: SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2758 * OUTPUT STATES: same 2759 */ 2760PINLINE ESR_ReturnCode generateFrameStats(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2761 SR_RecognizerResultType* type, 2762 SR_RecognizerResult* result) 2763{ 2764 if (impl->frames < impl->bgsniff) 2765 { 2766 /* Wait until we have enough frames to estimate background stats */ 2767 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 2768 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 2769 return ESR_SUCCESS; 2770 } 2771 else if (impl->frames == impl->bgsniff) 2772 CA_CalculateUtteranceStatistics(impl->utterance, 0, impl->bgsniff); 2773 2774 /* Check for leaked state */ 2775 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2776 return ESR_CONTINUE_PROCESSING; 2777} 2778 2779/** 2780 * INPUT STATES: SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2781 * OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOI, SR_RECOGNIZER_INTERNAL_EOS 2782 */ 2783PINLINE ESR_ReturnCode generatePatternFromFrame(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2784 SR_RecognizerResultType* type, 2785 SR_RecognizerResult* result) 2786{ 2787 SR_AcousticModelsImpl* modelsImpl; 2788 ESR_ReturnCode rc; 2789 2790 /* Run the search */ 2791 modelsImpl = (SR_AcousticModelsImpl*) impl->models; 2792 if (!CA_MakePatternFrame(modelsImpl->pattern, impl->utterance)) 2793 { 2794 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2795 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2796 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2797 if (impl->eventLog != NULL) 2798 { 2799 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrame() -> SR_RECOGNIZER_INTERNAL_END"))); 2800 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2801 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2802 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2803 } 2804 PLogError(L("ESR_INVALID_STATE")); 2805 return ESR_INVALID_STATE; 2806 } 2807 if (!CA_AdvanceUtteranceFrame(impl->utterance)) 2808 { 2809 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2810 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2811 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2812 if (impl->eventLog != NULL) 2813 { 2814 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("canPushAudioIntoRecognizer() -> SR_RECOGNIZER_INTERNAL_END"))); 2815 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2816 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2817 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2818 } 2819 PLogError(L("ESR_INVALID_STATE")); 2820 return ESR_INVALID_STATE; 2821 } 2822 CA_AdvanceRecognitionByFrame(impl->recognizer, modelsImpl->pattern, impl->utterance); 2823 ++impl->processed; 2824 2825 if (impl->lockFunction) 2826 impl->lockFunction(ESR_LOCK, impl->lockData); 2827 if (impl->gotLastFrame && CircularBufferGetSize(impl->buffer) < impl->FRAME_SIZE) 2828 { 2829 /* 2830 * SREC have run out of data but the underlying recognizer might have some frames 2831 * queued for processing. 2832 */ 2833 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) > 0) 2834 { 2835 /* EOI means end of input */ 2836#ifdef SREC_ENGINE_VERBOSE_LOGGING 2837 PLogMessage("L: Voicing END (EOI) at %d frames (%d processed)", impl->frames, impl->processed); 2838#endif 2839 impl->isRecognizing = ESR_FALSE; 2840 impl->recogLogTimings.EOSD = impl->frames; 2841 impl->eos_reason = L("EOI"); 2842 impl->internalState = SR_RECOGNIZER_INTERNAL_EOI; 2843 if (impl->eventLog != NULL) 2844 { 2845 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrame() -> SR_RECOGNIZER_INTERNAL_EOI"))); 2846 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2847 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2848 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2849 } 2850 } 2851 else 2852 { 2853#ifdef SREC_ENGINE_VERBOSE_LOGGING 2854 PLogMessage("L: Voicing END (EOF) at %d frames (%d processed)", impl->frames, impl->processed); 2855#endif 2856 2857 impl->isRecognizing = ESR_FALSE; 2858 impl->recogLogTimings.EOSD = impl->frames; 2859 impl->eos_reason = L("EOF"); 2860 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS; 2861 if (impl->eventLog != NULL) 2862 { 2863 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrame() -> SR_RECOGNIZER_INTERNAL_EOS"))); 2864 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2865 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2866 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2867 } 2868 *status = SR_RECOGNIZER_EVENT_END_OF_VOICING; 2869 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 2870 passert(impl->processed == impl->frames); 2871 if (impl->lockFunction) 2872 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2873 return ESR_SUCCESS; 2874 } 2875 } 2876 if (impl->lockFunction) 2877 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2878 2879 /* Check for leaked state */ 2880 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2881 return ESR_CONTINUE_PROCESSING; 2882CLEANUP: 2883 return rc; 2884} 2885 2886/** 2887 * Same as generatePatternFromFrame() only the buffer is known to be empty. 2888 * 2889 * INPUT STATES: SR_RECOGNIZER_INTERNAL_EOI 2890 * OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOS 2891 */ 2892PINLINE ESR_ReturnCode generatePatternFromFrameEOI(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2893 SR_RecognizerResultType* type, 2894 SR_RecognizerResult* result) 2895{ 2896 SR_AcousticModelsImpl* modelsImpl; 2897 ESR_ReturnCode rc; 2898 2899 /* Run the search */ 2900 modelsImpl = (SR_AcousticModelsImpl*) impl->models; 2901 2902 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) <= 0) 2903 { 2904 passert(impl->processed == impl->frames); 2905 *status = SR_RECOGNIZER_EVENT_END_OF_VOICING; 2906 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 2907 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS; 2908 return ESR_SUCCESS; 2909 } 2910 2911 if (!CA_MakePatternFrame(modelsImpl->pattern, impl->utterance)) 2912 { 2913 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2914 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2915 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2916 if (impl->eventLog != NULL) 2917 { 2918 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrameEOI() -> SR_RECOGNIZER_INTERNAL_END"))); 2919 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2920 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2921 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2922 } 2923 PLogError(L("ESR_INVALID_STATE")); 2924 return ESR_INVALID_STATE; 2925 } 2926 if (!CA_AdvanceUtteranceFrame(impl->utterance)) 2927 { 2928 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 2929 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 2930 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 2931 if (impl->eventLog != NULL) 2932 { 2933 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrameEOI() -> SR_RECOGNIZER_INTERNAL_END"))); 2934 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2935 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2936 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2937 } 2938 PLogError(L("ESR_INVALID_STATE")); 2939 return ESR_INVALID_STATE; 2940 } 2941 CA_AdvanceRecognitionByFrame(impl->recognizer, modelsImpl->pattern, impl->utterance); 2942 ++impl->processed; 2943 2944 if (impl->lockFunction) 2945 impl->lockFunction(ESR_LOCK, impl->lockData); 2946 2947 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) <= 0) 2948 { 2949 passert(impl->processed == impl->frames); 2950 *status = SR_RECOGNIZER_EVENT_END_OF_VOICING; 2951 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 2952 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS; 2953 if (impl->eventLog != NULL) 2954 { 2955 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("generatePatternFromFrameEOI() -> SR_RECOGNIZER_INTERNAL_EOS"))); 2956 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 2957 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 2958 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 2959 } 2960 if (impl->lockFunction) 2961 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2962 return ESR_SUCCESS; 2963 } 2964 if (impl->lockFunction) 2965 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2966 2967 /* Check for leaked state */ 2968 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 2969 return ESR_CONTINUE_PROCESSING; 2970CLEANUP: 2971 if (impl->lockFunction) 2972 impl->lockFunction(ESR_UNLOCK, impl->lockData); 2973 return rc; 2974} 2975 2976 2977/** 2978 * INPUT STATES: SR_RECOGNIZER_INTERNAL_EOI, SR_RECOGNIZER_INTERNAL_EOS_DETECTION 2979 * OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOS 2980 */ 2981ESR_ReturnCode detectEndOfSpeech(SR_RecognizerImpl* impl, SR_RecognizerStatus* status, 2982 SR_RecognizerResultType* type, 2983 SR_RecognizerResult* result) 2984{ 2985 EOSrc eos; /* eos means end of speech */ 2986 int eos_by_level; /* eos means end of speech */ 2987 PTimeStamp timestamp; 2988 ESR_ReturnCode rc; 2989 ESR_BOOL enableGetWaveform = ESR_FALSE; 2990 2991 eos_by_level = CA_UtteranceHasEnded(impl->utterance); 2992 if (eos_by_level) 2993 { 2994 eos = SPEECH_ENDED_BY_LEVEL_TIMEOUT; 2995 } 2996 else 2997 { 2998 eos = CA_IsEndOfUtteranceByResults(impl->recognizer); 2999 } 3000 3001 ESR_SessionGetBool(L("enableGetWaveform"), &enableGetWaveform); 3002 //impl->parameters->getBool(impl->parameters, L("enableGetWaveform"), &enableGetWaveform); 3003 3004 if (eos == VALID_SPEECH_CONTINUING && enableGetWaveform && impl->waveformBuffer->overflow_count > 0) 3005 { 3006 size_t bufferSize; 3007 CHKLOG(rc, WaveformBuffer_GetSize(impl->waveformBuffer, &bufferSize)); 3008 PLogMessage("Forcing EOS due to wfbuf overflow (fr=%d,sz=%d,of=%d)", impl->frames, bufferSize, impl->waveformBuffer->overflow_count); 3009 eos = SPEECH_TOO_LONG; 3010 } 3011 3012 if (eos != VALID_SPEECH_CONTINUING) 3013 { 3014 switch (eos) 3015 { 3016 case SPEECH_ENDED: 3017 /* normal */ 3018 impl->eos_reason = L("itimeout"); 3019 break; 3020 3021 case SPEECH_ENDED_WITH_ERROR: 3022 /* error */ 3023 impl->eos_reason = L("err"); 3024 break; 3025 3026 case SPEECH_TOO_LONG: 3027 /* timeout*/ 3028 impl->eos_reason = L("ctimeout"); 3029 break; 3030 3031 case SPEECH_MAYBE_ENDED: 3032 /* normal */ 3033 impl->eos_reason = L("itimeout"); 3034 break; 3035 case SPEECH_ENDED_BY_LEVEL_TIMEOUT: 3036 /* normal */ 3037 impl->eos_reason = L("levelTimeout"); 3038 break; 3039 3040 default: 3041 /* error */ 3042 impl->eos_reason = L("err"); 3043 } 3044 3045#ifdef SREC_ENGINE_VERBOSE_LOGGING 3046 PLogMessage("L: Voicing END (EOS) at %d frames, %d processed (reason: %s)\n", impl->frames, impl->processed, impl->eos_reason); 3047#endif 3048 3049 impl->recogLogTimings.EOSD = impl->frames; /* how many frames have been sent prior to detect EOS */ 3050 PTimeStampSet(×tamp); /* time it took to detect EOS (in millisec) */ 3051 impl->recogLogTimings.EOST = PTimeStampDiff(×tamp, &impl->timestamp); 3052 3053 *status = SR_RECOGNIZER_EVENT_END_OF_VOICING; 3054 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3055 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS; 3056 if (impl->eventLog != NULL) 3057 { 3058 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("internalState"), L("detectEndOfSpeech() -> SR_RECOGNIZER_INTERNAL_EOS"))); 3059 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("reason"), impl->eos_reason)); 3060 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("frames"), impl->frames)); 3061 CHKLOG(rc, SR_EventLogTokenSize_t_BASIC(impl->eventLog, impl->osi_log_level, L("processed"), impl->processed)); 3062 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SR_Recognizer"))); 3063 } 3064 impl->isRecognizing = ESR_FALSE; 3065 return ESR_SUCCESS; 3066 } 3067 3068 /* Check for leaked state */ 3069 passert(*status == SR_RECOGNIZER_EVENT_INVALID && *type == SR_RECOGNIZER_RESULT_TYPE_INVALID); 3070 return ESR_CONTINUE_PROCESSING; 3071CLEANUP: 3072 return rc; 3073} 3074 3075/** 3076 * INPUT STATES: SR_RECOGNIZER_INTERNAL_BOS_DETECTION 3077 * OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOS_DETECTION, SR_RECOGNIZER_INTERNAL_EOI 3078 */ 3079ESR_ReturnCode detectBeginningOfSpeech(SR_RecognizerImpl* impl, 3080 SR_RecognizerStatus* status, 3081 SR_RecognizerResultType* type, 3082 SR_RecognizerResult* result) 3083{ 3084 ESR_ReturnCode rc; 3085 ESR_BOOL gatedMode; 3086 size_t num_windback_bytes, num_windback_frames; 3087 waveform_buffering_state_t buffering_state; 3088 3089 CHKLOG(rc, ESR_SessionGetBool(L("cmdline.gatedmode"), &gatedMode)); 3090 3091 if (gatedMode || (!gatedMode && impl->frames < impl->bgsniff)) 3092 { 3093 ESR_BOOL pushable = ESR_FALSE; 3094 3095 rc = canPushAudioIntoRecognizer(impl); 3096 if (rc == ESR_SUCCESS) 3097 { 3098 /* Not enough samples to process one frame */ 3099 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) <= 0) 3100 { 3101 *status = SR_RECOGNIZER_EVENT_NEED_MORE_AUDIO; 3102 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3103 return ESR_SUCCESS; 3104 } 3105 } 3106 else if (rc != ESR_CONTINUE_PROCESSING) 3107 return rc; 3108 else if (impl->internalState == SR_RECOGNIZER_INTERNAL_EOI) 3109 { 3110 /* Got end of input before beginning of speech */ 3111 *status = SR_RECOGNIZER_EVENT_NO_MATCH; 3112 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 3113 impl->internalState = SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH; 3114 CHKLOG(rc, impl->Interface.stop(&impl->Interface)); 3115 return ESR_SUCCESS; 3116 } 3117 else 3118 pushable = ESR_TRUE; 3119 if (pushable) 3120 { 3121 rc = pushAudioIntoRecognizer(impl, status, type, result); 3122 /* OUTPUT STATES: same or SR_RECOGNIZER_INTERNAL_EOI */ 3123 if (rc != ESR_CONTINUE_PROCESSING) 3124 { 3125 /* Not enough samples to process one frame */ 3126 return rc; 3127 } 3128 rc = generateFrameFromAudio(impl, status, type, result); 3129 /* OUTPUT STATES: same */ 3130 if (rc != ESR_CONTINUE_PROCESSING) 3131 { 3132 /* 3133 * The internal recognizer needs a minimum amount of audio before 3134 * it begins generating frames. 3135 */ 3136 return rc; 3137 } 3138 } 3139 if (!CA_AdvanceUtteranceFrame(impl->utterance)) 3140 { 3141 PLogError(L("ESR_INVALID_STATE: Failed Advancing Utt Frame %d"), impl->frames); 3142 return ESR_INVALID_STATE; 3143 } 3144 if (CA_UtteranceHasVoicing(impl->utterance)) 3145 { 3146 /* Utterance stats for Lombard if enough frames */ 3147 if (impl->frames > impl->bgsniff) 3148 { 3149#ifdef SREC_ENGINE_VERBOSE_LOGGING 3150 PLogMessage("L: Voicing START at %d frames", impl->frames); 3151#endif 3152 /* OSI log the endpointed data */ 3153 3154 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("BTIM"), impl->frames * MSEC_PER_FRAME)); 3155 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("BRGN"), 0)); /* Barge-in not supported */ 3156 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIendp"))); 3157 3158 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, L("BOSD"), impl->frames)); 3159 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("ESRbosd"))); 3160 3161 if (gatedMode) 3162 CA_CalculateUtteranceStatistics(impl->utterance, (int)(impl->frames * -1), 0); 3163 else 3164 CA_CalculateUtteranceStatistics(impl->utterance, 0, impl->frames); 3165 } 3166 3167 /* OK, we've got voicing or the end of input has occured 3168 ** (or both, I suppose). If we had voicing then progress 3169 ** the recognizer, otherwise skip to the end. 3170 ** Of course, we could be running outside 'Gated Mode' 3171 ** so we won't have any frames processed at all yet - 3172 ** in this case start the recognizer anyway. 3173 */ 3174 3175 /************************************* 3176 ** Run recognition until endOfInput ** 3177 *************************************/ 3178 3179 /* 3180 * Initialize both recognizers first 3181 * and disable reporting of results 3182 */ 3183 if (gatedMode) 3184 { 3185 /* 3186 * We're in Gated Mode - 3187 * Because we'll have had voicing we wind-back 3188 * until the start of voicing (unsure region) 3189 */ 3190 num_windback_frames = CA_SeekStartOfUtterance(impl->utterance); 3191 impl->beginningOfSpeechOffset = impl->frames - num_windback_frames; 3192 num_windback_bytes = num_windback_frames * impl->FRAME_SIZE * 2 /* due to skip even frames */; 3193 3194 /* pfprintf(PSTDOUT,L("audio buffer windback %d frames == %d bytes\n"), num_windback_frames, num_windback_bytes); */ 3195 CHKLOG(rc, WaveformBuffer_GetBufferingState(impl->waveformBuffer, &buffering_state)); 3196 if (buffering_state != WAVEFORM_BUFFERING_OFF) 3197 CHKLOG(rc, WaveformBuffer_WindBack(impl->waveformBuffer, num_windback_bytes)); 3198 3199 /* 3200 * Only transition to linear if it was previously circular (in other words if 3201 * buffering was active in the first place) 3202 */ 3203 if (buffering_state == WAVEFORM_BUFFERING_ON_CIRCULAR) 3204 CHKLOG(rc, WaveformBuffer_SetBufferingState(impl->waveformBuffer, WAVEFORM_BUFFERING_ON_LINEAR)); 3205 impl->frames = CA_GetUnprocessedFramesInUtterance(impl->utterance); 3206 } 3207 else 3208 impl->frames = 0; 3209 /* reset the frames */ 3210 impl->processed = 0; 3211 CHKLOG(rc, beginRecognizing(impl)); 3212 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS_DETECTION; 3213 *status = SR_RECOGNIZER_EVENT_START_OF_VOICING; 3214 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3215 return ESR_SUCCESS; 3216 } 3217 else 3218 { 3219 if (impl->frames > impl->utterance_timeout) 3220 { 3221 /* beginning of speech timeout */ 3222 impl->internalState = SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT; 3223 *status = SR_RECOGNIZER_EVENT_START_OF_UTTERANCE_TIMEOUT; 3224 *type = SR_RECOGNIZER_RESULT_TYPE_COMPLETE; 3225 CHKLOG(rc, impl->Interface.stop(&impl->Interface)); 3226 return ESR_SUCCESS; 3227 } 3228 } 3229 } 3230 else if (!gatedMode && impl->frames >= impl->bgsniff) 3231 { 3232 /* 3233 * If not gated mode and I have processed enough frames, then start the recognizer 3234 * right away. 3235 */ 3236 impl->internalState = SR_RECOGNIZER_INTERNAL_EOS_DETECTION; 3237 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 3238 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3239 3240 /* reset the frames */ 3241 impl->frames = impl->processed = 0; 3242 CHKLOG(rc, beginRecognizing(impl)); 3243 return ESR_SUCCESS; 3244 } 3245 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 3246 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3247 return ESR_SUCCESS; 3248 3249CLEANUP: 3250 return rc; 3251} 3252 3253ESR_ReturnCode SR_RecognizerAdvanceImpl(SR_Recognizer* self, SR_RecognizerStatus* status, 3254 SR_RecognizerResultType* type, 3255 SR_RecognizerResult** result) 3256{ 3257 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3258 ESR_BOOL pushable; 3259 ESR_ReturnCode rc; 3260 3261 if (status == NULL || type == NULL || result == NULL) 3262 { 3263 PLogError(L("ESR_INVALID_ARGUMENT")); 3264 return ESR_INVALID_ARGUMENT; 3265 } 3266 3267 /* create the result holder and save the pointer */ 3268 /* creation only happens once (due to the if condition) */ 3269 if (impl->result == NULL) 3270 CHKLOG(rc, SR_RecognizerResult_Create(&impl->result, impl)); 3271 *result = impl->result; 3272 3273 /* 3274 * The following two lines are used to detect bugs whereby we forget to set 3275 * status or type before returning 3276 */ 3277 *status = SR_RECOGNIZER_EVENT_INVALID; 3278 *type = SR_RECOGNIZER_RESULT_TYPE_INVALID; 3279 3280MOVE_TO_NEXT_STATE: 3281 switch (impl->internalState) 3282 { 3283 case SR_RECOGNIZER_INTERNAL_BEGIN: 3284 impl->internalState = SR_RECOGNIZER_INTERNAL_BOS_DETECTION; 3285 *status = SR_RECOGNIZER_EVENT_STARTED; 3286 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3287 return ESR_SUCCESS; 3288 3289 case SR_RECOGNIZER_INTERNAL_BOS_DETECTION: 3290 rc = detectBeginningOfSpeech(impl, status, type, impl->result); 3291 if (rc != ESR_CONTINUE_PROCESSING) 3292 { 3293 /* 3294 * SR_RECOGNIZER_INTERNAL_BOS_DETECTION, SR_RECOGNIZER_INTERNAL_EOS_DETECTION, or 3295 * SR_RECOGNIZER_INTERNAL_EOI 3296 */ 3297 return rc; 3298 } 3299 /* Leaked state */ 3300 passert(0); 3301 break; 3302 3303 case SR_RECOGNIZER_INTERNAL_EOS_DETECTION: 3304 pushable = ESR_FALSE; 3305 rc = canPushAudioIntoRecognizer(impl); 3306 if (rc == ESR_SUCCESS) 3307 { 3308 /* Not enough samples to process one frame */ 3309 if (CA_GetUnprocessedFramesInUtterance(impl->utterance) <= 0) 3310 { 3311 *status = SR_RECOGNIZER_EVENT_NEED_MORE_AUDIO; 3312 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3313 return ESR_SUCCESS; 3314 } 3315 } 3316 else if (rc != ESR_CONTINUE_PROCESSING) 3317 return rc; 3318 else if (impl->internalState == SR_RECOGNIZER_INTERNAL_EOI) 3319 goto MOVE_TO_NEXT_STATE; 3320 else 3321 pushable = ESR_TRUE; 3322 if (pushable) 3323 { 3324 rc = pushAudioIntoRecognizer(impl, status, type, impl->result); 3325 if (rc != ESR_CONTINUE_PROCESSING) 3326 { 3327 /* Not enough samples to process one frame */ 3328 return rc; 3329 } 3330 if (impl->internalState == SR_RECOGNIZER_INTERNAL_EOI) 3331 goto MOVE_TO_NEXT_STATE; 3332 rc = generateFrameFromAudio(impl, status, type, impl->result); 3333 if (rc != ESR_CONTINUE_PROCESSING) 3334 { 3335 /* 3336 * The internal recognizer needs a minimum amount of audio before 3337 * it begins generating frames. 3338 */ 3339 return rc; 3340 } 3341 } 3342 rc = generateFrameStats(impl, status, type, impl->result); 3343 if (rc != ESR_CONTINUE_PROCESSING) 3344 { 3345 /* Not enough frames to calculate stats */ 3346 return rc; 3347 } 3348 rc = generatePatternFromFrame(impl, status, type, impl->result); 3349 if (rc != ESR_CONTINUE_PROCESSING) 3350 { 3351 /* End of speech detected */ 3352 return rc; 3353 } 3354 if (impl->internalState == SR_RECOGNIZER_INTERNAL_END) 3355 goto MOVE_TO_NEXT_STATE; 3356 rc = detectEndOfSpeech(impl, status, type, impl->result); 3357 if (rc != ESR_CONTINUE_PROCESSING) 3358 { 3359 /* End of speech detected */ 3360 return rc; 3361 } 3362 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 3363 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3364 return ESR_SUCCESS; 3365 3366 case SR_RECOGNIZER_INTERNAL_EOI: 3367 /* 3368 * On EOI (end of input), we need to process the remaining frames that had not 3369 * been processed when PutAudio set the gotLastFrame flag 3370 */ 3371 rc = generatePatternFromFrameEOI(impl, status, type, impl->result); 3372 if (rc != ESR_CONTINUE_PROCESSING) 3373 { 3374 /* End of speech detected */ 3375 return rc; 3376 } 3377 rc = detectEndOfSpeech(impl, status, type, impl->result); 3378 if (rc != ESR_CONTINUE_PROCESSING) 3379 { 3380 /* End of speech detected */ 3381 return rc; 3382 } 3383 *status = SR_RECOGNIZER_EVENT_INCOMPLETE; 3384 *type = SR_RECOGNIZER_RESULT_TYPE_NONE; 3385 return ESR_SUCCESS; 3386 3387 case SR_RECOGNIZER_INTERNAL_EOS: 3388 /* On EOS (end of speech detected - not due to end of input), create the result */ 3389 if (impl->lockFunction) 3390 impl->lockFunction(ESR_LOCK, impl->lockData); 3391 CircularBufferReset(impl->buffer); 3392 if (impl->lockFunction) 3393 impl->lockFunction(ESR_UNLOCK, impl->lockData); 3394 CHKLOG(rc, SR_RecognizerCreateResultImpl((SR_Recognizer*) impl, status, type)); 3395 impl->internalState = SR_RECOGNIZER_INTERNAL_END; 3396 return ESR_SUCCESS; 3397 3398 case SR_RECOGNIZER_INTERNAL_END: 3399 return ESR_SUCCESS; 3400 default: 3401 PLogError(L("ESR_INVALID_STATE")); 3402 return ESR_INVALID_STATE; 3403 } 3404CLEANUP: 3405 return rc; 3406} 3407 3408 3409 3410ESR_ReturnCode SR_RecognizerLoadUtteranceImpl(SR_Recognizer* self, const LCHAR* filename) 3411{ 3412 /* TODO: complete */ 3413 return ESR_SUCCESS; 3414} 3415 3416ESR_ReturnCode SR_RecognizerLoadWaveFileImpl(SR_Recognizer* self, const LCHAR* filename) 3417{ 3418 /* TODO: complete */ 3419 return ESR_SUCCESS; 3420} 3421 3422ESR_ReturnCode SR_RecognizerLogEventImpl(SR_Recognizer* self, const LCHAR* event) 3423{ 3424 ESR_ReturnCode rc; 3425 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3426 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, event)); 3427 return ESR_SUCCESS; 3428CLEANUP: 3429 return rc; 3430} 3431 3432ESR_ReturnCode SR_RecognizerLogTokenImpl(SR_Recognizer* self, const LCHAR* token, const LCHAR* value) 3433{ 3434 ESR_ReturnCode rc; 3435 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3436 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, token, value)); 3437 return ESR_SUCCESS; 3438CLEANUP: 3439 return rc; 3440} 3441 3442ESR_ReturnCode SR_RecognizerLogTokenIntImpl(SR_Recognizer* self, const LCHAR* token, int value) 3443{ 3444 ESR_ReturnCode rc; 3445 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3446 CHKLOG(rc, SR_EventLogTokenInt_BASIC(impl->eventLog, impl->osi_log_level, token, value)); 3447 return ESR_SUCCESS; 3448CLEANUP: 3449 return rc; 3450} 3451 3452ESR_ReturnCode SR_RecognizerLogSessionStartImpl(SR_Recognizer* self, const LCHAR* sessionName) 3453{ 3454 ESR_ReturnCode rc; 3455 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3456 /** 3457 * OSI Platform logging. 3458 * In OSR, these events are logged by the platform. We have no platform in ESR, so we 3459 * log them here. 3460 */ 3461 3462 /* call (session) start, tokens optional */ 3463 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIclst"))); 3464 3465 /* service start, in this case SRecTest service */ 3466 CHKLOG(rc, SR_EventLogToken_BASIC(impl->eventLog, impl->osi_log_level, L("SVNM"), sessionName)); 3467 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIsvst"))); 3468 if (impl->osi_log_level & OSI_LOG_LEVEL_BASIC) 3469 CHKLOG(rc, SR_EventLogEventSession(impl->eventLog)); 3470 3471 return ESR_SUCCESS; 3472CLEANUP: 3473 return rc; 3474} 3475 3476ESR_ReturnCode SR_RecognizerLogSessionEndImpl(SR_Recognizer* self) 3477{ 3478 ESR_ReturnCode rc; 3479 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3480 3481 /* OSI log end of call (session) */ 3482 CHKLOG(rc, SR_EventLogEvent_BASIC(impl->eventLog, impl->osi_log_level, L("SWIclnd"))); 3483 if (impl->osi_log_level & OSI_LOG_LEVEL_BASIC) 3484 CHKLOG(rc, SR_EventLogEventSession(impl->eventLog)); 3485 return ESR_SUCCESS; 3486CLEANUP: 3487 return rc; 3488} 3489 3490 3491ESR_ReturnCode SR_RecognizerLogWaveformDataImpl(SR_Recognizer* self, const LCHAR* waveformFilename, 3492 const LCHAR* transcription, const double bos, 3493 const double eos, ESR_BOOL isInvocab) 3494{ 3495 ESR_ReturnCode rc; 3496 SR_RecognizerImpl *impl = (SR_RecognizerImpl*) self; 3497 LCHAR num[P_PATH_MAX]; 3498 int frame; 3499 3500 CHKLOG(rc, SR_EventLogToken_AUDIO(impl->eventLog, impl->osi_log_level, L("FILE"), waveformFilename)); 3501 CHKLOG(rc, SR_EventLogToken_AUDIO(impl->eventLog, impl->osi_log_level, L("TRANS"), transcription)); 3502 sprintf(num, L("%.2f"), bos); 3503 CHKLOG(rc, SR_EventLogToken_AUDIO(impl->eventLog, impl->osi_log_level, L("BOS_SEC"), num)); 3504 sprintf(num, L("%.2f"), eos); 3505 CHKLOG(rc, SR_EventLogToken_AUDIO(impl->eventLog, impl->osi_log_level, L("EOS_SEC"), num)); 3506 CHKLOG(rc, SR_EventLogTokenInt_AUDIO(impl->eventLog, impl->osi_log_level, L("FRAMESIZE"), impl->FRAME_SIZE)); 3507 CHKLOG(rc, SR_EventLogTokenInt_AUDIO(impl->eventLog, impl->osi_log_level, L("SAMPLERATE"), impl->sampleRate)); 3508 frame = (int)(bos * impl->sampleRate * 2 /* 2 bytes per sample */) / impl->FRAME_SIZE; 3509 CHKLOG(rc, SR_EventLogTokenInt_AUDIO(impl->eventLog, impl->osi_log_level, L("BOS_FR"), frame)); 3510 frame = (int)(eos * impl->sampleRate * 2 /* 2 bytes per sample */) / impl->FRAME_SIZE; 3511 CHKLOG(rc, SR_EventLogTokenInt_AUDIO(impl->eventLog, impl->osi_log_level, L("EOS_FR"), frame)); 3512 CHKLOG(rc, SR_EventLogTokenInt_AUDIO(impl->eventLog, impl->osi_log_level, L("INVOCAB"), isInvocab)); 3513 CHKLOG(rc, SR_EventLogEvent_AUDIO(impl->eventLog, impl->osi_log_level, L("ESRwfrd"))); 3514 return ESR_SUCCESS; 3515CLEANUP: 3516 return rc; 3517} 3518 3519ESR_ReturnCode SR_RecognizerSetLockFunctionImpl(SR_Recognizer* self, SR_RecognizerLockFunction function, void* data) 3520{ 3521 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3522 3523 impl->lockFunction = function; 3524 impl->lockData = data; 3525 return ESR_SUCCESS; 3526} 3527 3528static ESR_ReturnCode doSignalQualityInit(SR_RecognizerImpl* impl) 3529{ 3530 CA_DoSignalCheck(impl->wavein, &impl->isSignalClipping, &impl->isSignalDCOffset, 3531 &impl->isSignalNoisy, &impl->isSignalTooQuiet, &impl->isSignalTooFewSamples, 3532 &impl->isSignalTooManySamples); 3533 impl->isSignalQualityInitialized = ESR_TRUE; 3534 return ESR_SUCCESS; 3535} 3536 3537ESR_ReturnCode SR_RecognizerIsSignalClippingImpl(SR_Recognizer* self, ESR_BOOL* isClipping) 3538{ 3539 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3540 ESR_ReturnCode rc; 3541 3542 if (isClipping == NULL) 3543 { 3544 PLogError("SR_RecognizerIsSignalClippingImpl", ESR_INVALID_ARGUMENT); 3545 return ESR_INVALID_ARGUMENT; 3546 } 3547 if (!impl->isSignalQualityInitialized) 3548 CHKLOG(rc, doSignalQualityInit(impl)); 3549 *isClipping = impl->isSignalClipping; 3550 return ESR_SUCCESS; 3551CLEANUP: 3552 return rc; 3553} 3554 3555ESR_ReturnCode SR_RecognizerIsSignalDCOffsetImpl(SR_Recognizer* self, ESR_BOOL* isDCOffset) 3556{ 3557 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3558 ESR_ReturnCode rc; 3559 3560 if (isDCOffset == NULL) 3561 { 3562 PLogError("SR_RecognizerIsSignalDCOffsetImpl", ESR_INVALID_ARGUMENT); 3563 return ESR_INVALID_ARGUMENT; 3564 } 3565 if (!impl->isSignalQualityInitialized) 3566 CHKLOG(rc, doSignalQualityInit(impl)); 3567 *isDCOffset = impl->isSignalDCOffset; 3568 return ESR_SUCCESS; 3569CLEANUP: 3570 return rc; 3571} 3572 3573ESR_ReturnCode SR_RecognizerIsSignalNoisyImpl(SR_Recognizer* self, ESR_BOOL* isNoisy) 3574{ 3575 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3576 ESR_ReturnCode rc; 3577 3578 if (isNoisy == NULL) 3579 { 3580 PLogError("SR_RecognizerIsSignalNoisyImpl", ESR_INVALID_ARGUMENT); 3581 return ESR_INVALID_ARGUMENT; 3582 } 3583 if (!impl->isSignalQualityInitialized) 3584 CHKLOG(rc, doSignalQualityInit(impl)); 3585 *isNoisy = impl->isSignalNoisy; 3586 return ESR_SUCCESS; 3587CLEANUP: 3588 return rc; 3589} 3590 3591ESR_ReturnCode SR_RecognizerIsSignalTooQuietImpl(SR_Recognizer* self, ESR_BOOL* isTooQuiet) 3592{ 3593 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3594 ESR_ReturnCode rc; 3595 3596 if (isTooQuiet == NULL) 3597 { 3598 PLogError("SR_RecognizerIsSignalTooQuietImpl", ESR_INVALID_ARGUMENT); 3599 return ESR_INVALID_ARGUMENT; 3600 } 3601 if (!impl->isSignalQualityInitialized) 3602 CHKLOG(rc, doSignalQualityInit(impl)); 3603 *isTooQuiet = impl->isSignalTooQuiet; 3604 return ESR_SUCCESS; 3605CLEANUP: 3606 return rc; 3607} 3608 3609ESR_ReturnCode SR_RecognizerIsSignalTooFewSamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooFewSamples) 3610{ 3611 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3612 ESR_ReturnCode rc; 3613 3614 if (isTooFewSamples == NULL) 3615 { 3616 PLogError("SR_RecognizerIsSignalTooFewSamplesImpl", ESR_INVALID_ARGUMENT); 3617 return ESR_INVALID_ARGUMENT; 3618 } 3619 if (!impl->isSignalQualityInitialized) 3620 CHKLOG(rc, doSignalQualityInit(impl)); 3621 *isTooFewSamples = impl->isSignalTooFewSamples; 3622 return ESR_SUCCESS; 3623CLEANUP: 3624 return rc; 3625} 3626 3627ESR_ReturnCode SR_RecognizerIsSignalTooManySamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooManySamples) 3628{ 3629 SR_RecognizerImpl* impl = (SR_RecognizerImpl*) self; 3630 ESR_ReturnCode rc; 3631 3632 if (isTooManySamples == NULL) 3633 { 3634 PLogError("SR_RecognizerIsSignalTooManySamplesImpl", ESR_INVALID_ARGUMENT); 3635 return ESR_INVALID_ARGUMENT; 3636 } 3637 if (!impl->isSignalQualityInitialized) 3638 CHKLOG(rc, doSignalQualityInit(impl)); 3639 *isTooManySamples = impl->isSignalTooManySamples; 3640 return ESR_SUCCESS; 3641CLEANUP: 3642 return rc; 3643} 3644 3645 3646 3647/**************************************/ 3648/* Waveform Buffer stuff */ 3649/**************************************/ 3650ESR_ReturnCode WaveformBuffer_Create(WaveformBuffer** waveformBuffer, size_t frame_size) 3651{ 3652 ESR_ReturnCode rc; 3653 WaveformBuffer *buf; 3654 size_t val_size_t; 3655 int val_int; 3656 ESR_BOOL exists; 3657 3658 buf = NEW(WaveformBuffer, L("SR_RecognizerImpl.wvfmbuf")); 3659 if (buf == NULL) 3660 { 3661 rc = ESR_OUT_OF_MEMORY; 3662 PLogError(L("%s: could not create WaveformBuffer"), ESR_rc2str(rc)); 3663 goto CLEANUP; 3664 } 3665 3666 ESR_SessionContains(L("SREC.voice_enroll.bufsz_kB"), &exists); 3667 if (exists) 3668 ESR_SessionGetSize_t(L("SREC.voice_enroll.bufsz_kB"), &val_size_t); 3669 else 3670 val_size_t = DEFAULT_WAVEFORM_BUFFER_MAX_SIZE; 3671 val_size_t *= 1024; /* convert to kB*/ 3672 CHKLOG(rc, CircularBufferCreate(val_size_t, L("SR_RecognizerImpl.wvfmbuf.cbuffer"), &buf->cbuffer)); 3673 3674 ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists); 3675 if (exists) 3676 ESR_SessionGetInt(L("CREC.Frontend.start_windback"), &val_int); 3677 else 3678 val_int = DEFAULT_WAVEFORM_WINDBACK_FRAMES; 3679 val_int *= frame_size; /* convert frames to bytes */ 3680 buf->windback_buffer_sz = (size_t) val_int; 3681 buf->windback_buffer = MALLOC(buf->windback_buffer_sz, L("SR_RecognizerImpl.wvfmbuf.windback")); 3682 if (buf->windback_buffer == NULL) 3683 { 3684 rc = ESR_OUT_OF_MEMORY; 3685 PLogError(L("%s: could not create Waveform windback buffer"), ESR_rc2str(rc)); 3686 goto CLEANUP; 3687 } 3688 3689 3690 ESR_SessionContains(L("SREC.voice_enroll.eos_comfort_frames"), &exists); 3691 if (exists) 3692 ESR_SessionGetSize_t(L("SREC.voice_enroll.eos_comfort_frames"), &val_size_t); 3693 else 3694 val_size_t = DEFAULT_EOS_COMFORT_FRAMES; 3695 buf->eos_comfort_frames = val_size_t; 3696 3697 ESR_SessionContains(L("SREC.voice_enroll.bos_comfort_frames"), &exists); 3698 if (exists) 3699 ESR_SessionGetSize_t(L("SREC.voice_enroll.bos_comfort_frames"), &val_size_t); 3700 else 3701 val_size_t = DEFAULT_BOS_COMFORT_FRAMES; 3702 buf->bos_comfort_frames = val_size_t; 3703 3704 /* initially off */ 3705 buf->state = WAVEFORM_BUFFERING_OFF; 3706 3707 *waveformBuffer = buf; 3708 return ESR_SUCCESS; 3709CLEANUP: 3710 WaveformBuffer_Destroy(buf); 3711 return rc; 3712} 3713 3714ESR_ReturnCode WaveformBuffer_Write(WaveformBuffer* waveformBuffer, void *data, size_t num_bytes) 3715{ 3716 size_t available_bytes; 3717 size_t done_bytes; 3718 3719 /* do nothing if not active */ 3720 switch (waveformBuffer->state) 3721 { 3722 case WAVEFORM_BUFFERING_OFF: 3723 return ESR_SUCCESS; 3724 3725 case WAVEFORM_BUFFERING_ON_CIRCULAR: 3726 available_bytes = CircularBufferGetAvailable(waveformBuffer->cbuffer); 3727 if (available_bytes < num_bytes) 3728 { 3729 done_bytes = CircularBufferSkip(waveformBuffer->cbuffer, num_bytes - available_bytes); 3730 if (done_bytes != num_bytes - available_bytes) 3731 { 3732 PLogError("WaveformBuffer_Write: error when skipping bytes"); 3733 return ESR_INVALID_STATE; 3734 } 3735 } 3736 done_bytes = CircularBufferWrite(waveformBuffer->cbuffer, data, num_bytes); 3737 if (done_bytes != num_bytes) 3738 { 3739 PLogError("WaveformBuffer_Write: error when writing bytes"); 3740 return ESR_INVALID_STATE; 3741 } 3742 return ESR_SUCCESS; 3743 3744 case WAVEFORM_BUFFERING_ON_LINEAR: 3745 available_bytes = CircularBufferGetAvailable(waveformBuffer->cbuffer); 3746 if (available_bytes < num_bytes) 3747 { 3748 waveformBuffer->overflow_count += num_bytes; 3749 return ESR_BUFFER_OVERFLOW; 3750 } 3751 done_bytes = CircularBufferWrite(waveformBuffer->cbuffer, data, num_bytes); 3752 if (done_bytes != num_bytes) 3753 { 3754 PLogError("WaveformBuffer_Write: error when writing bytes"); 3755 return ESR_INVALID_STATE; 3756 } 3757 return ESR_SUCCESS; 3758 3759 default: 3760 PLogError("WaveformBuffer_Write: bad control path"); 3761 return ESR_INVALID_STATE; 3762 } 3763} 3764 3765ESR_ReturnCode WaveformBuffer_Read(WaveformBuffer* waveformBuffer, void *data, size_t* num_bytes) 3766{ 3767 size_t bytes_to_read; 3768 ESR_ReturnCode rc; 3769 3770 if (num_bytes == NULL) 3771 { 3772 rc = ESR_INVALID_ARGUMENT; 3773 PLogError(ESR_rc2str(rc)); 3774 goto CLEANUP; 3775 } 3776 if (waveformBuffer->overflow_count > 0) 3777 { 3778 memset(data, 0, *num_bytes); 3779 *num_bytes = 0; 3780 PLogError(L("WaveformBuffer_Read: previous overflow causes read to return NULL")); 3781 return ESR_SUCCESS; 3782 } 3783 3784 if (waveformBuffer->read_size != 0 && *num_bytes > waveformBuffer->read_size) 3785 { 3786 PLogError(L("ESR_OUT_OF_MEMORY: waveform buffer too small for read, increase from %d to %d"), *num_bytes, waveformBuffer->read_size); 3787 return ESR_OUT_OF_MEMORY; 3788 } 3789 3790 if (waveformBuffer->read_size == 0) 3791 bytes_to_read = *num_bytes; 3792 else 3793 bytes_to_read = MIN(waveformBuffer->read_size, *num_bytes); 3794 waveformBuffer->read_size -= bytes_to_read; 3795 *num_bytes = CircularBufferRead(waveformBuffer->cbuffer, data, bytes_to_read); 3796 if (*num_bytes != bytes_to_read) 3797 { 3798 PLogError("WaveformBuffer_Read: error reading buffer"); 3799 return ESR_INVALID_STATE; 3800 } 3801 return ESR_SUCCESS; 3802CLEANUP: 3803 return rc; 3804} 3805 3806/* WindBack will save the last num_bytes recorded, reset the buffer, and then load the 3807 saved bytes at the beginning of the buffer */ 3808ESR_ReturnCode WaveformBuffer_WindBack(WaveformBuffer* waveformBuffer, const size_t num_bytes) 3809{ 3810 ESR_ReturnCode rc; 3811 size_t bufferSize; 3812 3813 if (num_bytes <= 0) 3814 { 3815 CHKLOG(rc, WaveformBuffer_Reset(waveformBuffer)); 3816 return ESR_SUCCESS; 3817 } 3818 3819 /* make sure windback buffer is big enough */ 3820 if (num_bytes > waveformBuffer->windback_buffer_sz) 3821 { 3822 rc = ESR_OUT_OF_MEMORY; 3823 PLogError(L("%s: windback buffer is too small (needed=%d, had=%d)"), ESR_rc2str(rc), num_bytes, waveformBuffer->windback_buffer_sz); 3824 goto CLEANUP; 3825 } 3826 3827 CHKLOG(rc, WaveformBuffer_GetSize(waveformBuffer, &bufferSize)); 3828 /* skip the first few bytes written */ 3829 if (bufferSize < num_bytes) 3830 { 3831 PLogError("bufferSize %d num_bytes %d (ESR_INVALID_STATE)\n", bufferSize, num_bytes); 3832 bufferSize = 0; 3833 } 3834 else 3835 { 3836 bufferSize -= num_bytes; 3837 } 3838 CHKLOG(rc, WaveformBuffer_Skip(waveformBuffer, bufferSize)); 3839 /* read the last few bytes written */ 3840 bufferSize = num_bytes; 3841 CHKLOG(rc, WaveformBuffer_Read(waveformBuffer, waveformBuffer->windback_buffer, &bufferSize)); 3842 3843 /* reset buffer */ 3844 CHKLOG(rc, WaveformBuffer_Reset(waveformBuffer)); 3845 3846 /* rewrite the saved bytes at the beginning */ 3847 CHKLOG(rc, WaveformBuffer_Write(waveformBuffer, waveformBuffer->windback_buffer, bufferSize)); 3848 return ESR_SUCCESS; 3849CLEANUP: 3850 return rc; 3851} 3852 3853ESR_ReturnCode WaveformBuffer_Destroy(WaveformBuffer* waveformBuffer) 3854{ 3855 if (waveformBuffer->cbuffer) 3856 FREE(waveformBuffer->cbuffer); 3857 if (waveformBuffer->windback_buffer) 3858 FREE(waveformBuffer->windback_buffer); 3859 if (waveformBuffer) 3860 FREE(waveformBuffer); 3861 return ESR_SUCCESS; 3862} 3863 3864ESR_ReturnCode WaveformBuffer_SetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t state) 3865{ 3866 waveformBuffer->state = state; 3867 return ESR_SUCCESS; 3868} 3869 3870ESR_ReturnCode WaveformBuffer_GetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t* state) 3871{ 3872 *state = waveformBuffer->state; 3873 return ESR_SUCCESS; 3874} 3875 3876/** 3877 * @return ESR_BUFFER_OVERFLOW if nametag EOS occured beyond end of buffer 3878 */ 3879ESR_ReturnCode WaveformBuffer_ParseEndPointedResultAndTrim(WaveformBuffer* waveformBuffer, const LCHAR* end_pointed_result, const size_t bytes_per_frame) 3880{ 3881 const LCHAR *p; 3882 size_t bos_frame, eos_frame, bufferSize, read_start_offset; 3883 ESR_ReturnCode rc; 3884 3885 /* potential end pointed results 3886 3887 -pau-@19 tape@36 scan@64 down@88 -pau2-@104 3888 -pau-@19 tape@34 off@55 -pau2-@78 3889 -pau-@19 tape@47 help@66 -pau2-@80 3890 -pau-@16 tape@36 reverse@71 -pau2-@91 3891 -pau-@21 tape@42 scan@59 down@80 -pau2-@91 3892 3893 what I need to extract is the integer between "-pau-@" and ' ' 3894 and the integer between '@' and " -pau2-" 3895 */ 3896 3897 3898 p = LSTRSTR( end_pointed_result, PREFIX_WORD); 3899 if(p) p+=PREFIX_WORD_LEN; while(p && *p == '@') p++; 3900 rc = p ? lstrtoui(p, &bos_frame, 10) : ESR_INVALID_ARGUMENT; 3901 if (rc == ESR_INVALID_ARGUMENT) 3902 { 3903 PLogError(L("%s: extracting bos from text=%s"), ESR_rc2str(rc), end_pointed_result); 3904 goto CLEANUP; 3905 } 3906 else if (rc != ESR_SUCCESS) 3907 goto CLEANUP; 3908 3909 p = LSTRSTR( end_pointed_result, SUFFIX_WORD); 3910 while(p && p>end_pointed_result && p[-1]!='@') --p; 3911 rc = p ? lstrtoui(p, &eos_frame, 10) : ESR_INVALID_ARGUMENT; 3912 if (rc == ESR_INVALID_ARGUMENT) 3913 { 3914 PLogError(L("%s: extracting eos from text=%s"), ESR_rc2str(rc), end_pointed_result); 3915 goto CLEANUP; 3916 } 3917 else if (rc != ESR_SUCCESS) 3918 goto CLEANUP; 3919 3920 bos_frame -= (bos_frame > waveformBuffer->bos_comfort_frames ? waveformBuffer->bos_comfort_frames : 0); 3921 eos_frame += waveformBuffer->eos_comfort_frames; 3922 3923 /* 3924 * I know where speech started, so I want to skip frames 0 to bos_frame. 3925 * I also know where speech ended so I want to set the amount of frames(bytes) to read for 3926 * the nametag audio buffer (i.e. the read_size) 3927 */ 3928 3929 read_start_offset = bos_frame * bytes_per_frame * 2 /* times 2 because of skip even frames */; 3930 waveformBuffer->read_size = (eos_frame - bos_frame) * bytes_per_frame * 2 /* times 2 because of skip even frames */; 3931 3932 CHKLOG(rc, WaveformBuffer_GetSize(waveformBuffer, &bufferSize)); 3933 if (read_start_offset + waveformBuffer->read_size > bufferSize) 3934 { 3935 waveformBuffer->overflow_count += read_start_offset + waveformBuffer->read_size - bufferSize; 3936 passert(waveformBuffer->overflow_count > 0); 3937 PLogMessage(L("Warning: Voice Enrollment audio buffer overflow (spoke too much, over by %d bytes)"), 3938 waveformBuffer->overflow_count); 3939 return ESR_BUFFER_OVERFLOW; 3940 } 3941 CHKLOG(rc, WaveformBuffer_Skip(waveformBuffer, read_start_offset)); 3942#ifdef SREC_ENGINE_VERBOSE_LOGGING 3943 PLogMessage(L("Voice Enrollment: bos@%d, eos@%d, therefore sizeof(waveform) should be %d"), bos_frame, eos_frame, waveformBuffer->read_size); 3944#endif 3945 return ESR_SUCCESS; 3946CLEANUP: 3947 return rc; 3948} 3949 3950 3951ESR_ReturnCode WaveformBuffer_Reset(WaveformBuffer* waveformBuffer) 3952{ 3953 CircularBufferReset(waveformBuffer->cbuffer); 3954 waveformBuffer->overflow_count = 0; 3955 waveformBuffer->read_size = 0; 3956 return ESR_SUCCESS; 3957} 3958 3959ESR_ReturnCode WaveformBuffer_GetSize(WaveformBuffer* waveformBuffer, size_t* size) 3960{ 3961 *size = CircularBufferGetSize(waveformBuffer->cbuffer); 3962 return ESR_SUCCESS; 3963} 3964 3965ESR_ReturnCode WaveformBuffer_Skip(WaveformBuffer* waveformBuffer, const size_t bytes) 3966{ 3967 if (CircularBufferSkip(waveformBuffer->cbuffer, bytes) != (int) bytes) 3968 return ESR_INVALID_STATE; 3969 return ESR_SUCCESS; 3970} 3971 3972 3973 3974static ESR_ReturnCode SR_Recognizer_Reset_Buffers ( SR_RecognizerImpl *impl ) 3975 { 3976 ESR_ReturnCode reset_status; 3977 3978 FREE ( impl->audioBuffer ); 3979 impl->audioBuffer = NULL; 3980 impl->audioBuffer = MALLOC ( impl->FRAME_SIZE, MTAG ); 3981 3982 if ( impl->audioBuffer != NULL ) 3983 { 3984 WaveformBuffer_Destroy ( impl->waveformBuffer ); 3985 impl->waveformBuffer = NULL; 3986 reset_status = WaveformBuffer_Create ( &impl->waveformBuffer, impl->FRAME_SIZE ); 3987 } 3988 else 3989 { 3990 reset_status = ESR_OUT_OF_MEMORY; 3991 } 3992 return ( reset_status ); 3993 } 3994 3995 3996 3997static ESR_ReturnCode SR_Recognizer_Validate_Sample_Rate ( size_t sample_rate ) 3998 { 3999 ESR_ReturnCode validate_status; 4000 4001 switch ( sample_rate ) 4002 { 4003 case 8000: 4004 case 11025: 4005 case 16000: 4006 case 22050: 4007 validate_status = ESR_SUCCESS; 4008 break; 4009 4010 default: 4011 validate_status = ESR_INVALID_ARGUMENT; 4012 break; 4013 } 4014 return ( validate_status ); 4015 } 4016 4017 4018 4019static ESR_ReturnCode SR_Recognizer_Sample_Rate_Needs_Change ( size_t new_sample_rate, ESR_BOOL *needs_changing ) 4020 { 4021 ESR_ReturnCode validate_status; 4022 size_t current_sample_rate; 4023 4024 validate_status = ESR_SessionGetSize_t ( "CREC.Frontend.samplerate", ¤t_sample_rate ); 4025 4026 if ( validate_status == ESR_SUCCESS ) 4027 { 4028 if ( new_sample_rate != current_sample_rate ) 4029 *needs_changing = ESR_TRUE; 4030 else 4031 *needs_changing = ESR_TRUE; 4032 } 4033 return ( validate_status ); 4034 } 4035 4036 4037 4038static ESR_ReturnCode SR_Recognizer_Change_Sample_Rate_Session_Params_8K ( void ) 4039 { 4040 ESR_ReturnCode change_status; 4041 LCHAR model_filenames [P_PATH_MAX]; 4042 LCHAR lda_filename [P_PATH_MAX]; 4043 size_t filename_length; 4044 4045 filename_length = P_PATH_MAX; 4046 change_status = ESR_SessionGetLCHAR ( L("cmdline.modelfiles8"), model_filenames, &filename_length ); 4047 4048 if ( change_status == ESR_SUCCESS ) 4049 { 4050 filename_length = P_PATH_MAX; 4051 change_status = ESR_SessionGetLCHAR ( L("cmdline.lda8"), lda_filename, &filename_length ); 4052 4053/* From this point on, if an error occurs, we're screwed and recovery is probably impossible */ 4054 if ( change_status == ESR_SUCCESS ) 4055 { 4056 change_status = ESR_SessionSetSize_t ( "CREC.Frontend.samplerate", 8000 ); 4057 if ( change_status == ESR_SUCCESS ) 4058 { 4059 change_status = ESR_SessionSetInt ( "CREC.Frontend.highcut", 4000 ); 4060 4061 if ( change_status == ESR_SUCCESS ) 4062 { 4063 change_status = ESR_SessionSetLCHAR ( L("cmdline.modelfiles"), model_filenames ); 4064 4065 if ( change_status == ESR_SUCCESS ) 4066 change_status = ESR_SessionSetLCHAR ( L("cmdline.lda"), lda_filename ); 4067 } 4068 } 4069 } 4070 else 4071 { 4072 PLogError (L("\nMissing Parameter lda8\n")); 4073 } 4074 } 4075 else 4076 { 4077 PLogError (L("\nMissing Parameter models8\n")); 4078 } 4079 return ( change_status ); 4080 } 4081 4082 4083 4084static ESR_ReturnCode SR_Recognizer_Change_Sample_Rate_Session_Params_11K_to_22K ( size_t sample_rate ) 4085 { 4086 ESR_ReturnCode change_status; 4087 LCHAR model_filenames [P_PATH_MAX]; 4088 LCHAR lda_filename [P_PATH_MAX]; 4089 size_t filename_length; 4090 4091 filename_length = P_PATH_MAX; 4092 change_status = ESR_SessionGetLCHAR ( L("cmdline.modelfiles11"), model_filenames, &filename_length ); 4093 4094 if ( change_status == ESR_SUCCESS ) 4095 { 4096 filename_length = P_PATH_MAX; 4097 change_status = ESR_SessionGetLCHAR ( L("cmdline.lda11"), lda_filename, &filename_length ); 4098 4099/* From this point on, if an error occurs, we're screwed and recovery is probably impossible */ 4100 if ( change_status == ESR_SUCCESS ) 4101 { 4102 change_status = ESR_SessionSetSize_t ( "CREC.Frontend.samplerate", sample_rate ); 4103 4104 if ( change_status == ESR_SUCCESS ) 4105 { 4106 change_status = ESR_SessionSetInt ( "CREC.Frontend.highcut", 5500 ); 4107 4108 if ( change_status == ESR_SUCCESS ) 4109 { 4110 change_status = ESR_SessionSetLCHAR ( L("cmdline.modelfiles"), model_filenames ); 4111 4112 if ( change_status == ESR_SUCCESS ) 4113 change_status = ESR_SessionSetLCHAR ( L("cmdline.lda"), lda_filename ); 4114 } 4115 } 4116 } 4117 else 4118 { 4119 PLogError (L("\nMissing Parameter lda11\n")); 4120 } 4121 } 4122 else 4123 { 4124 PLogError (L("\nMissing Parameter models11\n")); 4125 } 4126 return ( change_status ); 4127 } 4128 4129 4130 4131static ESR_ReturnCode SR_Recognizer_Change_Sample_Rate_Session_Params ( size_t new_sample_rate ) 4132 { 4133 ESR_ReturnCode change_status; 4134 4135 if ( new_sample_rate == 8000 ) 4136 change_status = SR_Recognizer_Change_Sample_Rate_Session_Params_8K ( ); 4137 else 4138 change_status = SR_Recognizer_Change_Sample_Rate_Session_Params_11K_to_22K ( new_sample_rate ); 4139 4140 return ( change_status ); 4141 } 4142 4143 4144 4145ESR_ReturnCode SR_Recognizer_Change_Sample_RateImpl ( SR_Recognizer *recognizer, size_t new_sample_rate ) 4146 { 4147 ESR_ReturnCode change_status; 4148 ESR_BOOL rate_needs_changing; 4149 SR_RecognizerImpl *impl; 4150 CA_FrontendInputParams *frontendParams; 4151 4152 change_status = SR_Recognizer_Validate_Sample_Rate ( new_sample_rate ); 4153 4154 if ( change_status == ESR_SUCCESS ) 4155 { 4156 change_status = SR_Recognizer_Sample_Rate_Needs_Change ( new_sample_rate, &rate_needs_changing ); 4157 4158 if ( change_status == ESR_SUCCESS ) 4159 { 4160 if ( rate_needs_changing == ESR_TRUE ) 4161 { 4162 change_status = SR_Recognizer_Change_Sample_Rate_Session_Params ( new_sample_rate ); 4163 4164 if ( change_status == ESR_SUCCESS ) 4165 { // SR_RecognizerCreateFrontendImpl 4166 impl = (SR_RecognizerImpl *)recognizer; 4167 change_status = SR_RecognizerUnsetupImpl( recognizer ); 4168 4169 if ( change_status == ESR_SUCCESS ) 4170 { 4171 CA_UnconfigureFrontend ( impl->frontend ); 4172 frontendParams = CA_AllocateFrontendParameters ( ); 4173 4174 if ( frontendParams != NULL ) 4175 { 4176 change_status = SR_RecognizerGetFrontendLegacyParametersImpl ( frontendParams ); 4177 4178 if ( change_status == ESR_SUCCESS ) 4179 { 4180 CA_ConfigureFrontend ( impl->frontend, frontendParams ); 4181 CA_UnconfigureWave ( impl->wavein ); 4182 CA_ConfigureWave ( impl->wavein, impl->frontend ); 4183 impl->sampleRate = new_sample_rate; 4184 impl->FRAME_SIZE = impl->sampleRate / FRAMERATE * SAMPLE_SIZE; 4185 change_status = SR_Recognizer_Reset_Buffers ( impl ); 4186 4187 if ( change_status == ESR_SUCCESS ) 4188 { 4189 change_status = SR_RecognizerSetupImpl( recognizer ); 4190 4191 if ( change_status == ESR_SUCCESS ) 4192 change_status = SR_AcousticStateReset ( recognizer ); 4193 } 4194 else 4195 { 4196 SR_RecognizerSetupImpl( recognizer ); /* Otherwise recognizer is in bad state */ 4197 } 4198 } 4199 CA_FreeFrontendParameters ( frontendParams ); 4200 } 4201 else 4202 { 4203 SR_RecognizerSetupImpl( recognizer ); /* Otherwise recognizer is in bad state */ 4204 change_status = ESR_OUT_OF_MEMORY; 4205 } 4206 } 4207 } 4208 } 4209 } 4210 } 4211 return ( change_status ); 4212 } 4213 4214 4215