1/*---------------------------------------------------------------------------*
2 *  test_g2g.c  *
3 *                                                                           *
4 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
5 *                                                                           *
6 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7 *  you may not use this file except in compliance with the License.         *
8 *                                                                           *
9 *  You may obtain a copy of the License at                                  *
10 *      http://www.apache.org/licenses/LICENSE-2.0                           *
11 *                                                                           *
12 *  Unless required by applicable law or agreed to in writing, software      *
13 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 *  See the License for the specific language governing permissions and      *
16 *  limitations under the License.                                           *
17 *                                                                           *
18 *---------------------------------------------------------------------------*/
19
20
21
22#include "pstdio.h"
23#include "pmemory.h"
24#include "plog.h"
25#include "HashMap.h"
26#include "SR_Grammar.h"
27#include "SR_Vocabulary.h"
28#include "SR_SemanticResult.h"
29#include "ESR_Session.h"
30#include "ESR_Locale.h"
31#include "ESR_CommandLine.h"
32#include "LCHAR.h"
33
34#include "PFileSystem.h"
35#include "PANSIFileSystem.h"
36
37#include "SR_GrammarImpl.h"
38
39#include "simapi.h"
40#include "srec_context.h"
41#include "srec_arb.h"
42
/**
 * Summary record for one tree, filled in by traverse_tree().
 *
 * Only nnodes and the four low and high range fields are touched by the code in
 * this file; the remaining fields are never read or written here.
 */
typedef struct
{
  unsigned short nnodes;          /* incremented once per node visited by traverse_tree() */
  unsigned long  size;            /* not used in this file */
  long           phoneme;         /* not used in this file */
  unsigned short node_pos;        /* not used in this file */
  unsigned long  node_off;        /* not used in this file */
  short          low_genone_no;   /* set together with low_pel_no by traverse_tree() */
  short          high_genone_no;  /* set together with high_pel_no by traverse_tree() */
  short          low_pel_no;      /* smallest term.pelid seen on a terminal node */
  short          high_pel_no;     /* largest term.pelid seen on a terminal node */
} tree_head;
59
60
61int usage(LCHAR* exename)
62{
63  pfprintf(PSTDOUT,"usage: %s -base <basefilename> \n",exename);
64  pfprintf(PSTDOUT,"<basefilename> can be a file.g2g or @g2gfilelist\n");
65  pfprintf(PSTDOUT,"[-checkword id] .. also checks word id in the file\n");
66  pfprintf(PSTDOUT,"[-swiarb esr/config/lang/models/generic.swiarb] ... enables word check\n");
67  return 1;
68}
69
70/* protos */
71ESR_ReturnCode find_phonemes_for_ihmms( CA_Arbdata* ca_arbdata, modelID* ihmms, int num_hmms);
72ESR_ReturnCode Parse(SR_Grammar* grammar, LCHAR* trans, PFile* fout);
73int CheckG2G(CA_Arbdata* arbdata, int* p4pTable, const char* base, int wordid, char* outbase);
74void load_filelist(char* filelist, char*** pfiles, int *pnum_files);
75int *phonemecode_for_pel_table(CA_Arbdata* arbdata);
76
77int debug = 0;
78#define MAX_LINE_LENGTH 256
79#define MAX_STR_LENGTH   80
80#define MAX_SEM_RESULTS   3
81#define MAX_KEYS         30
82
83/* main */
84
85int main (int argc, char **argv)
86{
87  ESR_ReturnCode rc;
88  LCHAR base[P_PATH_MAX] = L("");
89  int i;
90  CA_Arbdata* ca_arbdata;
91  char*  arbfile = NULL;
92  char** g2glist;
93  int g2glist_len;
94  char* outbase = NULL;
95  int *p4pTable;
96  int wordid = 0;
97  int log_level = 0;
98
99  /*
100   * Initialize portable library.
101   */
102  CHKLOG(rc, PMemInit());
103/*  CHKLOG(rc, PFileSystemCreate());
104  CHKLOG(rc, PANSIFileSystemCreate());
105  CHKLOG(rc, PANSIFileSystemAddPath(L("/dev/ansi"), L("/")));*/
106
107  /* Set ANSI file-system as default file-system */
108/*  CHKLOG(rc, PANSIFileSystemSetDefault(ESR_TRUE));*/
109  /* Set virtual current working directory to native current working directory */
110/*  len = P_PATH_MAX;
111  CHKLOG(rc, PANSIFileSystemGetcwd(cwd, &len));
112  CHKLOG(rc, PFileSystemChdir(cwd));*/
113
114  if( argc <= 1)
115	{
116    usage(argv[0]);
117    exit(EXIT_FAILURE);
118  }
119
120	for (i = 1; i < argc; ++i)
121	{
122		if(!LSTRCMP(argv[i], L("-base")))
123		{
124			++i;
125			LSTRCPY(base, argv[i]);
126		}
127		else if(!LSTRCMP(argv[i],L("-out")))
128		{
129			outbase = argv[++i];
130		}
131		else if(!LSTRCMP(argv[i],L("-swiarb")))
132		{
133			arbfile = argv[++i];
134		}
135		else if(!LSTRCMP(argv[i],L("-checkword")))
136		{
137			wordid = atoi(argv[++i]);
138		}
139		else if(!LSTRCMP(argv[i],L("-log")))
140		{
141			log_level = 10;
142		}
143		else
144		{
145			printf("unrecog'd argument %s\n", argv[i]);
146			exit(1);
147		}
148	}
149
150	CHK(rc, PLogInit(NULL, log_level));
151
152	if(arbfile) {
153		ca_arbdata = CA_LoadArbdata(arbfile);
154		if(!ca_arbdata) {
155      pfprintf(PSTDOUT, "Error: loading arbfile %s\n", arbfile);
156      goto CLEANUP;
157    }
158    pfprintf(PSTDOUT, "arbdata done\n");
159    p4pTable  = phonemecode_for_pel_table(ca_arbdata);
160    pfprintf(PSTDOUT, "p4pTable done\n");
161  } else {
162    ca_arbdata = 0;
163    p4pTable = 0;
164  }
165
166  if(base[0] == '@') {
167    load_filelist(base+1, &g2glist, &g2glist_len);
168    pfprintf(PSTDOUT, "g2glist %s .. %d entries\n", g2glist_len);
169    for(i=0; i<g2glist_len; i++)
170      CheckG2G( ca_arbdata, p4pTable, g2glist[i], wordid, outbase);
171  }
172  else {
173    CheckG2G( ca_arbdata, p4pTable, base, wordid, outbase);
174  }
175
176CLEANUP:
177  PLogShutdown();
178/*  PANSIFileSystemDestroy();
179  PFileSystemDestroy();*/
180  PMemSetLogFile(PSTDOUT);
181  PMemDumpLogFile();
182  PMemShutdown();
183  return rc;
184}
185
186int CheckG2G(CA_Arbdata* ca_arbdata, int* p4pTable, const char* base, int wordid, char* outbase)
187{
188  ESR_ReturnCode rc;
189  SR_GrammarImpl *grammarImpl;
190  SR_Grammar* grammar = NULL;
191  srec_context* fst;
192  CA_Syntax* syntax;
193  modelID ilabels_preceding[64], num_ilabels_preceding;
194  modelID ilabels_following[64], num_ilabels_following;
195  modelID ilabels[128], num_ilabels;
196  int i,j;
197  unsigned long g2gsize;
198
199  if(1) {
200    FILE* fp;
201    fp = fopen(base, "rb");
202    if(!fp) g2gsize = 0;
203    else {
204      fseek(fp, 0, SEEK_END);
205      g2gsize = ftell(fp);
206      fclose(fp);
207    }
208  }
209
210  rc = SR_GrammarLoad(base, &grammar);
211  if(rc != ESR_SUCCESS) {
212    pfprintf(PSTDOUT, "%s failed at load\n", base);
213    goto CLEANUP;
214  }
215
216  grammarImpl = (SR_GrammarImpl*)grammar;
217  syntax = grammarImpl->syntax;
218  if(outbase) {
219    CA_DumpSyntax( syntax, outbase);
220  }
221
222  fst = syntax->synx;
223  pfprintf(PSTDOUT, "%s %d arcs %d/%d/%d nodes %d/%d/%d words %d/%d chars %d/%d modelver %d\n",
224	   base, g2gsize,
225		 fst->num_arcs, fst->num_base_arcs, fst->FSMarc_list_len,
226		 fst->num_nodes, fst->num_base_nodes, fst->FSMnode_list_len,
227	   fst->olabels->num_words, fst->olabels->max_words,
228	   fst->olabels->next_chars-fst->olabels->chars,
229	   fst->olabels->max_chars,
230#ifdef IMAGE_FORMAT_V2
231	   fst->modelid
232#else
233	   -1
234#endif
235	   );
236
237  if(wordid == 0 || ca_arbdata == 0)
238    goto CLEANUP;
239
240  if(wordid >= fst->olabels->num_words) {
241    pfprintf(PSTDOUT, "%s failed 'cuz numwords(%d) < %d\n", base,
242	     fst->olabels->num_words, wordid);
243    goto CLEANUP;
244  }
245
246  for(i=0; i<fst->num_arcs; i++) {
247    if(fst->FSMarc_list[i].olabel == wordid) {
248      FSMnode* node;
249      FSMarc* arc = &fst->FSMarc_list[i];
250      nodeID fr_node = arc->fr_node;
251      arcID iarc;
252      ilabels_following[0] = arc->ilabel;
253      num_ilabels_following = 1;
254      num_ilabels_preceding = 0;
255      for( ; fr_node!=fst->start_node; fr_node=arc->fr_node) {
256	node = &fst->FSMnode_list[fr_node];
257	iarc = node->first_prev_arc;
258	for( ; iarc!=MAXarcID; iarc=arc->linkl_prev_arc) {
259	  arc = &fst->FSMarc_list[iarc];
260	  if(arc->fr_node != fr_node) break;
261	}
262	if(iarc == MAXarcID) {
263	  pfprintf(PSTDOUT, "%s failed at 11\n", base);
264	  goto CLEANUP;
265	}
266	if(arc->ilabel == WORD_BOUNDARY) break;
267	ilabels_preceding[num_ilabels_preceding++] = arc->ilabel;
268      }
269      arc = &fst->FSMarc_list[i];
270      fr_node = arc->to_node;
271      for( ; fr_node!=fst->end_node; fr_node=arc->to_node) {
272	node = &fst->FSMnode_list[fr_node];
273	iarc = node->un_ptr.first_next_arc;
274	for( ; iarc!=MAXarcID; iarc=arc->linkl_next_arc) {
275	  arc = &fst->FSMarc_list[iarc];
276	  if(arc->to_node != fr_node) break;
277	}
278	if(iarc == MAXarcID) {
279	  pfprintf(PSTDOUT, "%s failed at 12\n", base);
280	  goto CLEANUP;
281	}
282	ilabels_following[num_ilabels_following++] = arc->ilabel;
283	if(arc->ilabel == WORD_BOUNDARY) break;
284      }
285      num_ilabels = 0;
286      for(j=0; j<num_ilabels_preceding; j++)
287	ilabels[num_ilabels++] = ilabels_preceding[num_ilabels_preceding-1-j];
288      for(j=0; j<num_ilabels_following; j++)
289	ilabels[num_ilabels++] = ilabels_following[j];
290      if(ilabels[num_ilabels-1] == WORD_BOUNDARY)
291	num_ilabels--;
292      for(j=0; j<num_ilabels; j++) {
293	if(ilabels[j]<fst->hmm_ilabel_offset) {
294	  pfprintf(PSTDOUT, "%s failed at 15\n", base);
295	  goto CLEANUP;
296	} else
297	  ilabels[j] = ilabels[j] - (labelID)fst->hmm_ilabel_offset;
298      }
299      pfprintf(PSTDOUT, "%s (W%d) ihmms ", fst->olabels->words[wordid], wordid);
300      for(j=0;j<num_ilabels;j++)
301	pfprintf(PSTDOUT, " %d", ilabels[j]);
302      pfprintf(PSTDOUT, "\n");
303      if(num_ilabels < 2) {
304	pfprintf(PSTDOUT, "%s failed at 1\n", base);
305	goto CLEANUP;
306      }
307      if(p4pTable)
308	rc = find_phonemes_for_ihmms( ca_arbdata, ilabels, num_ilabels);
309      else {
310	rc = ESR_SUCCESS;
311	for(j=0; j<num_ilabels; j++) {
312	  if(p4pTable[ ilabels[j]]<0) {
313	    rc = ESR_NO_MATCH_ERROR;
314	    ilabels[j] = MAXmodelID;
315	  } else {
316	    ilabels[j] = (modelID)p4pTable[ ilabels[j]];
317	  }
318	}
319      }
320
321      if(rc) {
322	pfprintf(PSTDOUT, "%s failed at 2\n", base);
323	goto CLEANUP;
324      }
325      pfprintf(PSTDOUT, "%s ", fst->olabels->words[wordid]);
326      for(j=0;j<num_ilabels;j++) pfprintf(PSTDOUT, "%c", ilabels[j]);
327      pfprintf(PSTDOUT, "\n");
328      rc = Parse( grammar, fst->olabels->words[wordid], PSTDOUT);
329      if(rc) {
330	pfprintf(PSTDOUT, "%s failed at 3\n", base);
331	goto CLEANUP;
332      }
333      pfprintf(PSTDOUT, "%s PASSED (on %s)\n", base, fst->olabels->words[wordid]);
334      break;
335    }
336  }
337
338  return 0;
339 CLEANUP:
340  if(grammar) SR_GrammarDestroy(grammar);
341  return 1;
342
343}
344
345
346int traverse_tree(tree_node* node, tree_head *tree_topo, int *num_terminal_nodes)
347{
348  if(node)
349    tree_topo->nnodes++;
350
351  if(node->node.quest_index < 0) {
352    if(num_terminal_nodes)
353      (*num_terminal_nodes)++;
354    if( node->term.pelid < tree_topo->low_pel_no)
355      tree_topo->low_pel_no = tree_topo->low_genone_no = node->term.pelid;
356    if( node->term.pelid > tree_topo->high_pel_no)
357      tree_topo->high_pel_no = tree_topo->high_genone_no = node->term.pelid;
358  } else {
359    traverse_tree( (tree_node*)node->node.fail, tree_topo, num_terminal_nodes);
360    traverse_tree( (tree_node*)node->node.pass, tree_topo, num_terminal_nodes);
361  }
362  return 0;
363
364}
365
366int num_nodes_in_tree(tree_node* node, int *num_terminal_nodes)
367{
368  tree_head topo;
369  *num_terminal_nodes = 0;
370  topo.nnodes = 0;
371  traverse_tree(node, &topo, num_terminal_nodes);
372  return topo.nnodes;
373}
374
375ESR_ReturnCode find_phonemes_for_ihmms( CA_Arbdata* ca_arbdata, modelID* ihmms, int num_ihmms)
376{
377  int ii, i;
378  int num_hmms_in_phoneme;
379  tree_head topo;
380  srec_arbdata* a = (srec_arbdata*)ca_arbdata;
381  int num_phonemes_for_ihmms = 0;
382
383  for(ii=0; ii<num_ihmms; ii++) {
384    for(i=0; i<a->num_phonemes; i++) {
385      num_hmms_in_phoneme = 0;
386      topo.low_pel_no  = 32567;
387      topo.high_pel_no = 0;
388      traverse_tree(a->pdata[i].model_nodes, &topo, &num_hmms_in_phoneme);
389      if(debug)printf("phoneme %d num_hmms %d (%d-%d)\n", i, num_hmms_in_phoneme,
390		      topo.low_pel_no, topo.high_pel_no);
391      if(ihmms[ii] >= topo.low_pel_no && ihmms[ii]<= topo.high_pel_no) {
392	ihmms[ii] = (modelID)i;
393	num_phonemes_for_ihmms++;
394	break;
395      }
396    }
397    if( i==a->num_phonemes) {
398      if(ihmms[ii]<=5) {
399	ihmms[ii] = 0;
400	num_phonemes_for_ihmms++;
401      } else {
402	PLogError("error: could not find hmm%d under any phoneme! ",ihmms[ii]);
403      }
404    }
405
406  }
407  if(num_phonemes_for_ihmms != num_ihmms)
408    return ESR_INVALID_ARGUMENT;
409  else {
410    for(ii=0; ii<num_ihmms; ii++) ihmms[ii] =  a->pdata[ ihmms[ii]].code;
411    return ESR_SUCCESS;
412  }
413}
414
415void display_results(SR_SemanticResult *result, PFile* fout)
416{
417  size_t i, size, len;
418  LCHAR* keys[MAX_KEYS]; /* array of pointers to strings */
419  LCHAR  value[MAX_STR_LENGTH];
420  ESR_ReturnCode rc;
421
422  size = MAX_KEYS;
423  rc = result->getKeyList(result, (LCHAR**) &keys, &size); /* get the key list */
424  if(rc == ESR_SUCCESS)
425  {
426    for(i=0; i<size; i++)
427    {
428      len = MAX_STR_LENGTH;
429      if ((rc = result->getValue(result,keys[i],value,&len)) == ESR_SUCCESS)
430        pfprintf(fout,"{%s : %s}\n",keys[i],value);
431      else
432        pfprintf(fout,"Error: %s\n",ESR_rc2str(rc));
433    }
434  }
435  else
436    pfprintf(fout,"Error: %s\n",ESR_rc2str(rc));
437}
438
439ESR_ReturnCode Parse(SR_Grammar* grammar, LCHAR* trans, PFile* fout)
440{
441  ESR_ReturnCode rc;
442  int i, result_count;
443  SR_SemanticResult* semanticResults[MAX_SEM_RESULTS];
444
445  result_count = MAX_SEM_RESULTS; /* initially not greater than MAX */
446  for(i =0; i<result_count; i++)
447    SR_SemanticResultCreate(&semanticResults[i]); /* create the result holders */
448
449  lstrtrim(trans);
450
451  rc = grammar->checkParse(grammar, trans, semanticResults, (size_t*) &result_count);
452  if(rc != ESR_SUCCESS)
453    return rc;
454
455  if(result_count < 1)
456  {
457    pfprintf(fout,"no parse\n\n");
458    return ESR_NO_MATCH_ERROR;
459  }
460  else
461  {
462    pfprintf(fout,"parse ok (%d results)\n", result_count);
463    for(i=0; i < result_count; i++)
464      display_results(semanticResults[i],fout);
465
466    for(i=0; i < MAX_SEM_RESULTS; i++)
467    {
468      rc = semanticResults[i]->destroy(semanticResults[i]);
469      if(rc != ESR_SUCCESS)
470        return rc;
471    }
472    return ESR_SUCCESS;
473  }
474}
475
476void load_filelist(char* filelist, char*** pfiles, int *pnum_files)
477{
478  int i = 0;
479  FILE* fp;
480  char line[512];
481  char **files = 0, *file;
482  int num_files = 0;
483
484  fp = fopen(filelist, "r");
485  if(!fp) {
486    pfprintf(PSTDOUT, "failed to open %s\n", filelist);
487    goto DONE;
488  }
489
490  while( fgets(line, sizeof(line), fp)) {
491    if(line[0] == '#') continue;
492    i++;
493  }
494  fclose(fp);
495
496  num_files = i;
497  *files = CALLOC( num_files, sizeof(char*), __FILE__);
498  fp = fopen(filelist, "r");
499  for(i=0; fgets(line,sizeof(line),fp) && i<num_files; i++) {
500    if(line[0] == '#') continue;
501    strtok(line,"\n\r\t");
502    file = files[i++] = CALLOC(strlen(line)+1,sizeof(char),__FILE__);
503    strcpy( file, line);
504  }
505  fclose(fp);
506  num_files = i;
507
508 DONE:
509  *pfiles = files;
510  *pnum_files = num_files;
511}
512
513int* phonemecode_for_pel_table(CA_Arbdata* ca_arbdata)
514{
515  static int table[2048];
516  int i,j;
517  tree_head topo;
518  srec_arbdata* a = (srec_arbdata*)ca_arbdata;
519  int num_hmms_in_phoneme;
520
521  for(j=0; j< (int)(sizeof(table)/sizeof(int)); j++)
522    table[j] = 0;
523
524  for(i=0; i<a->num_phonemes; i++) {
525    num_hmms_in_phoneme = 0;
526    topo.low_pel_no  = 32567;
527    topo.high_pel_no = 0;
528    traverse_tree(a->pdata[i].model_nodes, &topo, &num_hmms_in_phoneme);
529    if(debug)printf("phoneme %d num_hmms %d (%d-%d)\n", i, num_hmms_in_phoneme,
530		    topo.low_pel_no, topo.high_pel_no);
531
532    for(j=topo.low_pel_no; j<=topo.high_pel_no; j++)
533      table[j] = a->pdata[i].code;
534  }
535  return &table[0];
536}
537