1/*---------------------------------------------------------------------------*
2 *  grxmldoc.cpp  *
3 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
5 *                                                                           *
6 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7 *  you may not use this file except in compliance with the License.         *
8 *                                                                           *
9 *  You may obtain a copy of the License at                                  *
10 *      http://www.apache.org/licenses/LICENSE-2.0                           *
11 *                                                                           *
12 *  Unless required by applicable law or agreed to in writing, software      *
13 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 *  See the License for the specific language governing permissions and      *
16 *  limitations under the License.                                           *
17 *                                                                           *
18 *---------------------------------------------------------------------------*/
19
20#include <assert.h>
21#include <stdlib.h>
22#include <fstream>
23#include <sstream>
24#include <iostream>
25#include <algorithm> // for std::sort
26#include "tinyxml.h"
27#include "grph.h"       // The word graph object and interface
28#include "sub_grph.h"	// The sub-graph object and interface
29#include "hashmap.h"
30#include "grxmldoc.h"
31#include "ESR_Session.h"
32//#include "LCHAR.h"
33
// Set GRXML_DEBUG to 1 to enable the DEBUG_PRINT trace output below.
#define GRXML_DEBUG 0
#define MAX_PATH_NAME 512

// FATAL_ERROR(msg, code): print msg on stdout and terminate with exit(code).
// It expands to a braced block, so call sites may omit the trailing ';'.
#define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); }

// WARNING(msg): print msg on stdout. The expansion already ends with ';'.
#define WARNING(x) std::cout << (x) << std::endl;

// DEBUG_PRINT / PRINT_EXPRESSION: trace helpers. PRINT_EXPRESSION is
// disabled in both configurations; to enable it, define it the same way
// as DEBUG_PRINT in the debug branch.
#if GRXML_DEBUG
#  define DEBUG_PRINT(x) std::cout << (x) << std::endl;
#  define PRINT_EXPRESSION(x)
#else
#  define DEBUG_PRINT(x)
#  define PRINT_EXPRESSION(x)
#endif
50
51using namespace std;
52
// CHECK_NOT_EMPTY(s, t): report on stdout when string s is empty; t names
// the kind of string being checked. This only reports; execution continues.
// The argument is parenthesized so that expressions (e.g. a + b) are safe.
#define CHECK_NOT_EMPTY(s, t) { if ((s).empty()) \
				{ \
				std::cout << "ERROR: Empty string of type "  << t <<std::endl; \
				} \
			     }
58
/** Returns true when s is non-empty and its first character is a decimal digit. */
static bool beginsWithDigit(const std::string& s)
{
  return !s.empty() && s[0] >= '0' && s[0] <= '9';
}

/**
 * Parses a repeat/count specification into a minimum and maximum count.
 *
 * Accepted forms:
 *   "m-n"  -> *minCnt = m, *maxCnt = n
 *   "m-"   -> *minCnt = m, *maxCnt = -1 (no upper bound given)
 *   "m+"   -> *minCnt = m, *maxCnt = -1
 *   "m"    -> *minCnt = *maxCnt = m
 *
 * Each number only has to start with a digit; conversion is done by atoi().
 *
 * @param s       the specification text
 * @param minCnt  receives the minimum count
 * @param maxCnt  receives the maximum count, or -1 when unbounded
 * @return 0 on success, 1 when a number does not start with a digit.
 *         In the "m-n" form *minCnt (and *maxCnt = -1) may already have
 *         been written when the upper bound turns out to be invalid.
 */
int get_range(const std::string& s, int* minCnt, int* maxCnt)
{
  // Positions are kept as size_type: storing the result of find() in a
  // narrower unsigned int makes the comparison with npos always true on
  // platforms where size_type is 64 bits wide.
  const std::string::size_type dashPos = s.find('-');
  if (dashPos != std::string::npos) {
    const std::string lower(s, 0, dashPos);
    if (!beginsWithDigit(lower)) return 1;
    *minCnt = atoi(lower.c_str());

    *maxCnt = -1;    // stays -1 unless an upper bound follows the '-'
    const std::string upper(s, dashPos + 1);
    if (!upper.empty()) {
      if (!beginsWithDigit(upper)) return 1;
      *maxCnt = atoi(upper.c_str());
    }
    return 0;
  }

  const std::string::size_type plusPos = s.find('+');
  if (plusPos != std::string::npos) {
    const std::string lower(s, 0, plusPos);
    if (!beginsWithDigit(lower)) return 1;
    *minCnt = atoi(lower.c_str());
    *maxCnt = -1;
    return 0;
  }

  if (!beginsWithDigit(s)) return 1;
  *minCnt = *maxCnt = atoi(s.c_str());
  return 0;
}
88
89GRXMLDoc::GRXMLDoc()
90{
91    m_NodeKeyWords.insert(make_pair("grammar", NodeTypeGrammar));
92    m_NodeKeyWords.insert(make_pair("rule", NodeTypeRule));
93    m_NodeKeyWords.insert(make_pair("ruleref", NodeTypeRuleReference));
94    m_NodeKeyWords.insert(make_pair("one-of", NodeTypeOneOf));
95    m_NodeKeyWords.insert(make_pair("item", NodeTypeItem));
96    m_NodeKeyWords.insert(make_pair("tag", NodeTypeTag));
97    m_NodeKeyWords.insert(make_pair("count", NodeTypeCount));
98    m_NodeKeyWords.insert(make_pair("meta", NodeTypeMeta));
99    m_pGraph = 0;
100    m_RuleAutoIndex = 0;
101    m_TagAutoIndex = 0;
102    m_LabelAutoIndex = 0;
103    m_ExpandedRulesAutoIndex = 0;
104    m_XMLFileName = "dummy.xml";
105}
106
107
108GRXMLDoc::~GRXMLDoc()
109{
110    deleteRules();
111    if (m_pGraph) {
112        delete m_pGraph;
113    }
114}
115
116
117bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName )
118{
119    m_XMLFileName = xMLFileName;
120    // Set up the internally defined rules, etc.
121    initializeLists();
122    // The top level "document" node is given to this fn
123    // Create the container for the word graph.
124    if (m_pGraph) {
125        delete m_pGraph;
126    }
127    m_pGraph = new Graph("XML grammar");
128    SubGraph *p_SubGraph;
129
130    parseNode( node, p_SubGraph, 1 );     // NB Subgraph pointed to will change in recursive fn.
131
132    if (findSubGraph( m_RootRule, p_SubGraph )) {
133	m_pGraph->ExpandRules (p_SubGraph);
134	p_SubGraph->RemoveInternalConnections ();
135	//Print the root rule.
136	//printSubgraph( *p_SubGraph );
137    }
138    return true;
139}
140
141
142bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
143{
144    // We will create a new subgraph for each rule node.
145    // The "current" subgraph is substituted with the new subgraph for all ops on child nodes.
146    // After processing child nodes the original subgraph is reinstated
147    // for final operations in the endNode() fn.
148
149    // Initial processing of the current node before processing children
150#if 0 && GRXML_DEBUG
151	if(node.Type() == TiXmlNode::ELEMENT)
152		node.ToElement()->Print( stdout, level);
153	else if(node.Type() == TiXmlNode::DOCUMENT)
154		node.ToDocument()->Print( stdout, level);
155	else if(node.Type() == TiXmlNode::TEXT)
156		node.ToText()->Print( stdout, level);
157	else if(node.Type() == TiXmlNode::DECLARATION)
158		node.ToDeclaration()->Print( stdout, level);
159	else {
160		const char* text = node.Value();
161		if(!text) text = "__NULL__";
162		printf("processing node type %d text %s\n", node.Type(), text);
163	}
164#endif
165    beginNode( node, p_SubGraph, level );
166
167    SubGraph *p_LocalSubGraph;
168    p_LocalSubGraph = p_SubGraph;
169	TiXmlNode* child;
170	for( child = node.FirstChild(); child; child = child->NextSibling() )
171    {
172		parseNode ( *child, p_SubGraph, level+1 );
173    }
174    // Revert current node
175    p_SubGraph = p_LocalSubGraph;
176
177    // Finish processing current node
178    endNode( node, p_SubGraph, level );
179
180    return true;
181} // parseNode
182
183
184bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
185{
186    std::string name = node.Value();
187    DEBUG_PRINT("Element = " + name);
188
189    // XMLNode::Type type = node.getType();
190    if ( node.Type() == TiXmlNode::TEXT) // isCData()
191    {
192      const char* cc_name = node.Parent()->Value();
193      std::string str_name(cc_name);
194      DEBUG_PRINT (std::string("CDATA ") + name);
195      DEBUG_PRINT (std::string("CDATA ") + str_name);
196
197      processCDATA( node, p_SubGraph );
198    }
199    else if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() /*isLeaf()*/)
200      {
201	//printNode(node, level);
202	// Use enum value
203	KEYWDPAIR::iterator pos;
204	pos = m_NodeKeyWords.find( name );
205	KeywordValues nodeType = NodeTypeBadValue;
206	if ( pos != m_NodeKeyWords.end() )
207	{
208	    nodeType = (*pos).second;
209	    DEBUG_PRINT("nodeType=" + nodeType);
210	} else if(node.Type() == TiXmlNode::COMMENT) {
211		return true;
212	} else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) {
213		return true;
214	} else {
215	  FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT);
216	}
217
218	switch ( nodeType )
219	{
220	case NodeTypeGrammar:
221	    {
222		beginParseGrammarNode( node );
223	    }
224	    break;
225	case NodeTypeRule:
226	    {
227		// NB This fn creates a new subgraph.
228		beginParseRuleNode( node, p_SubGraph );
229	    }
230	    break;
231	    case NodeTypeRuleReference:
232	    {
233		// NB This fn creates a new subgraph.
234		beginRuleRef( node, p_SubGraph );
235	    }
236	    break;
237	    case NodeTypeOneOf:
238	    {
239		beginOneOf( node, p_SubGraph );
240	    }
241	    break;
242	    case NodeTypeItem:
243	    {
244		beginItem( node, p_SubGraph );
245	    }
246	    break;
247	    case NodeTypeTag:
248	    {
249		beginTag( node, p_SubGraph );
250	    }
251	    break;
252	    case NodeTypeCount:
253	    {
254		beginCount( node, p_SubGraph );
255	    }
256	    break;
257	    case NodeTypeMeta:
258	    {
259	        beginParseMetaNode( node );
260	    }
261	    break;
262	    case NodeTypeBadValue:
263	    default:
264		DEBUG_PRINT( "UNKNOWN node name: " + name );
265	    break;
266	}; // switch
267    } //is a Node or Leaf
268    else if ( node.Type() == TiXmlNode::TEXT) // isCData()
269      {
270	DEBUG_PRINT (std::string("CDATA ") + name);
271	processCDATA( node, p_SubGraph );
272    }
273    return true;
274} // beginNode()
275
276
277bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
278{
279    std::string name = node.Value();
280    //XMLNode::Type type = node.getType();
281
282    if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() )
283    {
284	KEYWDPAIR::iterator pos;
285	pos = m_NodeKeyWords.find( name );
286	KeywordValues nodeType = NodeTypeBadValue;
287	if ( pos != m_NodeKeyWords.end() )
288	{
289	    nodeType = (*pos).second;
290	}  else if(node.Type() == TiXmlNode::COMMENT) {
291		return true;
292	} else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) {
293		return true;
294	} else if(node.Type() == TiXmlNode::TEXT) {
295
296	} else {
297	  FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT );
298	}
299
300	switch ( nodeType )
301	{
302	case NodeTypeGrammar:
303	{
304	    endParseGrammarNode( node );
305	}
306	break;
307	case NodeTypeRule:
308	{
309	    endParseRuleNode( node, p_SubGraph );
310	}
311	break;
312	case NodeTypeRuleReference:
313	{
314	    endRuleRef( node, p_SubGraph );
315	}
316	break;
317	case NodeTypeOneOf:
318	{
319	    endOneOf( node, p_SubGraph );
320	}
321	break;
322	case NodeTypeItem:
323	{
324	    endItem(node, p_SubGraph );
325	}
326	break;
327	case NodeTypeTag:
328	{
329	    endTag( node, p_SubGraph );
330	}
331	break;
332	case NodeTypeCount:
333	{
334	    endCount( node, p_SubGraph );
335	}
336	break;
337        case NodeTypeMeta:
338	{
339            endParseMetaNode( node );
340	}
341	break;
342	case NodeTypeBadValue:
343	default:
344	    DEBUG_PRINT( "UNKNOWN node name: ");
345	    DEBUG_PRINT( name.c_str() );
346	//Extend the
347	break;
348	}; // switch
349    } //isNode() or isLeaf()
350    else
351    {
352	// Do nothing?
353    }
354    return true;
355} // endNode()
356
357
358bool GRXMLDoc::beginParseGrammarNode(XMLNode &node)
359{
360	const char* attr;
361#define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:""
362	m_XMLMode      = GETATTR("mode");
363	m_XMLLanguage  = GETATTR("xml:lang");
364    m_RootRule     = GETATTR("root");	// The root rule name
365
366    DEBUG_PRINT("Root rule = " + m_RootRule);
367
368    m_XMLTagFormat = GETATTR("tag-format");
369    m_XMLVersion   = GETATTR("version");
370    m_XMLBase      = GETATTR("xml:base");
371    return true;
372}
373
374bool GRXMLDoc::beginParseMetaNode(XMLNode &node)
375{
376  const char* attr;
377  std::string meta_name  = GETATTR("name");
378  std::string meta_value = GETATTR("content");
379
380  if(meta_name == "word_penalty") {
381    m_MetaKeyValPairs.insert(meta_name,meta_value);
382    // m_MetaKeyValPairs.print();
383  } else if(meta_name == "do_skip_interword_silence") {
384    for(int j = 0; j<(int)meta_value.size(); j++){
385      meta_value[j] = tolower(meta_value[j]); //lower();
386    }
387    if(meta_value!="true" && meta_value!="false")
388      printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str());
389    else
390      m_MetaKeyValPairs.insert(meta_name,meta_value);
391  } else if(meta_name == "userdict_name") {
392    printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
393  } else {
394    printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
395  }
396  return true;
397}
398
399
400bool GRXMLDoc::endParseGrammarNode(XMLNode &node)
401{
402    // End parse operations
403    return true;
404}
405
406
407bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph)
408{
409	const char* attr;
410    // Note: The subGraph may change if there are forward references. This
411    // is fine as we revert to the previous one when finished parsing the current node.
412    DEBUG_PRINT ( "---- Rule\n" );
413    std::string ruleName = GETATTR("id" );
414    std::string s_tag    = GETATTR("tag" );
415    if( s_tag.length()>0) {
416      FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
417    }
418    CHECK_NOT_EMPTY( ruleName, "id" );
419    // Rule name must be unique within scope of entire grammar.
420    // Put rule on stack - for context
421    m_RuleListStack.push( ruleName );
422
423    // Check whether a ruleref placeholder exists for this rule.
424    int index;
425    bool foundRule = findRuleIndex( ruleName, index );
426    if (foundRule) {
427	// Rule is already declared; it must have been forward referenced
428	// so swap the placeholder subgraph in.
429	// NB subgraph and rule name are already known to lists.
430	SubGraph *p_ExistingSubgraph;
431	if ( findSubGraph( ruleName, p_ExistingSubgraph ) ) {
432	    p_SubGraph = p_ExistingSubgraph;
433	}
434	else {
435	    FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1);
436        }
437    }
438    else {
439	// Create a Word Graph node for each rule node
440	SubGraph *newGraph;
441	addRuleToList( ruleName, newGraph );
442	p_SubGraph = newGraph;
443    }
444
445    // Make a note of the scope or rules; public, etc - used in map file.
446    findRuleIndex( ruleName, index );
447    std::string ruleScope = GETATTR("scope" );
448    if ( !ruleScope.empty() ) {
449        m_RuleScope.insert(index, ruleScope);
450    }
451
452    // We must accommodate Rules that have CDATA without an <item> element.
453    // We need to infer this element for all rules.
454    m_pGraph->BeginItem( p_SubGraph );
455
456    PRINT_EXPRESSION( ruleName + " = { " );
457    return true;
458} // beginParseRuleNode()
459
460
461bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph )
462{
463    // The rule expression has been built as a subgraph and ID added to the rule list.
464    // Finished editing subgraph
465    DEBUG_PRINT ( "---- /Rule\n" );
466    //m_pGraph->EndRule(&p_SubGraph);
467    // Tell the world
468    //std::string ruleName = attr.get( "id" );
469    std::string ruleName = m_RuleListStack.top();
470    m_RuleListStack.pop();
471    //CHECK_NOT_EMPTY( ruleName, "id" );
472    // Must be unique rule name within scope of entire grammar.
473    // Check whether a ruleref placeholder exists for this rule.
474    m_pGraph->addSubGraph ( p_SubGraph );
475
476    // We must accommodate Rules that have CDATA without an <item> element.
477    // We need to infer this element for all rules.
478    m_pGraph->EndItem( p_SubGraph );
479
480    PRINT_EXPRESSION( " }\n" );
481    return true;
482}
483
484bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph )
485{
486    // Note the Item's CDATA
487    // Strip leading and trailing whitespace
488    const char* cc_name = node.Parent()->Value();
489    std::string str_name(cc_name); // = node.Parent()->ValueStr(); // getName
490    // std::string name = node.Parent()->Value(); // getName
491    //if ( name == "item" ) {
492    if ( str_name != "tag" ) {
493
494	const char* const whitespace = " \t\r\n\v\f";
495	std::string cdata = node.Value(); // getCData()
496	std::string word; // Words are whitespace separated
497
498	cdata.erase(0, cdata.find_first_not_of(whitespace) );
499	cdata.erase(cdata.find_last_not_of(whitespace) + 1);
500#if GRXML_DEBUG
501        std::cout << "/--" << cdata << "--/\n";
502#endif
503
504	std::string::size_type begIdx, endIdx;
505
506        //search beginning of the first word
507        begIdx = cdata.find_first_not_of(whitespace);
508
509        //while beginning of a word found
510	while (begIdx != std::string::npos) {
511            //search end of the actual word
512            endIdx = cdata.find_first_of (whitespace, begIdx);
513            if (endIdx == string::npos) {
514                //end of word is end of line
515                endIdx = cdata.length();
516            }
517            word.clear();
518	    // word.assign(cdata,begIdx,endIdx);
519	    word.append (cdata, begIdx, endIdx - begIdx);
520	    if ( !word.empty() )
521	    {
522#if GRXML_DEBUG
523		std::cout << " -->" << word << "<--\n";
524#endif
525		int index;
526		// If a slot then take note of rule name
527		if ( IsSlot( word ) ) {
528		  const char* xmlBasename;
529		  std::string ruleName = m_RuleListStack.top();
530		  m_SlotList.insert(index, ruleName);
531		  xmlBasename = strrchr(m_XMLFileName.c_str(),'/');
532		  xmlBasename = xmlBasename ? xmlBasename+1 : m_XMLFileName.c_str();
533		  word = (std::string)xmlBasename + "." + ruleName + "@" + word;
534		  addLabelToList( word );
535		  findLabelIndex( word, index );
536		} else {
537		  addLabelToList( word );
538		  findLabelIndex( word, index );
539		}
540		m_pGraph->AddLabel( p_SubGraph, index );
541	    }
542	    begIdx = cdata.find_first_not_of (whitespace, endIdx);
543
544	}
545    } //tag
546    else {
547	// Do nothing with CDATA for elements that are not items.
548	// In particular, do not strip whitespace from tag cdata.
549	// However, CPPDOM appears to remove linefeeds. May need to tidy up.
550
551    }
552    return true;
553} // cdata
554
555bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph )
556{
557	const char* attr;
558    DEBUG_PRINT ("---- Item:\n");
559    // First check whethere there is a count/repeat
560    std::string s     = GETATTR("repeat" );
561    int minCnt=0,maxCnt=0;
562    std::string s_tag = GETATTR("tag" );
563    if( s_tag.length()>0) {
564      FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
565    }
566    if( s.length()>0 && get_range( s, &minCnt, &maxCnt) ) {
567      FATAL_ERROR(std::string("error: while parsing range ") + s,1);
568    }
569    if ( !s.empty() ) {
570      // RED FLAG: max should not be 0! A +ve number should have been given.
571      if( maxCnt>0) {
572	m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
573      }
574      else {
575	// NB: BeginItemRepeat  can only use min of 0 or 1!
576	m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
577      }
578    }
579    else {
580	m_pGraph->BeginItem( p_SubGraph );
581    }
582    return true;
583}
584
585
586bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph )
587{
588    DEBUG_PRINT ( "---- /Item\n" );
589
590    // What TODO if no tag for an item?
591
592    m_pGraph->EndItem( p_SubGraph );
593    return true;
594}
595
596
597bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph )
598{
599    // Extend word FST node with an entire FST subgraph.
600    // Forward referencing of rules is supported.
601    // NB Remove the leading # from the ruleref name!
602    DEBUG_PRINT ( "---- Ruleref\n" );
603
604	const char* attr;
605    std::string s_tag = GETATTR("tag" );
606    if( s_tag.length()>0) {
607      FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
608    }
609    std::string s = GETATTR("uri" );
610    if (s.empty())
611    {
612	//
613	FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
614    }
615    // Remove the #:
616    int p1 = s.find("#");
617    if ( p1 !=0 ) {
618	FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 );
619    }
620    string ruleName;
621    getRuleRefName( node, ruleName );
622
623    //std::string parentRuleName = m_RuleListStack.top();
624    //addRuleDependency( parentRuleName, ruleName );
625
626    int index;
627    bool foundRule = findRuleIndex( ruleName, index );
628    if (!foundRule) {
629	// Forward reference; create a placeholder subgraph ptr.
630	//SubGraph *newGraph = new SubGraph( (char *) ruleName.c_str() );
631	// RED FLAG:  Remember to check fwd ref rule was filled in at end.
632	SubGraph *newGraph;
633	addRuleToList( ruleName, newGraph );
634	findRuleIndex( ruleName, index );
635    }
636    // We can now treat a forward-referenced graph as if it was defined.
637    // We will add the subgraph when we have the tag - see endItem().
638    m_pGraph->BeginRule( p_SubGraph );
639    m_pGraph->AddRuleRef( p_SubGraph, index );
640    m_pGraph->EndRule( p_SubGraph );
641
642    return true;
643}
644
645
646bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph )
647{
648    DEBUG_PRINT ( "---- /Ruleref\n" );
649    // Does nothing
650    // NB The tag is not under the ruleref element - it is in the current item element.
651    // We now add the tag of the AddRuleRef as we see the tag element. See EndTag().
652
653    return true;
654}
655
656
657bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
658{
659    DEBUG_PRINT ( "----OneOf\n" );
660    m_pGraph->BeginOneOf (p_SubGraph);
661    return true;
662}
663
664
665bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
666{
667    DEBUG_PRINT ( "----/OneOf\n" );
668    m_pGraph->EndOneOf (p_SubGraph);
669    return true;
670}
671
672
673bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph )
674{
675    DEBUG_PRINT ("---- Tag\n");
676    std::string s = node.ToElement()->GetText(); // getCdata();
677#if GRXML_DEBUG
678    std::cout << s;     // debug
679#endif
680    // Store the semantic tag info.
681    // NB Do not strip whitespace from tag cdata
682    if ( !s.empty() )
683    {
684	int index;
685	addTagToList( s );
686	findTagIndex( s, index );
687	m_pGraph->AddTag ( p_SubGraph, index );
688    }
689
690    return true;
691}
692
693
694bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph )
695{
696    DEBUG_PRINT ("---- /Tag\n");
697    return true;
698}
699
700
701bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph )
702{
703	const char* attr;
704    // Count of reps applies to the text elements in this count node
705    DEBUG_PRINT ("---- Count\n");
706    // Get number attr
707    std::string s     = GETATTR("number");
708    std::string s_tag = GETATTR("tag" );
709    if( s_tag.length()>0) {
710      FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1)
711    }
712    if (s.empty()) {
713		return false;
714    }
715    // not  in subgraph but in graph?!
716    //graph.BeginCount(n);
717
718    int minCnt=-1, maxCnt=-1;
719    if( get_range( s, &minCnt, &maxCnt) ) {
720      FATAL_ERROR(std::string("error: while parsing range ") + s,1);
721    }
722    if ( s.c_str() == std::string("optional") )
723    {
724	m_pGraph->BeginOptional( p_SubGraph );
725    }
726    else if ( minCnt>0 && maxCnt>0)
727    {
728	m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
729    }
730    else if( minCnt>0 )
731      {
732	m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
733      }
734    else { //
735    	m_pGraph->BeginOptional ( p_SubGraph );
736    }
737
738    return true;
739}
740
741
742bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph )
743{
744    DEBUG_PRINT ("---- /Count\n");
745    m_pGraph->EndCount( p_SubGraph );
746    return true;
747}
748
749bool GRXMLDoc::endParseMetaNode(XMLNode &node)
750{
751  // End parse operations
752  return true;
753}
754
755void GRXMLDoc::printNode(XMLNode &node, int level)
756{
757    std::string name = node.Value();
758    int type = node.Type();
759    std::string c_data;
760
761    for(int i=0;i<level;i++) std::cout << " ";
762
763    char c = ' ';
764    switch(type)
765    {
766    case TiXmlNode::ELEMENT:
767	// case XMLNode::xml_nt_node: // grammar, rule, one-of, item, count
768	 c = '+';
769	 break;
770	/* case TiXmlNode::TEXT:
771	// case XMLNode::xml_nt_leaf:
772	c = '-';
773	break; */
774    case TiXmlNode::DOCUMENT:
775    // case XMLNode::xml_nt_document:
776	c = '\\';
777	break;
778    case TiXmlNode::TEXT:
779    // case XMLNode::xml_nt_cdata:
780	c = '#';
781	c_data = node.Value(); // getCdata();
782	break;
783	case TiXmlNode::UNKNOWN:
784	case TiXmlNode::COMMENT:
785	case TiXmlNode::TYPECOUNT:
786	case TiXmlNode::DECLARATION:
787	default:
788		std::cout << "Error: not sure what to do here" << std::endl;
789		break;
790    }
791	if(node.Type() == TiXmlNode::TEXT)  // isCData()
792	  std::cout << c << name.c_str() << "[" << c_data << "]" << std::endl;
793	//Extend the tag hashtable
794    else
795	  std::cout << c << name.c_str() << std::endl;
796
797	if( node.Type() == TiXmlNode::ELEMENT) {
798
799		for(TiXmlAttribute* attr=node.ToElement()->FirstAttribute();
800			attr; attr=attr->Next() ) {
801
802		  // guru: added output of attributes
803			for (int i=0; i<level; i++)
804				std::cout << " ";
805			std::cout << "   ";
806			std::cout << attr->Name() << ": " << attr->Value() << std::endl;
807		}
808	}
809
810}
811
812/** Function: addRuleToList
813    Extends list of SubGraphs with given subGraph
814    and extends list of rule names too.
815    TODO: Can we use one hash and use internal numeric index for rule IDs?
816*/
817
818
819bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph)
820{
821    int index;
822    if ( findRuleIndex ( ruleName, index ) ) {
823	FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 );
824    }
825
826    addLabelToList( m_XMLFileName + "@" + ruleName);
827    findLabelIndex( m_XMLFileName + "@" + ruleName, index );
828#if GRXML_DEBUG
829    std::cout << "Rule " << ruleName << std::endl;
830#endif
831    // Create the new subgraph and update lists
832    m_RuleList.insert( ruleName, index );
833    p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index );
834
835    bool success = m_SubgraphList.insert( ruleName, p_SubGraph );
836    if (!success) {
837	FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 );
838    }
839#if ADD_BRACES
840    addLabelToList( "{" );
841    std::stringstream  ss;
842    ss << "}(" << index << ")";
843    addLabelToList( ss.str());
844#endif
845    return success;
846}
847
848
849bool GRXMLDoc::deleteRules()
850{
851    // Delete all allocated subgraphs.
852    // The rule strings are part of the hashtables and get deleted by them.
853    int index;
854    SubGraph *p_SubGraph;
855    std::string ruleName;
856    while ( !m_RuleList.isEmpty() ) {
857	m_RuleList.getFirst( &ruleName, &index );
858	m_RuleList.remove( ruleName );
859	if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) {
860	    delete p_SubGraph;
861	}
862	else {
863	    FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1);
864	}
865    }
866    m_SubgraphList.clear();
867    m_RuleList.clear();
868    m_LabelList.clear();
869    m_TagList.clear();
870    return true;
871}
872
873bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph)
874{
875    return m_SubgraphList.getValue(s, &p_SubGraph);
876}
877
878bool GRXMLDoc::findRule(int i, std::string &s )
879{
880    return m_RuleList.getIndex( i, &s );
881}
882
883bool GRXMLDoc::findTag(int i, std::string &s )
884{
885    return m_TagList.getValue( i, &s );
886}
887
888bool GRXMLDoc::findLabel(int i, std::string &s )
889{
890    return m_LabelList.getValue( i, &s );
891}
892
893bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s )
894{
895    return m_SubgraphList.getIndex( p_SubGraph, &s );
896}
897
898bool GRXMLDoc::findRuleIndex( std::string s, int &i )
899{
900    return m_RuleList.getValue( s, &i );
901}
902bool GRXMLDoc::findTagIndex( std::string s, int &i )
903{
904    return m_TagList.getIndex( s, &i );
905}
906bool GRXMLDoc::findLabelIndex( std::string s, int &i )
907{
908    return m_LabelList.getIndex( s, &i );
909}
910bool GRXMLDoc::findMeta(const std::string & sn, std::string &s)
911{
912    return m_MetaKeyValPairs.getValue( sn, &s );
913}
914bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s)
915{
916  std::string tmp;
917  if(findMeta(sn,tmp))
918    m_MetaKeyValPairs.remove(sn);
919  return m_MetaKeyValPairs.insert(sn,s);
920}
921
922bool GRXMLDoc::addTagToList( std::string const& s )
923{
924    bool success = true;
925    // Make values unique
926    int index;
927    if ( !findTagIndex( s, index ) )
928	success = m_TagList.insert( m_TagAutoIndex++, s );
929    return success;
930}
931
932
933bool GRXMLDoc::addLabelToList( std::string const& s )
934{
935  // TODO: Labels should be unique. Change key.
936  int index;
937  bool bRes = m_LabelList.getIndex( s, &index );
938  if(bRes == true) {
939    return false; // exists
940  }
941  bRes = m_LabelList.insert( m_LabelAutoIndex++, s );
942  return  bRes;
943}
944
945void GRXMLDoc::printLists()
946{
947    m_SubgraphList.print();
948    m_RuleList.print();
949    m_TagList.print();
950    m_LabelList.print();
951}
952
953
954void GRXMLDoc::printSubgraphs()
955{
956    SubGraph *p_SubGraph;
957    std::string rule;
958    int index;
959    if ( m_RuleList.getFirst( &rule, &index) ) {
960	if ( findSubGraph( rule, p_SubGraph ) ) {
961	    DEBUG_PRINT("============ Rule: " + rule + "============");
962	    printSubgraph( *p_SubGraph );
963	    while ( m_RuleList.getNext( &rule, &index) ) {
964		if ( findSubGraph( rule, p_SubGraph ) ) {
965		    printSubgraph( *p_SubGraph );
966		}
967	    }
968	}
969    }
970}
971
972
973void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph )
974{
975    p_SubGraph.PrintWithLabels( *this );
976}
977
978
979bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName)
980{
981  const char* attr;
982  std::string s = GETATTR("uri" );
983  if (s.empty()) {
984    FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
985  }
986  // Remove the #:
987  int p1 = s.find("#");
988  if ( p1 !=0 ) {
989    FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'", -1 );
990  }
991  ruleName.assign( s, 1, s.size() );
992  return true;
993}
994
995void GRXMLDoc::initializeLists()
996{
997  m_SubgraphList.setName("Subgraphs");
998  m_RuleList.setName("Rules");
999  m_TagList.setName("Tags");
1000  m_LabelList.setName("Labels");
1001
1002  /* Predefined rules. NB Labels are also created for each rule added.
1003  // The required order for these labels in the .map output file is:
1004  //     0   eps
1005  //     next come slots
1006  //     pau and pau2
1007  //     everything else
1008  // We will add all these now in case they are referenced and we will
1009  // reindex after we have parsed the grammar -- when we have the list
1010  // of slots. This re-indexing is for the output files .map and .P.txt.
1011  //
1012  */
1013    addLabelToList( "eps" );
1014
1015    addLabelToList( "-pau-" );
1016    addLabelToList( "-pau2-" );
1017}
1018
1019void GRXMLDoc::writeMapFile( std::string & fileName )
1020{
1021    // We need to re-index in order to put the labels in correct order:
1022    // 1. eps
1023    // 2. all slots
1024    // 3. all rules
1025    // 4. -pau- words
1026    // 5. remaining labels
1027    ofstream outfile;
1028    int index, origIndex;
1029    std::string label;
1030    std::string slotRuleName;
1031    std::string scope; // For rules
1032    HashMap<int,std::string> orderedList;
1033    int orderedIndex=0;
1034    // 1. eps
1035    orderedList.insert( orderedIndex++, "eps" );
1036
1037    // 2. slots
1038    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1039	if ( IsSlot( label ) ) {
1040	    orderedList.insert( orderedIndex++, label );
1041	}
1042	while (m_LabelList.getNext( &origIndex, &label ) ) {
1043	    if ( IsSlot( label ) ) {
1044		orderedList.insert( orderedIndex++, label );
1045	    }
1046	}
1047    }
1048
1049    // 3.  Now rules, or anything with @
1050    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1051	do {
1052#if GRXML_DEBUG
1053	    std::cout << label << " "<< label.find_first_of ("@") << std::endl;
1054#endif
1055            if (!IsSlot(label) && label.find_first_of ("@") != string::npos) {
1056#if GRXML_DEBUG
1057		std::cout << "    Adding " << label << std::endl;
1058#endif
1059		orderedList.insert( orderedIndex++, label );
1060	    }
1061	} while (m_LabelList.getNext( &origIndex, &label ) );
1062    }
1063
1064    // 4. pau
1065    orderedList.insert( orderedIndex++, "-pau-" );
1066    orderedList.insert( orderedIndex++, "-pau2-" );
1067
1068    // 5. Remaining stuff. NB We depend upon the label not
1069    //    being added twice.
1070    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1071	if ( !orderedList.getIndex( label, &index ) ) {
1072	  orderedList.insert( orderedIndex++, label );
1073	}
1074	while (m_LabelList.getNext( &origIndex, &label ) ) {
1075	    if ( !orderedList.getIndex( label, &index ) ) {
1076	      orderedList.insert( orderedIndex++, label );
1077	    }
1078	}
1079    }
1080    outfile.open ( fileName.c_str() );
1081
1082    bool bRes = orderedList.getFirst( &index, &label );
1083    do {
1084      if(!bRes) break;
1085      // Look up scope using original index
1086      m_LabelList.getIndex( label, &origIndex );
1087      if (m_RuleScope.getValue(origIndex, &scope) )
1088	label = scope + ":" + label;
1089      outfile << label << " " << index << std::endl;
1090      bRes = orderedList.getNext( &index, &label );
1091    } while(bRes);
1092
1093    outfile.close();
1094}
1095
1096
1097void GRXMLDoc::writeScriptFile( std::string & fileName )
1098{
1099    ofstream outfile;
1100    int index;
1101    std::string label;
1102    outfile.open ( fileName.c_str() );
1103    if ( m_TagList.getFirst( &index, &label ) ) {
1104    	outfile << index << " " << label << std::endl;
1105    }
1106    while (m_TagList.getNext( &index, &label ) ) {
1107    	outfile << index << " " << label << std::endl;
1108    }
1109    outfile.close();
1110
1111    //m_LabelList.writeFile( fileName );
1112}
1113
1114void GRXMLDoc::writeParamsFile( std::string & fileName )
1115{
1116  std::string wtw;
1117  ofstream outfile;
1118  bool bRes;
1119
1120  outfile.open(fileName.c_str());
1121
1122  std::string metaname = "word_penalty";
1123  bRes = findMeta(metaname, wtw);
1124  if(bRes)
1125    outfile << metaname.c_str() << "\t=\t" << wtw.c_str() << std::endl;
1126
1127  // outfile << "locale"  << "\t=\t" << m_XMLLanguage << std::endl;
1128  outfile.close();
1129}
1130
1131void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs)
1132{
1133    SubGraph *p_SubGraph;
1134    SubGraph *p_SemGraph;
1135    std::string fileName;
1136    if ( !findSubGraph( m_RootRule, p_SubGraph ) ) {
1137	FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 );
1138    }
1139
1140    //  Create .P.txt
1141    printf ("\nCreating semantic graph file\n");
1142    p_SemGraph = new SubGraph( (char *) "Main", -1);
1143    m_pGraph->BeginRule( p_SemGraph );
1144    m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId());
1145    m_pGraph->EndRule( p_SemGraph );
1146    m_pGraph->ExpandRules (p_SemGraph);
1147    p_SemGraph->RemoveInternalConnections ();
1148
1149    p_SemGraph->AddTerminalConnections ();
1150    p_SemGraph->ReduceArcsByEquivalence();
1151    p_SemGraph->RemoveUnreachedConnections (-1, -1);
1152    p_SemGraph->DeterminizeArcs();
1153    p_SemGraph->RemoveUnreachedConnections (-1, -1);
1154    p_SemGraph->ReduceArcsByEquivalence();
1155    p_SemGraph->RemoveUnreachedConnections (-1, -1);
1156    fileName = prefix + ".P.txt";
1157    p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this );
1158    delete p_SemGraph;
1159
1160    fileName = prefix + ".omap";
1161    this->WriteOLabels(fileName);
1162}
1163
1164void GRXMLDoc::sortLabels()
1165{
1166    // We need to re-index in order to put the labels in correct order:
1167    int index=0, origIndex;
1168    std::string label;
1169    std::string slotRuleName;
1170    std::string scope; // For rules
1171    std::vector <std::string> orderedList;
1172    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
1173        // Look up scope using original index
1174        orderedList.push_back( label );
1175        while (m_LabelList.getNext( &origIndex, &label ) ) {
1176            orderedList.push_back( label );
1177        }
1178    }
1179    std::sort(orderedList.begin(), orderedList.end() );
1180    m_SortedLabelList.clear();
1181    index=0;
1182    for (std::vector<std::string>::const_iterator citer = orderedList.begin();
1183     citer != orderedList.end(); ++citer) {
1184        label = *citer;
1185        m_LabelList.getIndex( label, &origIndex );
1186        m_SortedLabelList.insert( index, label );
1187        index++;
1188        // std::cout <<"Sorted: " << index <<" " << label <<std::endl;
1189    }
1190    return;
1191}
1192
1193bool GRXMLDoc::findSortedLabel(int i, std::string &s )
1194{
1195    if (m_SortedLabelList.isEmpty() ) {
1196        sortLabels(); // Create the sorted label list.
1197    }
1198    return m_SortedLabelList.getValue( i, &s );
1199}
1200
1201bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex )
1202{
1203    std::string s;
1204    if (m_SortedLabelList.isEmpty() ) {
1205        sortLabels(); // Create the sorted label list.
1206    }
1207    if ( m_LabelList.getValue( i, &s ) ) {
1208        if ( m_SortedLabelList.getIndex(s, &sortedIndex )) {
1209            return true;
1210        }
1211    }
1212    return false;
1213}
1214
1215void GRXMLDoc::addOLabelToOList( std::string &s)
1216{
1217    m_OutputPtxtLabels.insert( s, 0);
1218}
1219
1220bool GRXMLDoc::WriteOLabels(const std::string& fileName)
1221{
1222  HashMap<int,std::string> invMap;
1223  int count = 0;
1224  int max_script_label = 0;
1225  int scriptID = 0;
1226  std::map<std::string, int>::iterator iter;
1227  bool bFound;
1228  int tmp;
1229
1230  std::string strIndex = "eps";
1231  bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
1232  if(bFound)
1233    m_OutputPtxtLabels.remove(strIndex);
1234  m_OutputPtxtLabels.insert(strIndex, count);
1235  invMap.insert( count, strIndex);
1236  count++;
1237
1238  strIndex = "{";
1239  bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
1240  if(bFound)
1241    m_OutputPtxtLabels.remove(strIndex);
1242  m_OutputPtxtLabels.insert(strIndex, count);
1243  invMap.insert( count, strIndex);
1244  count++;
1245
1246  iter = m_OutputPtxtLabels.begin();
1247  for( ; iter!=m_OutputPtxtLabels.end(); iter++) {
1248    const char* label = iter->first.c_str();
1249    if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)
1250	&& strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) {
1251      scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN);
1252      if(max_script_label < scriptID)
1253	max_script_label = scriptID;
1254    }/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) {
1255      invMap.insert(count, iter->first);
1256      iter->second = count;
1257      count++;
1258      }*/
1259    else if(!invMap.getIndex((iter->first), &tmp)){
1260      invMap.insert(count, iter->first);
1261      iter->second = count;
1262      count++;
1263    }
1264  }
1265
1266  cout << "found max_script_label " << max_script_label << endl;
1267  for(int j=0; j<=max_script_label; j++) {
1268    std::stringstream ss;
1269    ss << SCRIPT_LABEL_PREFIX << j;
1270    if(!invMap.getIndex( ss.str(), &tmp)) {
1271      invMap.insert( count++, ss.str());
1272    }
1273  }
1274
1275  std::ofstream outfile(fileName.c_str());
1276  std::string outscript;
1277  if(!outfile) {
1278    FATAL_ERROR( "Error: opening the omap file for output", 1);
1279    WARNING( "Error: opening the omap file for output");
1280    return 1;
1281  }
1282  for(int i=0; i<count; i++) {
1283    outscript = "";
1284    invMap.getValue(i,&outscript);
1285    if(outscript.length() == 0) {
1286      cout << "error: internal error while making .omap " << i << endl;
1287      FATAL_ERROR("error",1);
1288    }
1289    outfile << outscript.c_str() << " " << i << std::endl;
1290  }
1291  outfile.close();
1292  return 0;
1293}
1294