1/*
2 ******************************************************************************
3 * Copyright (C) 2005, International Business Machines Corporation and   *
4 * others. All Rights Reserved.                                               *
5 ******************************************************************************
6 */
7/*
  WBNF, Weighted BNF, is an extended BNF. The main difference between WBNF
  and standard BNF is that WBNF accepts weights for its alternation items.
  The weight specifies the chance that an item will be selected.
11
12  The purpose of WBNF is to help generate a random string from a given grammar
13  which can be described with standard BNF. The introduction of 'weight'
14  is to guide the generator to give the specific parts different chances to be
15  generated.
16
17  Usually, the user gives LanguageGenerator the grammar description in WBNF,
18  then LanguageGenerator will generate a random string on every next() call.
19  The return code of parseBNF() can help user to determine the error,
20  either in the grammar description or in the WBNF parser itself.
21
22
23  The grammar of WBNF itself can be described in standard BNF,
24
25    escaping        = _single character with a leading back slash, either inside or outside quoting_
26    quoting         = _quoted with a pair of single quotation marks_
27    string          = string alphabet | string digit | string quoting | string escaping |
28                      alphabet | quoting | escaping
29    alphabet        =
30    digit           =
31    integer         = integer digit | digit
32    weight          = integer %
33    weight-list     = weight-list weight | weight
34    var             = var alphabet | var digit | $ alphabet
35
36    var-defs        = var-defs var-def | var-def
37    var-def         = var '=' definition;
38
39    alternation     = alternation '|' alt-item | alt-item
40    alt-item        = sequence | sequence weight
41
42    sequence        = sequence modified | modified
43
44    modified        = core | morph | quote | repeat
45    morph           = modified ~
46    quote           = modified @
47    repeat          = modified quantifier | modified quantifier weight-list
48    quantifier      = ? | * | + | { integer , integer} | {integer, } | {integer}
49
50    core            = var | string | '(' definition ')'
51
52    definition      = core | modified | sequence | alternation
53    definition      = alternation
54
55    Remarks:
56    o Following characters are literals in preceding definition
57      but are syntax symbols in WBNF
58
59      % $ ~ @ ? * + { } ,
60
    o Following characters are syntax symbols in preceding definition
              (space) concatenation operation, or separators to increase readability
63      =       definition
64      |       selection operation
65      ( )     precedence select
66      ' '     override special-character to plain character
67
    o the definitions of 'escaping' and 'quoting' are given as descriptive text in the preceding definition
    o infinite is actually a predefined value PSEUDO_INFINIT defined in this file
    o if weight is not present in 'alt-item' and 'repeat',
      a default weight DEFAULT_WEIGHT defined in this file is used
72
73    o * == {0,  }
74      + == {1,  }
75      ? == {0, 1}
76
    o the weight-list for repeat assigns the weights for repeated items one by one
78
79      demo{1,3} 30% 40% 100%  ==  (demo)30% | (demodemo)40% | (demodemodemo)100%
80
      For more explanation of the weight-list, please see the LIMITATION of the grammar
82
83    o but the weight-list for question mark has different meaning
84
85      demo ? 30%   != demo{0,1} 30% 100%
86      demo ? 30%   == demo{0,1} 70% 30%
87
88      the 70% is calculated from (DEFAULT_WEIGHT - weight)
89
90
91  Known LIMITATION of the grammar
    For 'repeat', the parser will consume as many weights as possible at one time,
    discard superfluous weights if there are too many, and
    fill in missing weights with the default weight if there are too few.
    This behavior means the following definitions are equal
96
97        demo{1,3} 30% 40% 100%
98        demo{1,3} 30% 40% 100% 50%
99        demo{1,3} 30% 40%
100
101    This behavior will cause a little confusion when defining an alternation
102
103        demo{1,3} 30% 40% 100% 50% | show 20%
104
105    is interpreted as
106
107        (demo{1,3} 30% 40% 100%) 100% | show 20%
108
109    not
110
111        (demo{1,3} 30% 40% 100%) 50% | show 20%
112
113    to get an expected definition, please use parentheses.
114
  Known LIMITATION of the current implementation
    Due to the well-known pointer aliasing problem, the current parser will
    crash if the definition looks like
118
119        $a = demo;
120        $b = $a;
121        $c = $a;
122    or
123        $a = demo;
124        $b = $a $a;
125    or
126        $a = demo;
127        $b = $b $a;
128
    The crash will occur at the delete operation in the destructor or other memory release code.
    Several plans are on hand to fix the problem: use a smart pointer with reference counting,
    or use a central memory management solution. But for now, it works well with the collation
    monkey test, which is the only user of WBNF.
133*/
134
135#ifndef _WBNF
136#define _WBNF
137
138#include "unicode/utypes.h"
139
/* Weight applied when a grammar item carries no explicit weight
   (see the remarks on 'alt-item' and 'repeat' in the comment above). */
static const int DEFAULT_WEIGHT = 100;

/* Stand-in value used where the grammar description speaks of "infinite". */
static const int PSEUDO_INFINIT = 200;
142
143class LanguageGenerator_impl;
144
145class LanguageGenerator{
146    LanguageGenerator_impl * lang_gen;
147public:
148    enum PARSE_RESULT {OK, BNF_DEF_WRONG, INCOMPLETE, NO_TOP_NODE};
149    LanguageGenerator();
150    ~LanguageGenerator();
151    PARSE_RESULT parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug=FALSE);
152    const char *next(); /* Return a null-terminated c-string. The buffer is owned by callee. */
153};
154
/* Declared here and defined elsewhere; takes no arguments and returns nothing.
   NOTE(review): presumably the test entry point for the WBNF generator -
   confirm against the definition. */
void TestWbnf(void);
156
157#endif /* _WBNF */
158