1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ********************************************************************** 3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Copyright (C) 1999-2008, International Business Machines 4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Corporation and others. All Rights Reserved. 5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ********************************************************************** 6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Date Name Description 7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 11/17/99 aliu Creation. 8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ********************************************************************** 9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h" 12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if !UCONFIG_NO_TRANSLITERATION 14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uobject.h" 16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/parseerr.h" 17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/parsepos.h" 18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/putil.h" 19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uchar.h" 20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ustring.h" 21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uniset.h" 22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "cstring.h" 23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "funcrepl.h" 24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "hash.h" 25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "quant.h" 26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "rbt.h" 27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "rbt_data.h" 28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "rbt_pars.h" 29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "rbt_rule.h" 30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "strmatch.h" 31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "strrepl.h" 32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/symtable.h" 33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "tridpars.h" 34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uvector.h" 35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "hash.h" 36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "util.h" 37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "cmemory.h" 38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uprops.h" 39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "putilimp.h" 40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Operators 42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/ 43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define FORWARD_RULE_OP ((UChar)0x003E) /*>*/ 44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define REVERSE_RULE_OP ((UChar)0x003C) /*<*/ 45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op 46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Other special characters 48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define QUOTE ((UChar)0x0027) /*'*/ 49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ESCAPE ((UChar)0x005C) /*\*/ 50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define END_OF_RULE ((UChar)0x003B) /*;*/ 51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/ 52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define SEGMENT_OPEN ((UChar)0x0028) /*(*/ 54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define SEGMENT_CLOSE ((UChar)0x0029) /*)*/ 55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define CONTEXT_ANTE ((UChar)0x007B) /*{*/ 56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define CONTEXT_POST ((UChar)0x007D) /*}*/ 57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define CURSOR_POS ((UChar)0x007C) /*|*/ 58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define CURSOR_OFFSET ((UChar)0x0040) /*@*/ 59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ANCHOR_START ((UChar)0x005E) /*^*/ 60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define KLEENE_STAR ((UChar)0x002A) /***/ 61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ONE_OR_MORE ((UChar)0x002B) /*+*/ 62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ZERO_OR_ONE ((UChar)0x003F) /*?*/ 63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define DOT ((UChar)46) /*.*/ 65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]"; 67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90, 68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 108, 58, 93, 92, 114, 92, 110, 36, 93, 0 69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// A function is denoted &Source-Target/Variant(text) 72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define FUNCTION ((UChar)38) /*&*/ 73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Aliases for some of the syntax characters. These are provided so 75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// transliteration rules can be expressed in XML without clashing with 76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// XML syntax characters '<', '>', and '&'. 77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow 78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow 79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow 80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta) 81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Special characters disallowed at the top level 83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar ILLEGAL_TOP[] = {41,0}; // ")" 84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Special characters disallowed within a segment 86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@" 87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Special characters disallowed within a function argument 89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@" 90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// By definition, the ANCHOR_END special character is a 92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// trailing SymbolTable.SYMBOL_REF character. 93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// private static final char ANCHOR_END = '$'; 94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar gOPERATORS[] = { // "=><" 96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 0 99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar HALF_ENDERS[] = { // "=><;" 102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) END_OF_RULE, 105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 0 106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// These are also used in Transliterator::toRules() 109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const int32_t ID_TOKEN_LEN = 2; 110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' 111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)commented out until we do real ::BEGIN/::END functionality 114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const int32_t BEGIN_TOKEN_LEN = 5; 115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN' 116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const int32_t END_TOKEN_LEN = 3; 118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END' 119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_BEGIN 122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// BEGIN ParseData 125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This class implements the SymbolTable interface. It is used 129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * during parsing to give UnicodeSet access to variables that 130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * have been defined so far. Note that it uses variablesVector, 131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * _not_ data.setVariables. 132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)class ParseData : public UMemory, public SymbolTable { 134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)public: 135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const TransliterationRuleData* data; // alias 136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UVector* variablesVector; // alias 138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const Hashtable* variableNames; // alias 140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParseData(const TransliterationRuleData* data = 0, 142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UVector* variablesVector = 0, 143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const Hashtable* variableNames = 0); 144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) virtual const UnicodeString* lookup(const UnicodeString& s) const; 146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) virtual UnicodeString parseReference(const UnicodeString& text, 150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParsePosition& pos, int32_t limit) const; 151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if the given character is a matcher standin or a plain 153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * character (non standin). 154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isMatcher(UChar32 ch); 156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if the given character is a replacer standin or a plain 159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * character (non standin). 160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isReplacer(UChar32 ch); 162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)private: 164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParseData(const ParseData &other); // forbid copying of this class 165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParseData &operator=(const ParseData &other); // forbid copying of this class 166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)ParseData::ParseData(const TransliterationRuleData* d, 169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UVector* sets, 170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const Hashtable* vNames) : 171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data(d), variablesVector(sets), variableNames(vNames) {} 172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Implement SymbolTable API. 175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)const UnicodeString* ParseData::lookup(const UnicodeString& name) const { 177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (const UnicodeString*) variableNames->get(name); 178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Implement SymbolTable API. 182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { 184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Note that we cannot use data.lookupSet() because the 185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // set array has not been constructed yet. 186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeFunctor* set = NULL; 187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i = ch - data->variablesBase; 188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (i >= 0 && i < variablesVector->size()) { 189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i = ch - data->variablesBase; 190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) set = (i < variablesVector->size()) ? 191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (UnicodeFunctor*) variablesVector->elementAt(i) : 0; 192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return set; 194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Implement SymbolTable API. Parse out a symbol reference 198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * name. 199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeString ParseData::parseReference(const UnicodeString& text, 201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParsePosition& pos, int32_t limit) const { 202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start = pos.getIndex(); 203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i = start; 204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString result; 205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (i < limit) { 206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c = text.charAt(i); 207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++i; 211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (i == start) { // No valid name chars 213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; // Indicate failure with empty string 214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos.setIndex(i); 216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) text.extractBetween(start, i, result); 217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return result; 218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool ParseData::isMatcher(UChar32 ch) { 221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Note that we cannot use data.lookup() because the 222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // set array has not been constructed yet. 223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i = ch - data->variablesBase; 224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (i >= 0 && i < variablesVector->size()) { 225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return f != NULL && f->toMatcher() != NULL; 227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if the given character is a replacer standin or a plain 233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * character (non standin). 234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool ParseData::isReplacer(UChar32 ch) { 236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Note that we cannot use data.lookup() because the 237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // set array has not been constructed yet. 238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int i = ch - data->variablesBase; 239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (i >= 0 && i < variablesVector->size()) { 240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return f != NULL && f->toReplacer() != NULL; 242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// BEGIN RuleHalf 248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * A class representing one side of a rule. This class knows how to 252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * parse half of a rule. It is tightly coupled to the method 253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * RuleBasedTransliterator.Parser.parseRule(). 254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)class RuleHalf : public UMemory { 256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)public: 258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString text; 260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cursor; // position of cursor in text 262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t ante; // position of ante context marker '{' in text 263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t post; // position of post context marker '}' in text 264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Record the offset to the cursor either to the left or to the 266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // right of the key. This is indicated by characters on the output 267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // side that allow the cursor to be positioned arbitrarily within 268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // the matching text. For example, abc{def} > | @@@ xyz; changes 269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // def to xyz and moves the cursor to before abc. Offset characters 270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // must be at the start or end, and they cannot move the cursor past 271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // the ante- or postcontext text. Placeholders are only valid in 272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // output text. The length of the ante and post context is 273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // determined at runtime, because of supplementals and quantifiers. 274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cursorOffset; // only nonzero on output side 275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Position of first CURSOR_OFFSET on _right_. This will be -1 277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t cursorOffsetPos; 279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool anchorStart; 281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool anchorEnd; 282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The segment number from 1..n of the next '(' we see 285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * during parsing; 1-based. 286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t nextSegmentNumber; 288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliteratorParser& parser; 290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //-------------------------------------------------- 292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Methods 293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf(TransliteratorParser& parser); 295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ~RuleHalf(); 296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString& buf, 301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString& illegal, 302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isSegment, 303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status); 304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Remove context. 307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) void removeContext(); 309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if this half looks like valid output, that is, does not 312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * contain quantifiers or other special input-only elements. 313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isValidOutput(TransliteratorParser& parser); 315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /** 317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if this half looks like valid input, that is, does not 318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * contain functions or other special output-only elements. 319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isValidInput(TransliteratorParser& parser); 321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int syntaxError(UErrorCode code, 323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString& rule, 324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start, 325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status) { 326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return parser.syntaxError(code, rule, start, status); 327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)private: 330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Disallowed methods; no impl. 331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf(const RuleHalf&); 332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf& operator=(const RuleHalf&); 333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}; 334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)RuleHalf::RuleHalf(TransliteratorParser& p) : 336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parser(p) 337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles){ 338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursor = -1; 339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ante = -1; 340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) post = -1; 341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursorOffset = 0; 342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursorOffsetPos = 0; 343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) anchorStart = anchorEnd = FALSE; 344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) nextSegmentNumber = 1; 345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)RuleHalf::~RuleHalf() { 348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Parse one side of a rule, stopping at either the limit, 352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the END_OF_RULE character, or an operator. 353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return the index after the terminating character, or 354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * if limit was reached, limit 355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start = pos; 358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) text.truncate(0); 359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = parseSection(rule, pos, limit, text, ILLEGAL_TOP, FALSE, status); 360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (cursorOffset > 0 && cursor != cursorOffsetPos) { 362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Parse a section of one side of a rule, stopping at either 370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the limit, the END_OF_RULE character, an operator, or a 371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * segment close character. This method parses both a 372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * top-level rule half and a segment within such a rule half. 373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * It calls itself recursively to parse segments and nested 374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * segments. 375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param buf buffer into which to accumulate the rule pattern 376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * characters, either literal characters from the rule or 377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * standins for UnicodeMatcher objects including segments. 378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param illegal the set of special characters that is illegal during 379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * this parse. 380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param isSegment if true, then we've already seen a '(' and 381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * pos on entry points right after it. Accumulate everything 382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * up to the closing ')', put it in a segment matcher object, 383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * generate a standin for it, and add the standin to buf. As 384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * a side effect, update the segments vector with a reference 385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * to the segment matcher. This works recursively for nested 386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * segments. If isSegment is false, just accumulate 387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * characters into buf. 388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return the index after the terminating character, or 389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * if limit was reached, limit 390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString& buf, 393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString& illegal, 394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool isSegment, UErrorCode& status) { 395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start = pos; 396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParsePosition pp; 397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString scratch; 398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool done = FALSE; 399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t quoteStart = -1; // Most recent 'single quoted string' 400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t quoteLimit = -1; 401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t varStart = -1; // Most recent $variableReference 402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t varLimit = -1; 403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t bufStart = buf.length(); 404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (pos < limit && !done) { 406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Since all syntax characters are in the BMP, fetching 407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 16-bit code units suffices here. 408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c = rule.charAt(pos++); 409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (uprv_isRuleWhiteSpace(c)) { 410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Ignore whitespace. Note that this is not Unicode 411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // spaces, but Java spaces -- a subset, representing 412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // whitespace likely to be seen in code. 413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (u_strchr(HALF_ENDERS, c) != NULL) { 416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (isSegment) { 417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Unclosed segment 418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); 419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (anchorEnd) { 423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Text after a presumed end anchor is a syntax err 424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); 425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (UnicodeSet::resemblesPattern(rule, pos-1)) { 427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pp.setIndex(pos-1); // Backup to opening '[' 428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.parseSet(rule, pp, status)); 429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(status)) { 430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_SET, rule, start, status); 431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = pp.getIndex(); 433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Handle escapes 436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == ESCAPE) { 437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos == limit) { 438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); 439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' 441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (escaped == (UChar32) -1) { 442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); 443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!parser.checkVariableRange(escaped)) { 445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(escaped); 448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Handle quoted matter 451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == QUOTE) { 452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t iq = rule.indexOf(QUOTE, pos); 453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (iq == pos) { 454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(c); // Parse [''] outside quotes as ['] 455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++pos; 456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* This loop picks up a run of quoted text of the 458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * form 'aaaa' each time through. If this run 459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * hasn't really ended ('aaaa''bbbb') then it keeps 460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * looping, each time adding on a new run. When it 461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * reaches the final quote it breaks. 462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) quoteStart = buf.length(); 464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (;;) { 465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (iq < 0) { 466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); 467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) scratch.truncate(0); 469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule.extractBetween(pos, iq, scratch); 470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(scratch); 471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = iq+1; 472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos < limit && rule.charAt(pos) == QUOTE) { 473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Parse [''] inside quotes as ['] 474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) iq = rule.indexOf(QUOTE, pos+1); 475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Continue looping 476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) quoteLimit = buf.length(); 481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (iq=quoteStart; iq<quoteLimit; ++iq) { 483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!parser.checkVariableRange(buf.charAt(iq))) { 484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!parser.checkVariableRange(c)) { 492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (illegal.indexOf(c) >= 0) { 496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); 497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) switch (c) { 500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Elements allowed within and out of segments 503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ANCHOR_START: 505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf.length() == 0 && !anchorStart) { 506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) anchorStart = TRUE; 507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_ANCHOR_START, 509f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule, start, status); 510f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 511f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 512f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case SEGMENT_OPEN: 513f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 514f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // bufSegStart is the offset in buf to the first 515f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // character of the segment we are parsing. 516f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t bufSegStart = buf.length(); 517f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 518f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Record segment number now, since nextSegmentNumber 519f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // will be incremented during the call to parseSection 520f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // if there are nested segments. 521f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t segmentNumber = nextSegmentNumber++; // 1-based 522f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 523f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Parse the segment 524f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = parseSection(rule, pos, limit, buf, ILLEGAL_SEG, TRUE, status); 525f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 526f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // After parsing a segment, the relevant characters are 527f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // in buf, starting at offset bufSegStart. Extract them 528f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // into a string matcher, and replace them with a 529f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // standin for that matcher. 530f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) StringMatcher* m = 531f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) new StringMatcher(buf, bufSegStart, buf.length(), 532f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentNumber, *parser.curData); 533f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (m == NULL) { 534f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 535f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 536f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 537f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Record and associate object and segment number 538f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parser.setSegmentObject(segmentNumber, m, status); 539f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.truncate(bufSegStart); 540f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.getSegmentStandin(segmentNumber, status)); 541f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 542f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 543f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case FUNCTION: 544f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ALT_FUNCTION: 545f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 546f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t iref = pos; 547f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliteratorIDParser::SingleID* single = 548f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliteratorIDParser::parseFilterID(rule, iref); 549f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The next character MUST be a segment open 550f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (single == NULL || 551f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { 552f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_INVALID_FUNCTION, rule, start, status); 553f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 554f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 555f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) Transliterator *t = single->createInstance(); 556f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete single; 557f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (t == NULL) { 558f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_INVALID_FUNCTION, rule, start, status); 559f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 560f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 561f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // bufSegStart is the offset in buf to the first 562f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // character of the segment we are parsing. 563f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t bufSegStart = buf.length(); 564f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 565f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Parse the segment 566f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = parseSection(rule, iref, limit, buf, ILLEGAL_FUNC, TRUE, status); 567f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 568f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // After parsing a segment, the relevant characters are 569f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // in buf, starting at offset bufSegStart. 570f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString output; 571f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.extractBetween(bufSegStart, buf.length(), output); 572f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) FunctionReplacer *r = 573f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) new FunctionReplacer(t, new StringReplacer(output, parser.curData)); 574f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (r == NULL) { 575f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 576f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 577f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 578f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Replace the buffer contents with a stand-in 579f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.truncate(bufSegStart); 580f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.generateStandInFor(r, status)); 581f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 582f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 583f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case SymbolTable::SYMBOL_REF: 584f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Handle variable references and segment references "$1" .. "$9" 585f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 586f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // A variable reference must be followed immediately 587f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // by a Unicode identifier start and zero or more 588f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Unicode identifier part characters, or by a digit 589f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 1..9 if it is a segment reference. 590f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos == limit) { 591f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // A variable ref character at the end acts as 592f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // an anchor to the context limit, as in perl. 593f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) anchorEnd = TRUE; 594f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 595f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 596f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Parse "$1" "$2" .. "$9" .. (no upper limit) 597f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = rule.charAt(pos); 598f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t r = u_digit(c, 10); 599f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (r >= 1 && r <= 9) { 600f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) r = ICU_Utility::parseNumber(rule, pos, 10); 601f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (r < 0) { 602f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, 603f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule, start, status); 604f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 605f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.getSegmentStandin(r, status)); 606f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 607f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pp.setIndex(pos); 608f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString name = parser.parseData-> 609f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseReference(rule, pp, limit); 610f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (name.length() == 0) { 611f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // This means the '$' was not followed by a 612f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // valid name. Try to interpret it as an 613f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // end anchor then. If this also doesn't work 614f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // (if we see a following character) then signal 615f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // an error. 616f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) anchorEnd = TRUE; 617f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 618f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 619f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = pp.getIndex(); 620f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If this is a variable definition statement, 621f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // then the LHS variable will be undefined. In 622f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // that case appendVariableDef() will append the 623f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // special placeholder char variableLimit-1. 624f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) varStart = buf.length(); 625f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parser.appendVariableDef(name, buf, status); 626f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) varLimit = buf.length(); 627f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 628f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 629f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 630f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case DOT: 631f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.getDotStandIn(status)); 632f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 633f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case KLEENE_STAR: 634f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ONE_OR_MORE: 635f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ZERO_OR_ONE: 636f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Quantifiers. We handle single characters, quoted strings, 637f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // variable references, and segments. 638f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // a+ matches aaa 639f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // 'foo'+ matches foofoofoo 640f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // $v+ matches xyxyxy if $v == xy 641f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // (seg)+ matches segsegseg 642f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 643f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (isSegment && buf.length() == bufStart) { 644f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The */+ immediately follows '(' 645f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); 646f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 647f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 648f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t qstart, qlimit; 649f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The */+ follows an isolated character or quote 650f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or variable reference 651f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf.length() == quoteLimit) { 652f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The */+ follows a 'quoted string' 653f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qstart = quoteStart; 654f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qlimit = quoteLimit; 655f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else if (buf.length() == varLimit) { 656f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The */+ follows a $variableReference 657f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qstart = varStart; 658f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qlimit = varLimit; 659f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 660f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The */+ follows a single character, possibly 661f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // a segment standin 662f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qstart = buf.length() - 1; 663f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) qlimit = qstart + 1; 664f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 665f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 666f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeFunctor *m = 667f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) new StringMatcher(buf, qstart, qlimit, 0, *parser.curData); 668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (m == NULL) { 669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t min = 0; 672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t max = Quantifier::MAX; 673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) switch (c) { 674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ONE_OR_MORE: 675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) min = 1; 676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ZERO_OR_ONE: 678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) min = 0; 679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) max = 1; 680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // case KLEENE_STAR: 682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // do nothing -- min, max already set 683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 684f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) m = new Quantifier(m, min, max); 685f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (m == NULL) { 686f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 687f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 688f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.truncate(qstart); 689f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(parser.generateStandInFor(m, status)); 690f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 691f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 692f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 693f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 694f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Elements allowed ONLY WITHIN segments 695f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 696f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case SEGMENT_CLOSE: 697f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // assert(isSegment); 698f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We're done parsing a segment. 699f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) done = TRUE; 700f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 701f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 702f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 703f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Elements allowed ONLY OUTSIDE segments 704f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 705f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case CONTEXT_ANTE: 706f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (ante >= 0) { 707f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); 708f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 709f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ante = buf.length(); 710f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 711f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case CONTEXT_POST: 712f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (post >= 0) { 713f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); 714f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 715f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) post = buf.length(); 716f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 717f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case CURSOR_POS: 718f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (cursor >= 0) { 719f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); 720f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 721f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursor = buf.length(); 722f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 723f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case CURSOR_OFFSET: 724f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (cursorOffset < 0) { 725f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf.length() > 0) { 726f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 727f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 728f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --cursorOffset; 729f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else if (cursorOffset > 0) { 730f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (buf.length() != cursorOffsetPos || cursor >= 0) { 731f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 732f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 733f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++cursorOffset; 734f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 735f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (cursor == 0 && buf.length() == 0) { 736f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursorOffset = -1; 737f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else if (cursor < 0) { 738f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursorOffsetPos = buf.length(); 739f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) cursorOffset = 1; 740f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 741f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 742f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 743f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 744f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 745f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 746f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 747f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 748f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Non-special characters 749f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //------------------------------------------------------ 750f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) default: 751f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Disallow unquoted characters other than [0-9A-Za-z] 752f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // in the printable ASCII range. These characters are 753f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // reserved for possible future use. 754f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c >= 0x0021 && c <= 0x007E && 755f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 756f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 757f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { 758f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 759f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 760f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(c); 761f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 762f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 763f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 764f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 765f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 766f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 767f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 768f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 769f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Remove context. 770f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 771f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void RuleHalf::removeContext() { 772f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //text = text.substring(ante < 0 ? 0 : ante, 773f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // post < 0 ? text.length() : post); 774f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (post >= 0) { 775f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) text.remove(post); 776f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 777f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (ante >= 0) { 778f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) text.removeBetween(0, ante); 779f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 780f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ante = post = -1; 781f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) anchorStart = anchorEnd = FALSE; 782f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 783f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 784f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 785f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if this half looks like valid output, that is, does not 786f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * contain quantifiers or other special input-only elements. 787f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 788f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { 789f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (int32_t i=0; i<text.length(); ) { 790f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c = text.char32At(i); 791f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i += UTF_CHAR_LENGTH(c); 792f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!transParser.parseData->isReplacer(c)) { 793f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return FALSE; 794f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 795f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 796f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 797f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 798f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 799f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 800f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if this half looks like valid input, that is, does not 801f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * contain functions or other special output-only elements. 802f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 803f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { 804f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (int32_t i=0; i<text.length(); ) { 805f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c = text.char32At(i); 806f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) i += UTF_CHAR_LENGTH(c); 807f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!transParser.parseData->isMatcher(c)) { 808f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return FALSE; 809f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 810f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 811f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return TRUE; 812f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 813f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 814f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 815f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// PUBLIC API 816f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 817f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 818f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 819f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Constructor. 820f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 821f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : 822f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)dataVector(statusReturn), 823f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)idBlockVector(statusReturn), 824f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)variablesVector(statusReturn), 825f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)segmentObjects(statusReturn) 826f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles){ 827f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.setDeleter(uhash_deleteUnicodeString); 828f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData = NULL; 829f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilter = NULL; 830f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseData = NULL; 831f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variableNames.setValueDeleter(uhash_deleteUnicodeString); 832f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 833f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 834f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 835f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Destructor. 836f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 837f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)TransliteratorParser::~TransliteratorParser() { 838f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (!dataVector.isEmpty()) 839f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 840f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete compoundFilter; 841f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete parseData; 842f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (!variablesVector.isEmpty()) 843f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 844f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 845f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 846f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void 847f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)TransliteratorParser::parse(const UnicodeString& rules, 848f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UTransDirection transDirection, 849f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UParseError& pe, 850f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& ec) { 851f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_SUCCESS(ec)) { 852f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseRules(rules, transDirection, ec); 853f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pe = parseError; 854f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 855f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 856f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 857f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 858f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return the compound filter parsed by parse(). Caller owns result. 859f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 860f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UnicodeSet* TransliteratorParser::orphanCompoundFilter() { 861f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet* f = compoundFilter; 862f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilter = NULL; 863f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return f; 864f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 865f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 866f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 867f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// Private implementation 868f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)//---------------------------------------------------------------------- 869f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 870f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 871f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Parse the given string as a sequence of rules, separated by newline 872f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * characters ('\n'), and cause this object to implement those rules. Any 873f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * previous rules are discarded. Typically this method is called exactly 874f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * once, during construction. 875f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @exception IllegalArgumentException if there is a syntax error in the 876f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * rules 877f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 878f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::parseRules(const UnicodeString& rule, 879f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UTransDirection theDirection, 880f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status) 881f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles){ 882f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Clear error struct 883f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memset(&parseError, 0, sizeof(parseError)); 884f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseError.line = parseError.offset = -1; 885f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 886f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool parsingIDs = TRUE; 887f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t ruleCount = 0; 888f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 889f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (!dataVector.isEmpty()) { 890f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 891f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 892f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(status)) { 893f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 894f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 895f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 896f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.removeAllElements(); 897f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData = NULL; 898f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) direction = theDirection; 899f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ruleCount = 0; 900f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 901f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete compoundFilter; 902f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilter = NULL; 903f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 904f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (!variablesVector.isEmpty()) { 905f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 906f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 907f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variableNames.removeAll(); 908f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseData = new ParseData(0, &variablesVector, &variableNames); 909f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (parseData == NULL) { 910f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 911f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 912f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 913f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 914f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dotStandIn = (UChar) -1; 915f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 916f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString *tempstr = NULL; // used for memory allocation error checking 917f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString str; // scratch 918f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString idBlockResult; 919f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos = 0; 920f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t limit = rule.length(); 921f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 922f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // The compound filter offset is an index into idBlockResult. 923f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If it is 0, then the compound filter occurred at the start, 924f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and it is the offset to the _start_ of the compound filter 925f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // pattern. Otherwise it is the offset to the _limit_ of the 926f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // compound filter pattern within idBlockResult. 927f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilter = NULL; 928f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t compoundFilterOffset = -1; 929f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 930f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (pos < limit && U_SUCCESS(status)) { 931f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c = rule.charAt(pos++); 932f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (uprv_isRuleWhiteSpace(c)) { 933f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Ignore leading whitespace. 934f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 935f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 936f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Skip lines starting with the comment character 937f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == RULE_COMMENT_CHAR) { 938f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1; 939f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos == 0) { 940f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; // No "\n" found; rest of rule is a commnet 941f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 942f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; // Either fall out or restart with next line 943f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 944f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 945f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // skip empty rules 946f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == END_OF_RULE) 947f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 948f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 949f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // keep track of how many rules we've seen 950f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++ruleCount; 951f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 952f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We've found the start of a rule or ID. c is its first 953f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // character, and pos points past c. 954f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) --pos; 955f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 956f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // chars left. 957f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((pos + ID_TOKEN_LEN + 1) <= limit && 958f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { 959f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos += ID_TOKEN_LEN; 960f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = rule.charAt(pos); 961f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (uprv_isRuleWhiteSpace(c) && pos < limit) { 962f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++pos; 963f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = rule.charAt(pos); 964f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 965f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 966f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t p = pos; 967f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 968f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!parsingIDs) { 969f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (curData != NULL) { 970f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_FORWARD) 971f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dataVector.addElement(curData, status); 972f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else 973f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dataVector.insertElementAt(curData, 0, status); 974f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData = NULL; 975f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 976f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parsingIDs = TRUE; 977f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 978f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 979f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliteratorIDParser::SingleID* id = 980f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliteratorIDParser::parseSingleID(rule, p, direction, status); 981f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { 982f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Successful ::ID parse. 983f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 984f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_FORWARD) { 985f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockResult.append(id->canonID).append(END_OF_RULE); 986f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 987f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockResult.insert(0, END_OF_RULE); 988f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockResult.insert(0, id->canonID); 989f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 990f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 991f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 992f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Couldn't parse an ID. Try to parse a global filter 993f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t withParens = -1; 994f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL); 995f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (f != NULL) { 996f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (ICU_Utility::parseChar(rule, p, END_OF_RULE) 997f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) && (direction == UTRANS_FORWARD) == (withParens == 0)) 998f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 999f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (compoundFilter != NULL) { 1000f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Multiple compound filters 1001f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); 1002f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete f; 1003f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1004f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilter = f; 1005f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) compoundFilterOffset = ruleCount; 1006f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1007f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1008f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete f; 1009f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1010f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1011f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Invalid ::id 1012f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Can be parsed as neither an ID nor a global filter 1013f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_INVALID_ID, rule, pos, status); 1014f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1015f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1016f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete id; 1017f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = p; 1018f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1019f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (parsingIDs) { 1020f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tempstr = new UnicodeString(idBlockResult); 1021f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NULL pointer check 1022f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (tempstr == NULL) { 1023f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1024f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1025f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1026f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_FORWARD) 1027f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.addElement(tempstr, status); 1028f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else 1029f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.insertElementAt(tempstr, 0, status); 1030f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockResult.remove(); 1031f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parsingIDs = FALSE; 1032f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData = new TransliterationRuleData(status); 1033f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NULL pointer check 1034f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (curData == NULL) { 1035f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1036f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1037f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1038f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseData->data = curData; 1039f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1040f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // By default, rules use part of the private use area 1041f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // E000..F8FF for variables and other stand-ins. Currently 1042f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // the range F000..F8FF is typically sufficient. The 'use 1043f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // variable range' pragma allows rule sets to modify this. 1044f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) setVariableRange(0xF000, 0xF8FF, status); 1045f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1046f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1047f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (resemblesPragma(rule, pos, limit)) { 1048f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t ppp = parsePragma(rule, pos, limit, status); 1049f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (ppp < 0) { 1050f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); 1051f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1052f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = ppp; 1053f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Parse a rule 1054f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1055f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = parseRule(rule, pos, limit, status); 1056f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1057f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1058f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1059f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1060f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (parsingIDs && idBlockResult.length() > 0) { 1061f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tempstr = new UnicodeString(idBlockResult); 1062f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NULL pointer check 1063f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (tempstr == NULL) { 1064f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1065f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1066f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1067f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_FORWARD) 1068f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.addElement(tempstr, status); 1069f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else 1070f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.insertElementAt(tempstr, 0, status); 1071f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1072f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else if (!parsingIDs && curData != NULL) { 1073f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_FORWARD) 1074f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dataVector.addElement(curData, status); 1075f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else 1076f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dataVector.insertElementAt(curData, 0, status); 1077f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1078f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1079f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_SUCCESS(status)) { 1080f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Convert the set vector to an array 1081f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i, dataVectorSize = dataVector.size(); 1082f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i = 0; i < dataVectorSize; i++) { 1083f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1084f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variablesLength = variablesVector.size(); 1085f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (data->variablesLength == 0) { 1086f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variables = 0; 1087f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1088f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*)); 1089f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NULL pointer check 1090f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (data->variables == NULL) { 1091f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1092f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1093f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1094f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variablesAreOwned = (i == 0); 1095f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1096f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1097f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (int32_t j = 0; j < data->variablesLength; j++) { 1098f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variables[j] = 1099f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((UnicodeSet*)variablesVector.elementAt(j)); 1100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variableNames.removeAll(); 1103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos = -1; 1104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UHashElement* he = variableNames.nextElement(pos); 1105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (he != NULL) { 1106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone()); 1107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (tempus == NULL) { 1108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->variableNames.put(*((UnicodeString*)(he->key.pointer)), 1112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) tempus, status); 1113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) he = variableNames.nextElement(pos); 1114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed 1117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Index the rules 1119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (compoundFilter != NULL) { 1120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || 1121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { 1122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MISPLACED_COMPOUND_FILTER; 1123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i = 0; i < dataVectorSize; i++) { 1127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) data->ruleSet.freeze(parseError, status); 1129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) { 1131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) idBlockVector.removeElementAt(0); 1132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Set the variable range to [start, end] (inclusive). 1138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { 1140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (start > end || start < 0 || end > 0xFFFF) { 1141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MALFORMED_PRAGMA; 1142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData->variablesBase = (UChar) start; 1146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (dataVector.size() == 0) { 1147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variableNext = (UChar) start; 1148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variableLimit = (UChar) (end + 1); 1149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Assert that the given character is NOT within the variable range. 1154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * If it is, return FALSE. This is neccesary to ensure that the 1155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * variable range does not overlap characters used in a rule. 1156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { 1158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return !(ch >= curData->variablesBase && ch < variableLimit); 1159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Set the maximum backup to 'backup', in response to a pragma 1163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * statement. 1164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { 1166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //TODO Finish 1167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Begin normalizing all rules using the given mode, in response 1171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * to a pragma statement. 1172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { 1174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //TODO Finish 1175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " 1178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" 1180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" 1182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" 1184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" 1186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return true if the given rule looks like a pragma. 1189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param pos offset to the first non-whitespace character 1190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the rule. 1191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param limit pointer past the last character of the rule. 1192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { 1194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Must start with /use\s/i 1195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_USE, NULL) >= 0; 1196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Parse a pragma. This method assumes resemblesPragma() has 1200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * already returned true. 1201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param pos offset to the first non-whitespace character 1202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of the rule. 1203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param limit pointer past the last character of the rule. 1204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return the position index after the final ';' of the pragma, 1205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * or -1 on failure. 1206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t array[2]; 1209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // resemblesPragma() has already returned true, so we 1211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // know that pos points to /use\s/i; we can skip 4 characters 1212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // immediately 1213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos += 4; 1214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Here are the pragmas we recognize: 1216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // use variable range 0xE000 0xEFFF; 1217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // use maximum backup 16; 1218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // use nfd rules; 1219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // use nfc rules; 1220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_VARIABLE_RANGE, array); 1221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (p >= 0) { 1222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) setVariableRange(array[0], array[1], status); 1223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return p; 1224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_MAXIMUM_BACKUP, array); 1227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (p >= 0) { 1228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pragmaMaximumBackup(array[0]); 1229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return p; 1230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFD_RULES, NULL); 1233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (p >= 0) { 1234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pragmaNormalizeRules(UNORM_NFD); 1235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return p; 1236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFC_RULES, NULL); 1239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (p >= 0) { 1240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pragmaNormalizeRules(UNORM_NFC); 1241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return p; 1242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Syntax error: unable to parse pragma 1245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return -1; 1246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * MAIN PARSER. Parse the next rule in the given rule string, starting 1250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * at pos. Return the index after the last character parsed. Do not 1251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * parse characters at or after limit. 1252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Important: The character at pos must be a non-whitespace character 1254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * that is not the comment character. 1255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 1256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This method handles quoting, escaping, and whitespace removal. It 1257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * parses the end-of-rule character. It recognizes context and cursor 1258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * indicators. Once it does a lexical breakdown of the rule at pos, it 1259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * creates a rule object and adds it to our rule list. 1260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Locate the left side, operator, and right side 1263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start = pos; 1264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar op = 0; 1265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t i; 1266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Set up segments data 1268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentStandins.truncate(0); 1269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentObjects.removeAllElements(); 1270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Use pointers to automatics to make swapping possible. 1272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf _left(*this), _right(*this); 1273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf* left = &_left; 1274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) RuleHalf* right = &_right; 1275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) undefinedVariableName.remove(); 1277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = left->parse(rule, pos, limit, status); 1278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(status)) { 1279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return start; 1280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) { 1283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MISSING_OPERATOR, rule, start, status); 1284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++pos; 1286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Found an operator char. Check for forward-reverse operator. 1288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (op == REVERSE_RULE_OP && 1289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++pos; 1291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) op = FWDREV_RULE_OP; 1292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Translate alternate op characters. 1295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) switch (op) { 1296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ALT_FORWARD_RULE_OP: 1297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) op = FORWARD_RULE_OP; 1298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ALT_REVERSE_RULE_OP: 1300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) op = REVERSE_RULE_OP; 1301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case ALT_FWDREV_RULE_OP: 1303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) op = FWDREV_RULE_OP; 1304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) pos = right->parse(rule, pos, limit, status); 1308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (U_FAILURE(status)) { 1309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return start; 1310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (pos < limit) { 1313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (rule.charAt(--pos) == END_OF_RULE) { 1314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++pos; 1315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1316f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // RuleHalf parser must have terminated at an operator 1317f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 1318f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1319f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1320f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1321f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (op == VARIABLE_DEF_OP) { 1322f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // LHS is the name. RHS is a single character, either a literal 1323f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // or a set (already parsed). If RHS is longer than one 1324f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // character, it is either a multi-character string, or multiple 1325f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // sets, or a mixture of chars and sets -- syntax error. 1326f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1327f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We expect to see a single undefined variable (the one being 1328f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // defined). 1329f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (undefinedVariableName.length() == 0) { 1330f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // "Missing '$' or duplicate definition" 1331f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); 1332f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1333f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { 1334f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // "Malformed LHS" 1335f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1336f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1337f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (left->anchorStart || left->anchorEnd || 1338f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) right->anchorStart || right->anchorEnd) { 1339f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1340f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1341f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We allow anything on the right, including an empty string. 1342f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString* value = new UnicodeString(right->text); 1343f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // NULL pointer check 1344f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (value == NULL) { 1345f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1346f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1347f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variableNames.put(undefinedVariableName, value, status); 1348f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++variableLimit; 1349f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 1350f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1351f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1352f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If this is not a variable definition rule, we shouldn't have 1353f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // any undefined variable names. 1354f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (undefinedVariableName.length() != 0) { 1355f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(// "Undefined variable $" + undefinedVariableName, 1356f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U_UNDEFINED_VARIABLE, 1357f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule, start, status); 1358f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1359f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1360f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Verify segments 1361f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentStandins.length() > segmentObjects.size()) { 1362f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); 1363f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1364f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i=0; i<segmentStandins.length(); ++i) { 1365f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentStandins.charAt(i) == 0) { 1366f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1367f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1368f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1369f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (i=0; i<segmentObjects.size(); ++i) { 1370f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentObjects.elementAt(i) == NULL) { 1371f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1372f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1373f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1374f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1375f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // If the direction we want doesn't match the rule 1376f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // direction, do nothing. 1377f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (op != FWDREV_RULE_OP && 1378f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) { 1379f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 1380f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1381f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1382f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Transform the rule into a forward rule by swapping the 1383f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // sides if necessary. 1384f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (direction == UTRANS_REVERSE) { 1385f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left = &_right; 1386f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) right = &_left; 1387f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1388f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1389f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Remove non-applicable elements in forward-reverse 1390f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // rules. Bidirectional rules ignore elements that do not 1391f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // apply. 1392f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (op == FWDREV_RULE_OP) { 1393f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) right->removeContext(); 1394f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->cursor = -1; 1395f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->cursorOffset = 0; 1396f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1397f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1398f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Normalize context 1399f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (left->ante < 0) { 1400f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->ante = 0; 1401f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1402f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (left->post < 0) { 1403f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->post = left->text.length(); 1404f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1405f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1406f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Context is only allowed on the input side. Cursors are only 1407f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // allowed on the output side. Segment delimiters can only appear 1408f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // on the left, and references on the right. Cursor offset 1409f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cannot appear without an explicit cursor. Cursor offset 1410f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // cannot place the cursor outside the limits of the context. 1411f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Anchors are only allowed on the input side. 1412f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || 1413f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (right->cursorOffset != 0 && right->cursor < 0) || 1414f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // - The following two checks were used to ensure that the 1415f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // - the cursor offset stayed within the ante- or postcontext. 1416f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // - However, with the addition of quantifiers, we have to 1417f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // - allow arbitrary cursor offsets and do runtime checking. 1418f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //(right->cursorOffset > (left->text.length() - left->post)) || 1419f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //(-right->cursorOffset > left->ante) || 1420f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) right->anchorStart || right->anchorEnd || 1421f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) !left->isValidInput(*this) || !right->isValidOutput(*this) || 1422f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->ante > left->post) { 1423f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1424f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MALFORMED_RULE, rule, start, status); 1425f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1426f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1427f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Flatten segment objects vector to an array 1428f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeFunctor** segmentsArray = NULL; 1429f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentObjects.size() > 0) { 1430f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *)); 1431f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Null pointer check 1432f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentsArray == NULL) { 1433f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1434f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1435f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentObjects.toArray((void**) segmentsArray); 1436f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1437f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) TransliterationRule* temptr = new TransliterationRule( 1438f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->text, left->ante, left->post, 1439f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) right->text, right->cursor, right->cursorOffset, 1440f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentsArray, 1441f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentObjects.size(), 1442f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) left->anchorStart, left->anchorEnd, 1443f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData, 1444f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status); 1445f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //Null pointer check 1446f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (temptr == NULL) { 1447f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_free(segmentsArray); 1448f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1449f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1450f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1451f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) curData->ruleSet.addRule(temptr, status); 1452f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1453f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 1454f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1455f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1456f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1457f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Called by main parser upon syntax error. Search the rule string 1458f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for the probable end of the rule. Of course, if the error is that 1459f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the end of rule marker is missing, then the rule end will not be found. 1460f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * In any case the rule start will be correctly reported. 1461f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param msg error description 1462f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param rule pattern string 1463f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param start position of first character of current rule 1464f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1465f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, 1466f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString& rule, 1467f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t pos, 1468f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status) 1469f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles){ 1470f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseError.offset = pos; 1471f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseError.line = 0 ; /* we are not using line numbers */ 1472f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1473f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // for pre-context 1474f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; 1475f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t start = uprv_max(pos - LEN, 0); 1476f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t stop = pos; 1477f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1478f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule.extract(start,stop-start,parseError.preContext); 1479f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //null terminate the buffer 1480f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseError.preContext[stop-start] = 0; 1481f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1482f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //for post-context 1483f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) start = pos; 1484f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) stop = uprv_min(pos + LEN, rule.length()); 1485f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1486f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) rule.extract(start,stop-start,parseError.postContext); 1487f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //null terminate the buffer 1488f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) parseError.postContext[stop-start]= 0; 1489f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1490f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = (UErrorCode)parseErrorCode; 1491f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return pos; 1492f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1493f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1494f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1495f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1496f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Parse a UnicodeSet out, store it, and return the stand-in character 1497f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * used to represent it. 1498f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1499f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UChar TransliteratorParser::parseSet(const UnicodeString& rule, 1500f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ParsePosition& pos, 1501f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status) { 1502f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); 1503f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Null pointer check 1504f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (set == NULL) { 1505f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1506f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (UChar)0x0000; // Return empty character with error. 1507f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1508f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) set->compact(); 1509f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return generateStandInFor(set, status); 1510f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1511f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1512f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1513f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Generate and return a stand-in for a new UnicodeFunctor. Store 1514f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * the matcher (adopt it). 1515f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1516f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { 1517f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // assert(obj != null); 1518f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1519f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Look up previous stand-in, if any. This is a short list 1520f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // (typical n is 0, 1, or 2); linear search is optimal. 1521f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) for (int32_t i=0; i<variablesVector.size(); ++i) { 1522f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison 1523f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (UChar) (curData->variablesBase + i); 1524f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1525f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1526f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1527f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (variableNext >= variableLimit) { 1528f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) delete adopted; 1529f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_VARIABLE_RANGE_EXHAUSTED; 1530f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 1531f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1532f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variablesVector.addElement(adopted, status); 1533f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return variableNext++; 1534f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1535f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1536f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1537f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return the standin for segment seg (1-based). 1538f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1539f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { 1540f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Special character used to indicate an empty spot 1541f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar empty = curData->variablesBase - 1; 1542f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (segmentStandins.length() < seg) { 1543f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentStandins.append(empty); 1544f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1545f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar c = segmentStandins.charAt(seg-1); 1546f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == empty) { 1547f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (variableNext >= variableLimit) { 1548f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_VARIABLE_RANGE_EXHAUSTED; 1549f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 1550f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1551f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = variableNext++; 1552f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Set a placeholder in the master variables vector that will be 1553f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // filled in later by setSegmentObject(). We know that we will get 1554f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // called first because setSegmentObject() will call us. 1555f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variablesVector.addElement((void*) NULL, status); 1556f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentStandins.setCharAt(seg-1, c); 1557f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1558f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return c; 1559f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1560f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1561f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1562f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Set the object for segment seg (1-based). 1563f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1564f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { 1565f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Since we call parseSection() recursively, nested 1566f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // segments will result in segment i+1 getting parsed 1567f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // and stored before segment i; be careful with the 1568f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // vector handling here. 1569f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentObjects.size() < seg) { 1570f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentObjects.setSize(seg, status); 1571f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1572f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; 1573f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (segmentObjects.elementAt(seg-1) != NULL || 1574f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variablesVector.elementAt(index) != NULL) { 1575f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // should never happen 1576f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_INTERNAL_TRANSLITERATOR_ERROR; 1577f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1578f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1579f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) segmentObjects.setElementAt(adopted, seg-1); 1580f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) variablesVector.setElementAt(adopted, index); 1581f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1582f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1583f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1584f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Return the stand-in for the dot set. It is allocated the first 1585f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * time and reused thereafter. 1586f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1587f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)UChar TransliteratorParser::getDotStandIn(UErrorCode& status) { 1588f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (dotStandIn == (UChar) -1) { 1589f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeSet* tempus = new UnicodeSet(DOT_SET, status); 1590f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // Null pointer check. 1591f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (tempus == NULL) { 1592f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_MEMORY_ALLOCATION_ERROR; 1593f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (UChar)0x0000; 1594f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1595f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) dotStandIn = generateStandInFor(tempus, status); 1596f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1597f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return dotStandIn; 1598f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1599f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1600f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1601f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Append the value of the given variable name to the given 1602f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * UnicodeString. 1603f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1604f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)void TransliteratorParser::appendVariableDef(const UnicodeString& name, 1605f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString& buf, 1606f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UErrorCode& status) { 1607f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UnicodeString* s = (const UnicodeString*) variableNames.get(name); 1608f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (s == NULL) { 1609f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // We allow one undefined variable so that variable definition 1610f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // statements work. For the first undefined variable we return 1611f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // the special placeholder variableLimit-1, and save the variable 1612f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // name. 1613f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (undefinedVariableName.length() == 0) { 1614f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) undefinedVariableName = name; 1615f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (variableNext >= variableLimit) { 1616f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // throw new RuntimeException("Private use variables exhausted"); 1617f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_ILLEGAL_ARGUMENT_ERROR; 1618f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1619f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1620f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append((UChar) --variableLimit); 1621f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1622f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //throw new IllegalArgumentException("Undefined variable $" 1623f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) // + name); 1624f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) status = U_ILLEGAL_ARGUMENT_ERROR; 1625f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return; 1626f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1627f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { 1628f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) buf.append(*s); 1629f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1630f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1631f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1632f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 1633f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Glue method to get around access restrictions in C++. 1634f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1635f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1636f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return Transliterator::createBasicInstance(id, canonID); 1637f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}*/ 1638f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1639f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_END 1640f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1641f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI int32_t 1642f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) { 1643f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U_NAMESPACE_USE 1644f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1645f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) //const UChar *sourceStart = source; 1646f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *targetStart = target; 1647f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) const UChar *sourceLimit = source+sourceLen; 1648f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar *targetLimit = target+sourceLen; 1649f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c = 0; 1650f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UBool quoted = FALSE; 1651f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t index; 1652f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1653f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR); 1654f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1655f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* read the rules into the buffer */ 1656f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (source < sourceLimit) 1657f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 1658f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) index=0; 1659f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_NEXT_UNSAFE(source, index, c); 1660f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) source+=index; 1661f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if(c == QUOTE) { 1662f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) quoted = (UBool)!quoted; 1663f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1664f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else if (!quoted) { 1665f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == RULE_COMMENT_CHAR) { 1666f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* skip comments and all preceding spaces */ 1667f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (targetStart < target && *(target - 1) == 0x0020) { 1668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) target--; 1669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) do { 1671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = *(source++); 1672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (c != CR && c != LF); 1674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else if (c == ESCAPE) { 1676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UChar32 c2 = *source; 1677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c2 == CR || c2 == LF) { 1678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* A backslash at the end of a line. */ 1679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* Since we're stripping lines, ignore the backslash. */ 1680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) source++; 1681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 1682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */ 1684f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t escapeOffset = 0; 1685f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) UnicodeString escapedStr(source, 5); 1686f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c2 = escapedStr.unescapeAt(escapeOffset); 1687f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1688f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) 1689f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 1690f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *status = U_PARSE_ERROR; 1691f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return 0; 1692f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1693f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (!uprv_isRuleWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) { 1694f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* It was escaped for a reason. Write what it was suppose to be. */ 1695f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) source+=5; 1696f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = c2; 1697f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1698f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1699f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) else if (c2 == QUOTE) { 1700f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* \' seen. Make sure we don't do anything when we see it again. */ 1701f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) quoted = (UBool)!quoted; 1702f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1703f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1704f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1705f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c == CR || c == LF) 1706f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) { 1707f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* ignore spaces carriage returns, and all leading spaces on the next line. 1708f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and line feed unless in the form \uXXXX 1709f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 1710f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) quoted = FALSE; 1711f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) while (source < sourceLimit) { 1712f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) c = *(source); 1713f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (c != CR && c != LF && c != 0x0020) { 1714f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 1715f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1716f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) source++; 1717f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1718f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) continue; 1719f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1720f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1721f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* Append UChar * after dissembling if c > 0xffff*/ 1722f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) index=0; 1723f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U16_APPEND_UNSAFE(target, index, c); 1724f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) target+=index; 1725f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1726f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if (target < targetLimit) { 1727f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *target = 0; 1728f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } 1729f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) return (int32_t)(target-targetStart); 1730f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 1731f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 1732f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1733