1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copyright (C) 2001-2011, International Business Machines Corporation 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and others. All Rights Reserved. 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Date Name Description 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 07/23/01 aliu Creation. 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ********************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef STRMATCH_H 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define STRMATCH_H 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unifunct.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unimatch.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unirepl.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass TransliterationRuleData; 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * An object that matches a fixed input string, implementing the 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeMatcher API. This object also implements the 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeReplacer API, allowing it to emit the matched text as 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * output. Since the match text may contain flexible match elements, 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * such as UnicodeSets, the emitted text is not the match pattern, but 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * instead a substring of the actual matched text. Following 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * convention, the output text is the leftmost match seen up to this 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * point. 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A StringMatcher may represent a segment, in which case it has a 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * positive segment number. This affects how the matcher converts 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * itself to a pattern but does not otherwise affect its function. 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A StringMatcher that is not a segment should not be used as a 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeReplacer. 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public: 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Construct a matcher that matches the given pattern string. 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param string the pattern to be matched, possibly containing 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * stand-ins that represent nested UnicodeMatcher objects. 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param start inclusive start index of text to be replaced 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param limit exclusive end index of text to be replaced; 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * must be greater than or equal to start 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param segmentNum the segment number from 1..n, or 0 if this is 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * not a segment. 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param data context object mapping stand-ins to 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeMatcher objects. 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru StringMatcher(const UnicodeString& string, 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start, 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t limit, 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t segmentNum, 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru const TransliterationRuleData& data); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Copy constructor 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param o the object to be copied. 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru StringMatcher(const StringMatcher& o); 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Destructor 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~StringMatcher(); 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Implement UnicodeFunctor 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return a copy of the object. 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UnicodeFunctor* clone() const; 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and return the pointer. 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return the UnicodeMatcher point. 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UnicodeMatcher* toMatcher() const; 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * and return the pointer. 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return the UnicodeReplacer pointer. 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UnicodeReplacer* toReplacer() const; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Implement UnicodeMatcher 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param text the text to be matched 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param offset on input, the index into text at which to begin 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * matching. On output, the limit of the matched text. The 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * number of matched characters is the output value of offset 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * minus the input value. Offset should always point to the 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * HIGH SURROGATE (leading code unit) of a pair of surrogates, 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * both on entry and upon return. 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param limit the limit index of text to be matched. Greater 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * than offset for a forward direction match, less than offset for 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a backward direction match. The last character to be 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * considered for matching will be text.charAt(limit-1) in the 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * forward direction or text.charAt(limit+1) in the backward 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * direction. 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param incremental if TRUE, then assume further characters may 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * be inserted at limit and check for partial matching. Otherwise 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * assume the text as given is complete. 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @return a match degree value indicating a full match, a partial 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * match, or a mismatch. If incremental is FALSE then 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * U_PARTIAL_MATCH should never be returned. 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual UMatchDegree matches(const Replaceable& text, 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t& offset, 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t limit, 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool incremental); 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Implement UnicodeMatcher 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * @param result Output param to receive the pattern. 125 * @param escapeUnprintable if True then escape the unprintable characters. 126 * @return A reference to 'result'. 127 */ 128 virtual UnicodeString& toPattern(UnicodeString& result, 129 UBool escapeUnprintable = FALSE) const; 130 131 /** 132 * Implement UnicodeMatcher 133 * Returns TRUE if this matcher will match a character c, where c 134 * & 0xFF == v, at offset, in the forward direction (with limit > 135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for 136 * indexing. 137 * @param v the given value 138 * @return TRUE if this matcher will match a character c, 139 * where c & 0xFF == v 140 */ 141 virtual UBool matchesIndexValue(uint8_t v) const; 142 143 /** 144 * Implement UnicodeMatcher 145 */ 146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; 147 148 /** 149 * Implement UnicodeFunctor 150 */ 151 virtual void setData(const TransliterationRuleData*); 152 153 /** 154 * Replace characters in 'text' from 'start' to 'limit' with the 155 * output text of this object. Update the 'cursor' parameter to 156 * give the cursor position and return the length of the 157 * replacement text. 158 * 159 * @param text the text to be matched 160 * @param start inclusive start index of text to be replaced 161 * @param limit exclusive end index of text to be replaced; 162 * must be greater than or equal to start 163 * @param cursor output parameter for the cursor position. 164 * Not all replacer objects will update this, but in a complete 165 * tree of replacer objects, representing the entire output side 166 * of a transliteration rule, at least one must update it. 167 * @return the number of 16-bit code units in the text replacing 168 * the characters at offsets start..(limit-1) in text 169 */ 170 virtual int32_t replace(Replaceable& text, 171 int32_t start, 172 int32_t limit, 173 int32_t& cursor); 174 175 /** 176 * Returns a string representation of this replacer. If the 177 * result of calling this function is passed to the appropriate 178 * parser, typically TransliteratorParser, it will produce another 179 * replacer that is equal to this one. 180 * @param result the string to receive the pattern. Previous 181 * contents will be deleted. 182 * @param escapeUnprintable if TRUE then convert unprintable 183 * character to their hex escape representations, \\uxxxx or 184 * \\Uxxxxxxxx. Unprintable characters are defined by 185 * Utility.isUnprintable(). 186 * @return a reference to 'result'. 187 */ 188 virtual UnicodeString& toReplacerPattern(UnicodeString& result, 189 UBool escapeUnprintable) const; 190 191 /** 192 * Remove any match data. This must be called before performing a 193 * set of matches with this segment. 194 */ 195 void resetMatch(); 196 197 /** 198 * ICU "poor man's RTTI", returns a UClassID for the actual class. 199 */ 200 virtual UClassID getDynamicClassID() const; 201 202 /** 203 * ICU "poor man's RTTI", returns a UClassID for this class. 204 */ 205 static UClassID U_EXPORT2 getStaticClassID(); 206 207 /** 208 * Union the set of all characters that may output by this object 209 * into the given set. 210 * @param toUnionTo the set into which to union the output characters 211 */ 212 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; 213 214 private: 215 216 /** 217 * The text to be matched. 218 */ 219 UnicodeString pattern; 220 221 /** 222 * Context object that maps stand-ins to matcher and replacer 223 * objects. 224 */ 225 const TransliterationRuleData* data; 226 227 /** 228 * The segment number, 1-based, or 0 if not a segment. 229 */ 230 int32_t segmentNumber; 231 232 /** 233 * Start offset, in the match text, of the <em>rightmost</em> 234 * match. 235 */ 236 int32_t matchStart; 237 238 /** 239 * Limit offset, in the match text, of the <em>rightmost</em> 240 * match. 241 */ 242 int32_t matchLimit; 243 244}; 245 246U_NAMESPACE_END 247 248#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 249 250#endif 251