/*
**********************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* identifier_info.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/

#ifndef __IDENTIFIER_INFO_H__
#define __IDENTIFIER_INFO_H__

#include "unicode/utypes.h"

#include "unicode/uniset.h"
#include "unicode/uspoof.h"
#include "uhash.h"

U_NAMESPACE_BEGIN

class ScriptSet;

266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// TODO(andy): review consistency of reference vs pointer arguments to the funcions. 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then setIdentifier. Available methods include: 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <ol> 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * each of these. 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * either Katakana or Hiragana. 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the identifier. 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getRestrictionLevel to see what the UTS36 restriction level is. 
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * </ol> 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass U_I18N_API IdentifierInfo : public UMemory { 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org public: 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Create an identifier info object. Subsequently, call setIdentifier(), etc. 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IdentifierInfo(UErrorCode &status); 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Destructor 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org virtual ~IdentifierInfo(); 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org private: 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* Disallow copying for now. Can be added if there's a need. 
*/ 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IdentifierInfo(const IdentifierInfo &other); 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org public: 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Set the identifier profile: the characters that are to be allowed in the identifier. 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param identifierProfile the characters that are to be allowed in the identifier 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return this 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the identifier profile: the characters that are to be allowed in the identifier. 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The characters that are to be allowed in the identifier. 
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeSet &getIdentifierProfile() const; 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Set an identifier to analyze. Afterwards, call methods like getScripts() 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param identifier the identifier to analyze 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param status Errorcode, set if errors occur. 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return this 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the identifier that was analyzed. The returned string is owned by the ICU library, 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and must not be deleted by the caller. 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the identifier that was analyzed. 
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeString *getIdentifier() const; 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the scripts found in the identifiers. 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the set of explicit scripts. 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ScriptSet *getScripts() const; 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the set consisting of those scripts will be returned. 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return a uhash, with each key being of type (ScriptSet *). 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This is a set, not a map, so the value stored in the uhash is not relevant. 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (It is, in fact, 1). 
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Ownership of the uhash and its contents remains with the IndetifierInfo object, 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and remains valid until a new identifer is set or until the object is deleted. 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UHashtable *getAlternates() const; 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the representative characters (zeros) for the numerics found in the identifier. 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the set of explicit scripts. 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const UnicodeSet *getNumerics() const; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Find out which scripts are in common among the alternates. 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the set of scripts that are in common among the alternates. 
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const ScriptSet *getCommonAmongAlternates() const; 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Get the number of scripts appearing in the identifier. 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Note: Common and Inherited scripts are omitted from the count. 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Note: Result may be high when the identifier contains characters 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * with alternate scripts. The distinction between 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 0, 1 and > 1 will remain valid, however. 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the number of scripts. 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t getScriptCount() const; 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Find the "tightest" restriction level that the identifier satisfies. 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the restriction level. 
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org URestrictionLevel getRestrictionLevel(UErrorCode &status) const; 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /*!UCONFIG_NO_NORMALIZATION */ 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString toString() const; 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Produce a readable string of alternates. 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param alternates a UHashtable of UScriptSets. 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Keys only, no meaningful values in the UHash. 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return display form 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Static memory cleanup function. 
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @internal 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static UBool cleanup(); 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org private: 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org IdentifierInfo & clear(); 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString *fIdentifier; 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ScriptSet *fRequiredScripts; 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UHashtable *fScriptSetSet; 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ScriptSet *fCommonAmongAlternates; 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet *fNumerics; 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet *fIdentifierProfile; 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static UnicodeSet *ASCII; 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static ScriptSet *JAPANESE; 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static ScriptSet *CHINESE; 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static ScriptSet *KOREAN; 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static ScriptSet *CONFUSABLE_WITH_LATIN; 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 

U_NAMESPACE_END

#endif // __IDENTIFIER_INFO_H__
