/*
**********************************************************************
*   Copyright (C) 2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* identifier_info.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/

136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef __IDENTIFIER_INFO_H__
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define __IDENTIFIER_INFO_H__
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h"
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uspoof.h"
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uhash.h"
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass ScriptSet;
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
// TODO(andy): review consistency of reference vs pointer arguments to the functions.

286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/**
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then setIdentifier. Available methods include:
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <ol>
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * each of these.
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * either Katakana or Hiragana.
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the identifier.
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * </ol>
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org *
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass U_I18N_API IdentifierInfo : public UMemory {
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org  public:
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IdentifierInfo(UErrorCode &status);
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      * Destructor
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      */
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    virtual ~IdentifierInfo();
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org  private:
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /* Disallow copying for now. Can be added if there's a need. */
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IdentifierInfo(const IdentifierInfo &other);
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org  public:
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Set the identifier profile: the characters that are to be allowed in the identifier.
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @param identifierProfile the characters that are to be allowed in the identifier
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return this
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Get the identifier profile: the characters that are to be allowed in the identifier.
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return The characters that are to be allowed in the identifier.
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UnicodeSet &getIdentifierProfile() const;
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Set an identifier to analyze. Afterwards, call methods like getScripts()
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @param identifier the identifier to analyze
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @param status Errorcode, set if errors occur.
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return this
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * and must not be deleted by the caller.
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return the identifier that was analyzed.
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UnicodeString *getIdentifier() const;
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Get the scripts found in the identifiers.
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return the set of explicit scripts.
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const ScriptSet *getScripts() const;
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * the set consisting of those scripts will be returned.
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return a uhash, with each key being of type (ScriptSet *).
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *         This is a set, not a map, so the value stored in the uhash is not relevant.
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *         (It is, in fact, 1).
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *         Ownership of the uhash and its contents remains with the IndetifierInfo object,
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *         and remains valid until a new identifer is set or until the object is deleted.
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UHashtable *getAlternates() const;
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Get the representative characters (zeros) for the numerics found in the identifier.
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return the set of explicit scripts.
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UnicodeSet *getNumerics() const;
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Find out which scripts are in common among the alternates.
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return the set of scripts that are in common among the alternates.
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const ScriptSet *getCommonAmongAlternates() const;
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      * Get the number of scripts appearing in the identifier.
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      *   Note: Common and Inherited scripts are omitted from the count.
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      *   Note: Result may be high when the identifier contains characters
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      *         with alternate scripts. The distinction between
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      *         0, 1 and > 1 will remain valid, however.
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      * @return the number of scripts.
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org      */
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t getScriptCount() const;
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Find the "tightest" restriction level that the identifier satisfies.
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return the restriction level.
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /*!UCONFIG_NO_NORMALIZATION */
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString toString() const;
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Produce a readable string of alternates.
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @param alternates a UHashtable of UScriptSets.
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     *        Keys only, no meaningful values in the UHash.
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @return display form
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Static memory cleanup function.
1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * @internal
1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static UBool      cleanup();
1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org  private:
1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    IdentifierInfo  & clear();
1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString     *fIdentifier;
1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ScriptSet         *fRequiredScripts;
1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UHashtable        *fScriptSetSet;
1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ScriptSet         *fCommonAmongAlternates;
1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeSet        *fNumerics;
1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeSet        *fIdentifierProfile;
1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static UnicodeSet *ASCII;
1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static ScriptSet  *JAPANESE;
1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static ScriptSet  *CHINESE;
1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static ScriptSet  *KOREAN;
1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static ScriptSet  *CONFUSABLE_WITH_LATIN;
1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END
2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif // __IDENTIFIER_INFO_H__
2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
205