1/*
2**********************************************************************
3*   Copyright (C) 2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
* identifier_info.h
8*
9* created on: 2013 Jan 7
10* created by: Andy Heninger
11*/
12
13#ifndef __IDENTIFIER_INFO_H__
14#define __IDENTIFIER_INFO_H__
15
16#include "unicode/utypes.h"
17
18#include "unicode/uniset.h"
19#include "unicode/uspoof.h"
20#include "uhash.h"
21
22U_NAMESPACE_BEGIN
23
24class ScriptSet;
25
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
27
28/**
29 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
30 * then setIdentifier. Available methods include:
31 * <ol>
32 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
33 * each of these.
34 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
35 * either Katakana or Hiragana.
36 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
37 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
38 * the identifier.
39 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
40 * </ol>
41 *
42 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
43 */
44class U_I18N_API IdentifierInfo : public UMemory {
45
46  public:
47    /**
48     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
49     * @internal
50     */
51    IdentifierInfo(UErrorCode &status);
52
53    /**
54      * Destructor
55      */
56    virtual ~IdentifierInfo();
57
58  private:
59    /* Disallow copying for now. Can be added if there's a need. */
60    IdentifierInfo(const IdentifierInfo &other);
61
62  public:
63
64    /**
65     * Set the identifier profile: the characters that are to be allowed in the identifier.
66     *
67     * @param identifierProfile the characters that are to be allowed in the identifier
68     * @return this
69     * @internal
70     */
71    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
72
73    /**
74     * Get the identifier profile: the characters that are to be allowed in the identifier.
75     *
76     * @return The characters that are to be allowed in the identifier.
77     * @internal
78     */
79    const UnicodeSet &getIdentifierProfile() const;
80
81
82    /**
83     * Set an identifier to analyze. Afterwards, call methods like getScripts()
84     *
85     * @param identifier the identifier to analyze
86     * @param status Errorcode, set if errors occur.
87     * @return this
88     * @internal
89     */
90    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
91
92
93    /**
94     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
95     * and must not be deleted by the caller.
96     *
97     * @return the identifier that was analyzed.
98     * @internal
99     */
100    const UnicodeString *getIdentifier() const;
101
102
103    /**
104     * Get the scripts found in the identifiers.
105     *
106     * @return the set of explicit scripts.
107     * @internal
108     */
109    const ScriptSet *getScripts() const;
110
111    /**
112     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
113     * the set consisting of those scripts will be returned.
114     *
115     * @return a uhash, with each key being of type (ScriptSet *).
116     *         This is a set, not a map, so the value stored in the uhash is not relevant.
117     *         (It is, in fact, 1).
118     *         Ownership of the uhash and its contents remains with the IndetifierInfo object,
119     *         and remains valid until a new identifer is set or until the object is deleted.
120     * @internal
121     */
122    const UHashtable *getAlternates() const;
123
124    /**
125     * Get the representative characters (zeros) for the numerics found in the identifier.
126     *
127     * @return the set of explicit scripts.
128     * @internal
129     */
130    const UnicodeSet *getNumerics() const;
131
132    /**
133     * Find out which scripts are in common among the alternates.
134     *
135     * @return the set of scripts that are in common among the alternates.
136     * @internal
137     */
138    const ScriptSet *getCommonAmongAlternates() const;
139
140    /**
141      * Get the number of scripts appearing in the identifier.
142      *   Note: Common and Inherited scripts are omitted from the count.
143      *   Note: Result may be high when the identifier contains characters
144      *         with alternate scripts. The distinction between
145      *         0, 1 and > 1 will remain valid, however.
146      * @return the number of scripts.
147      */
148    int32_t getScriptCount() const;
149
150#if !UCONFIG_NO_NORMALIZATION
151
152    /**
153     * Find the "tightest" restriction level that the identifier satisfies.
154     *
155     * @return the restriction level.
156     * @internal
157     */
158    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
159
160#endif /*!UCONFIG_NO_NORMALIZATION */
161
162    UnicodeString toString() const;
163
164    /**
165     * Produce a readable string of alternates.
166     *
167     * @param alternates a UHashtable of UScriptSets.
168     *        Keys only, no meaningful values in the UHash.
169     * @return display form
170     * @internal
171     */
172    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
173
174  private:
175
176    IdentifierInfo  & clear();
177    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
178
179    UnicodeString     *fIdentifier;
180    ScriptSet         *fRequiredScripts;
181    UHashtable        *fScriptSetSet;
182    ScriptSet         *fCommonAmongAlternates;
183    UnicodeSet        *fNumerics;
184    UnicodeSet        *fIdentifierProfile;
185};
186
187U_NAMESPACE_END
188
189#endif // __IDENTIFIER_INFO_H__
190
191