/* *************************************************************************** * Copyright (C) 2008-2014, Google, International Business Machines Corporation * and others. All Rights Reserved. *************************************************************************** */ package com.ibm.icu.text; import java.util.BitSet; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.TreeSet; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.SpoofChecker.RestrictionLevel; /** * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile * then setIdentifier. Available methods include: *
 * <ol>
 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
 * each of these.</li>
 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
 * either Katakana or Hiragana.</li>
 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.</li>
 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
 * the identifier.</li>
 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.</li>
 * </ol>
*
 * @author markdavis
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public class IdentifierInfo {

    private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();

    /** The identifier most recently passed to {@link #setIdentifier(String)}. */
    private String identifier;
    /** Scripts contributed by characters whose script set (minus Common/Inherited) is a single script. */
    private final BitSet requiredScripts = new BitSet();
    /** Script sets of characters that may belong to more than one script ("alternates"). */
    private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
    /** Intersection of the alternates that survive the final pass of setIdentifier(). */
    private final BitSet commonAmongAlternates = new BitSet();
    /** One representative (zero) character per decimal number system seen in the identifier. */
    private final UnicodeSet numerics = new UnicodeSet();
    /** Characters allowed in an identifier; defaults to all code points. */
    private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);

    /**
     * Create an identifier info object. Subsequently, call {@link #setIdentifierProfile(UnicodeSet)}
     * and {@link #setIdentifier(String)}, then the getters.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public IdentifierInfo() {
        super();
    }

    /**
     * Reset all per-identifier analysis state. The identifier profile is left untouched.
     *
     * @return self
     */
    private IdentifierInfo clear() {
        requiredScripts.clear();
        scriptSetSet.clear();
        numerics.clear();
        commonAmongAlternates.clear();
        return this;
    }

    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return self
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
        this.identifierProfile.set(identifierProfile);
        return this;
    }

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return a copy of the set of characters that are to be allowed in the identifier.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UnicodeSet getIdentifierProfile() {
        return new UnicodeSet(identifierProfile);
    }

    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @return self
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public IdentifierInfo setIdentifier(String identifier) {
        this.identifier = identifier;
        clear();
        BitSet scriptsForCP = new BitSet();
        int cp;
        // Step by the width of the code point just read (not of the index), so a supplementary
        // character is consumed as a single unit and its low surrogate is never analyzed alone.
        for (int i = 0; i < identifier.length(); i += Character.charCount(cp)) {
            cp = Character.codePointAt(identifier, i);
            // Store a representative character for each kind of decimal digit
            if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
                // Just store the zero character as a representative for comparison.
                // Unicode guarantees it is cp - value
                numerics.add(cp - UCharacter.getNumericValue(cp));
            }
            // NOTE(review): scriptsForCP is reused across iterations; this relies on
            // getScriptExtensions overwriting the set rather than accumulating into it - confirm.
            UScript.getScriptExtensions(cp, scriptsForCP);
            scriptsForCP.clear(UScript.COMMON);
            scriptsForCP.clear(UScript.INHERITED);
            switch (scriptsForCP.cardinality()) {
            case 0:
                break;
            case 1:
                // Single script, record it.
                requiredScripts.or(scriptsForCP);
                break;
            default:
                // Multiple candidate scripts: remember the set unless a required script already
                // covers it. Once stored, the set must not be mutated, so allocate a fresh one.
                if (!requiredScripts.intersects(scriptsForCP) && scriptSetSet.add(scriptsForCP)) {
                    scriptsForCP = new BitSet();
                }
                break;
            }
        }
        // Now make a final pass through to remove alternates that came before singles.
        // [Kana], [Kana Hira] => [Kana]
        // This is relatively infrequent, so doesn't have to be optimized.
        // We also compute any commonalities among the alternates.
        if (scriptSetSet.size() > 0) {
            commonAmongAlternates.set(0, UScript.CODE_LIMIT);
            for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext();) {
                final BitSet next = it.next();
                // [Kana], [Kana Hira] => [Kana]
                if (requiredScripts.intersects(next)) {
                    it.remove();
                } else {
                    // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                    commonAmongAlternates.and(next); // get the intersection.
                    for (BitSet other : scriptSetSet) {
                        if (next != other && contains(next, other)) {
                            // The inner iteration is abandoned right after the removal.
                            it.remove();
                            break;
                        }
                    }
                }
            }
        }
        if (scriptSetSet.size() == 0) {
            commonAmongAlternates.clear();
        }
        return this;
    }

    /**
     * Get the identifier that was analyzed.
     *
     * @return the identifier that was analyzed.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public String getIdentifier() {
        return identifier;
    }

    /**
     * Get the scripts found in the identifiers.
     *
     * @return a copy of the set of explicit scripts.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public BitSet getScripts() {
        return (BitSet) requiredScripts.clone();
    }

    /**
     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a new set holding a copy of each alternate script set.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public Set<BitSet> getAlternates() {
        Set<BitSet> result = new HashSet<BitSet>();
        for (BitSet item : scriptSetSet) {
            result.add((BitSet) item.clone());
        }
        return result;
    }

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return a copy of the set of representative zero characters.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public UnicodeSet getNumerics() {
        return new UnicodeSet(numerics);
    }

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return a copy of the set of scripts that are in common among the alternates.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public BitSet getCommonAmongAlternates() {
        return (BitSet) commonAmongAlternates.clone();
    }

    // BitSet doesn't support "contains(...)", so we have inverted constants
    // They are private; they can't be made immutable in Java.
    private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
            UScript.KATAKANA);
    private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
    private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
    private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
            UScript.CHEROKEE);

    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     *
     * @return the restriction level.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RestrictionLevel getRestrictionLevel() {
        if (!identifierProfile.containsAll(identifier) || getNumerics().size() > 1) {
            return RestrictionLevel.UNRESTRICTIVE;
        }
        if (ASCII.containsAll(identifier)) {
            return RestrictionLevel.ASCII;
        }
        // This is a bit tricky. We look at a number of factors.
        // The number of scripts in the text.
        // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
        // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
        final int cardinalityPlus = countScriptsWithAlternates();
        if (cardinalityPlus < 2) {
            return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
        }
        if (containsWithAlternates(JAPANESE, requiredScripts) || containsWithAlternates(CHINESE, requiredScripts)
                || containsWithAlternates(KOREAN, requiredScripts)) {
            return RestrictionLevel.HIGHLY_RESTRICTIVE;
        }
        if (cardinalityPlus == 2 && requiredScripts.get(UScript.LATIN)
                && !requiredScripts.intersects(CONFUSABLE_WITH_LATIN)) {
            return RestrictionLevel.MODERATELY_RESTRICTIVE;
        }
        return RestrictionLevel.MINIMALLY_RESTRICTIVE;
    }

    /**
     * Get the number of scripts appearing in the identifier.
     *   Note: Common and Inherited scripts are omitted from the count.
     *   Note: If the identifier contains characters with alternate scripts
     *         (the character is used with more than one script), minimize
     *         the reported number of scripts by considering the character
     *         to be of a script that already appears elsewhere in the identifier
     *         when possible.
     *         The alternate script computation may not be perfect. The distinction
     *         between 0, 1 and > 1 scripts will be valid, however.
     * @return the number of scripts.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public int getScriptCount() {
        return countScriptsWithAlternates();
    }

    /**
     * Count the required scripts, plus 1 if the alternates share a common script, or plus the
     * number of alternates otherwise.
     * Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in requiredScripts.
     *
     * @return the script count used by getScriptCount() and getRestrictionLevel()
     */
    private int countScriptsWithAlternates() {
        return requiredScripts.cardinality()
                + (commonAmongAlternates.cardinality() == 0 ? scriptSetSet.size() : 1);
    }

    /**
     * See Object.toString()
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    @Override
    public String toString() {
        return identifier + ", " + identifierProfile.toPattern(false) + ", " + getRestrictionLevel() + ", "
                + displayScripts(requiredScripts) + ", " + displayAlternates(scriptSetSet) + ", "
                + numerics.toPattern(false);
    }

    /**
     * Test whether container holds every script in containee and also shares at least one script
     * with each of the recorded alternates.
     *
     * @param container the candidate script set (e.g. JAPANESE)
     * @param containee the scripts that must all be in container
     * @return true if both conditions hold
     */
    private boolean containsWithAlternates(BitSet container, BitSet containee) {
        if (!contains(container, containee)) {
            return false;
        }
        for (BitSet alternatives : scriptSetSet) {
            if (!container.intersects(alternatives)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Produce a readable string of alternates.
     *
     * @param alternates a set of BitSets of script values.
     * @return display form
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static String displayAlternates(Set<BitSet> alternates) {
        if (alternates.size() == 0) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        // for consistent results
        Set<BitSet> sorted = new TreeSet<BitSet>(BITSET_COMPARATOR);
        sorted.addAll(alternates);
        for (BitSet item : sorted) {
            if (result.length() != 0) {
                result.append("; ");
            }
            result.append(displayScripts(item));
        }
        return result.toString();
    }

    /**
     * Order BitSets, first by shortest, then by items.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final Comparator<BitSet> BITSET_COMPARATOR = new Comparator<BitSet>() {

        public int compare(BitSet arg0, BitSet arg1) {
            int diff = arg0.cardinality() - arg1.cardinality();
            if (diff != 0) {
                return diff;
            }
            int i0 = arg0.nextSetBit(0);
            int i1 = arg1.nextSetBit(0);
            // Bit 0 is a legitimate member, so keep walking while the index is >= 0; nextSetBit
            // yields -1 only once a set is exhausted. Stopping at index 0 would make distinct
            // sets that both contain bit 0 compare as equal.
            while ((diff = i0 - i1) == 0 && i0 >= 0) {
                i0 = arg0.nextSetBit(i0 + 1);
                i1 = arg1.nextSetBit(i1 + 1);
            }
            return diff;
        }

    };

    /**
     * Produce a readable string of a set of scripts
     *
     * @param scripts a BitSet of UScript values
     * @return a readable string of a set of scripts
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static String displayScripts(BitSet scripts) {
        StringBuilder result = new StringBuilder();
        for (int i = scripts.nextSetBit(0); i >= 0; i = scripts.nextSetBit(i + 1)) {
            if (result.length() != 0) {
                result.append(' ');
            }
            result.append(UScript.getShortName(i));
        }
        return result.toString();
    }

    /**
     * Parse a text list of scripts into a BitSet.
     *
     * @param scriptsString the string to be parsed; items are separated by whitespace,
     *            optionally preceded by a comma
     * @return BitSet of UScript values.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static BitSet parseScripts(String scriptsString) {
        BitSet result = new BitSet();
        for (String item : scriptsString.trim().split(",?\\s+")) {
            if (item.length() != 0) {
                result.set(UScript.getCodeFromName(item));
            }
        }
        return result;
    }

    /**
     * Parse a list of alternates into a set of sets of UScript values.
     *
     * @param scriptsSetString a list of alternates, separated by ;
     * @return a set of BitSets of UScript values
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static Set<BitSet> parseAlternates(String scriptsSetString) {
        Set<BitSet> result = new HashSet<BitSet>();
        for (String item : scriptsSetString.trim().split("\\s*;\\s*")) {
            if (item.length() != 0) {
                result.add(parseScripts(item));
            }
        }
        return result;
    }

    /**
     * Test containment. Should be a method on BitSet.
     *
     * @param container possible container to be tested
     * @param containee possible containee to be tested
     * @return true if container contains containee
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final boolean contains(BitSet container, BitSet containee) {
        for (int i = containee.nextSetBit(0); i >= 0; i = containee.nextSetBit(i + 1)) {
            if (!container.get(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Sets a number of values at once. Should be on BitSet.
     *
     * @param bitset bitset to be affected
     * @param values values to be set in the bitset
     * @return modified bitset.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final BitSet set(BitSet bitset, int... values) {
        for (int value : values) {
            bitset.set(value);
        }
        return bitset;
    }
}