1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ****************************************************************************** 5 * Copyright (C) 1996-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10/* 11 * This is a port of the C++ class UConverterSelector. 12 * 13 * Methods related to serialization are not ported in this version. In addition, 14 * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used 15 * in Java. 16 * 17 * @author Shaopeng Jia 18 */ 19 20package com.ibm.icu.charset; 21 22import java.nio.charset.Charset; 23import java.nio.charset.IllegalCharsetNameException; 24import java.nio.charset.UnsupportedCharsetException; 25import java.util.ArrayList; 26import java.util.List; 27 28import com.ibm.icu.impl.IntTrie; 29import com.ibm.icu.impl.PropsVectors; 30import com.ibm.icu.text.UTF16; 31import com.ibm.icu.text.UnicodeSet; 32 33/** 34 * Charset Selector 35 * 36 * A charset selector is built with a list of charset names and given an input 37 * CharSequence returns the list of names the corresponding charsets which can 38 * convert the CharSequence. 39 * 40 * @stable ICU 4.2 41 */ 42public final class CharsetSelector { 43 private IntTrie trie; 44 private int[] pv; // table of bits 45 private String[] encodings; // encodings users ask to use 46 47 private void generateSelectorData(PropsVectors pvec, 48 UnicodeSet excludedCodePoints, int mappingTypes) { 49 int columns = (encodings.length + 31) / 32; 50 51 // set errorValue to all-ones 52 for (int col = 0; col < columns; ++col) { 53 pvec.setValue(PropsVectors.ERROR_VALUE_CP, 54 PropsVectors.ERROR_VALUE_CP, col, ~0, ~0); 55 } 56 57 for (int i = 0; i < encodings.length; ++i) { 58 Charset testCharset = CharsetICU.forNameICU(encodings[i]); 59 UnicodeSet unicodePointSet = new UnicodeSet(); // empty set 60 ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet, 61 mappingTypes); 62 int column = i / 32; 63 int mask = 1 << (i % 32); 64 // now iterate over intervals on set i 65 int itemCount = unicodePointSet.getRangeCount(); 66 for (int j = 0; j < itemCount; ++j) { 67 int startChar = unicodePointSet.getRangeStart(j); 68 int endChar = unicodePointSet.getRangeEnd(j); 69 pvec.setValue(startChar, endChar, column, ~0, mask); 70 } 71 } 72 73 // handle excluded encodings 74 // Simply set their values to all 1's in the pvec 75 if (!excludedCodePoints.isEmpty()) { 76 int itemCount = excludedCodePoints.getRangeCount(); 77 for (int j = 0; j < itemCount; ++j) { 78 int startChar = excludedCodePoints.getRangeStart(j); 79 int endChar = excludedCodePoints.getRangeEnd(j); 80 for (int col = 0; col < columns; col++) { 81 pvec.setValue(startChar, endChar, col, ~0, ~0); 82 } 83 } 84 } 85 86 trie = pvec.compactToTrieWithRowIndexes(); 87 pv = pvec.getCompactedArray(); 88 } 89 90 // internal function to intersect two sets of masks 91 // returns whether the mask has reduced to all zeros. The 92 // second set of mask consists of len elements in pv starting from 93 // pvIndex 94 private boolean intersectMasks(int[] dest, int pvIndex, int len) { 95 int oredDest = 0; 96 for (int i = 0; i < len; ++i) { 97 oredDest |= (dest[i] &= pv[pvIndex + i]); 98 } 99 return oredDest == 0; 100 } 101 102 // internal function 103 private List<String> selectForMask(int[] mask) { 104 // this is the context we will use. Store a table of indices to which 105 // encodings are legit 106 107 List<String> result = new ArrayList<String>(); 108 int columns = (encodings.length + 31) / 32; 109 int numOnes = countOnes(mask, columns); 110 111 // now we know the exact space we need to index 112 if (numOnes > 0) { 113 int k = 0; 114 for (int j = 0; j < columns; j++) { 115 int v = mask[j]; 116 for (int i = 0; i < 32 && k < encodings.length; i++, k++) { 117 if ((v & 1) != 0) { 118 result.add(encodings[k]); 119 } 120 v >>= 1; 121 } 122 } 123 } 124 125 // otherwise, index will remain NULL 126 return result; 127 } 128 129 // internal function to count how many 1's are there in a mask 130 // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html 131 private int countOnes(int[] mask, int len) { 132 int totalOnes = 0; 133 for (int i = 0; i < len; ++i) { 134 int ent = mask[i]; 135 for (; ent != 0; totalOnes++) { 136 ent &= ent - 1; // clear the least significant bit set 137 } 138 } 139 return totalOnes; 140 } 141 142 /** 143 * Construct a CharsetSelector from a list of charset names. 144 * 145 * @param charsetList 146 * a list of charset names in the form of strings. If charsetList 147 * is empty, a selector for all available charset is constructed. 148 * @param excludedCodePoints 149 * a set of code points to be excluded from consideration. 150 * Excluded code points appearing in the input CharSequence do 151 * not change the selection result. It could be empty when no 152 * code point should be excluded. 153 * @param mappingTypes 154 * an int which determines whether to consider only roundtrip 155 * mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See 156 * CharsetICU.java for the constants that are currently 157 * supported. 158 * @throws IllegalArgumentException 159 * if the parameters is invalid. 160 * @throws IllegalCharsetNameException 161 * If the given charset name is illegal. 162 * @throws UnsupportedCharsetException 163 * If no support for the named charset is available in this 164 * instance of the Java virtual machine. 165 * @stable ICU 4.2 166 */ 167 public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints, 168 int mappingTypes) { 169 if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET 170 && mappingTypes != CharsetICU.ROUNDTRIP_SET) { 171 throw new IllegalArgumentException("Unsupported mappingTypes"); 172 } 173 174 int encodingCount = charsetList.size(); 175 if (encodingCount > 0) { 176 encodings = charsetList.toArray(new String[0]); 177 } else { 178 encodings = CharsetProviderICU.getAvailableNames(); 179 encodingCount = encodings.length; 180 } 181 182 PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32); 183 generateSelectorData(pvec, excludedCodePoints, mappingTypes); 184 } 185 186 /** 187 * Select charsets that can map all characters in a CharSequence, ignoring 188 * the excluded code points. 189 * 190 * @param unicodeText 191 * a CharSequence. It could be empty. 192 * @return a list that contains charset names in the form of strings. The 193 * returned encoding names and their order will be the same as 194 * supplied when building the selector. 195 * 196 * @stable ICU 4.2 197 */ 198 public List<String> selectForString(CharSequence unicodeText) { 199 int columns = (encodings.length + 31) / 32; 200 int[] mask = new int[columns]; 201 for (int i = 0; i < columns; i++) { 202 mask[i] = - 1; // set each bit to 1 203 // Note: All integers are signed in Java, assigning 204 // 2 ^ 32 -1 to mask is wrong! 205 } 206 int index = 0; 207 while (index < unicodeText.length()) { 208 int c = UTF16.charAt(unicodeText, index); 209 int pvIndex = trie.getCodePointValue(c); 210 index += UTF16.getCharCount(c); 211 if (intersectMasks(mask, pvIndex, columns)) { 212 break; 213 } 214 } 215 return selectForMask(mask); 216 } 217} 218