1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 ******************************************************************************
5 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
6 * others. All Rights Reserved.                                               *
7 ******************************************************************************
8 */
9
10/*
11 * This is a port of the C++ class UConverterSelector.
12 *
13 * Methods related to serialization are not ported in this version. In addition,
14 * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
15 * in Java.
16 *
17 * @author Shaopeng Jia
18 */
19
20package com.ibm.icu.charset;
21
22import java.nio.charset.Charset;
23import java.nio.charset.IllegalCharsetNameException;
24import java.nio.charset.UnsupportedCharsetException;
25import java.util.ArrayList;
26import java.util.List;
27
28import com.ibm.icu.impl.IntTrie;
29import com.ibm.icu.impl.PropsVectors;
30import com.ibm.icu.text.UTF16;
31import com.ibm.icu.text.UnicodeSet;
32
33/**
34 * Charset Selector
35 *
36 * A charset selector is built with a list of charset names and given an input
37 * CharSequence returns the list of names the corresponding charsets which can
38 * convert the CharSequence.
39 *
40 * @stable ICU 4.2
41 */
42public final class CharsetSelector {
43    private IntTrie trie;
44    private int[] pv; // table of bits
45    private String[] encodings; // encodings users ask to use
46
47    private void generateSelectorData(PropsVectors pvec,
48            UnicodeSet excludedCodePoints, int mappingTypes) {
49        int columns = (encodings.length + 31) / 32;
50
51        // set errorValue to all-ones
52        for (int col = 0; col < columns; ++col) {
53            pvec.setValue(PropsVectors.ERROR_VALUE_CP,
54                    PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
55        }
56
57        for (int i = 0; i < encodings.length; ++i) {
58            Charset testCharset = CharsetICU.forNameICU(encodings[i]);
59            UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
60            ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
61                    mappingTypes);
62            int column = i / 32;
63            int mask = 1 << (i % 32);
64            // now iterate over intervals on set i
65            int itemCount = unicodePointSet.getRangeCount();
66            for (int j = 0; j < itemCount; ++j) {
67                int startChar = unicodePointSet.getRangeStart(j);
68                int endChar = unicodePointSet.getRangeEnd(j);
69                pvec.setValue(startChar, endChar, column, ~0, mask);
70            }
71        }
72
73        // handle excluded encodings
74        // Simply set their values to all 1's in the pvec
75        if (!excludedCodePoints.isEmpty()) {
76            int itemCount = excludedCodePoints.getRangeCount();
77            for (int j = 0; j < itemCount; ++j) {
78                int startChar = excludedCodePoints.getRangeStart(j);
79                int endChar = excludedCodePoints.getRangeEnd(j);
80                for (int col = 0; col < columns; col++) {
81                    pvec.setValue(startChar, endChar, col, ~0, ~0);
82                }
83            }
84        }
85
86        trie = pvec.compactToTrieWithRowIndexes();
87        pv = pvec.getCompactedArray();
88    }
89
90    // internal function to intersect two sets of masks
91    // returns whether the mask has reduced to all zeros. The
92    // second set of mask consists of len elements in pv starting from
93    // pvIndex
94    private boolean intersectMasks(int[] dest, int pvIndex, int len) {
95        int oredDest = 0;
96        for (int i = 0; i < len; ++i) {
97            oredDest |= (dest[i] &= pv[pvIndex + i]);
98        }
99        return oredDest == 0;
100    }
101
102    // internal function
103    private List<String> selectForMask(int[] mask) {
104        // this is the context we will use. Store a table of indices to which
105        // encodings are legit
106
107        List<String> result = new ArrayList<String>();
108        int columns = (encodings.length + 31) / 32;
109        int numOnes = countOnes(mask, columns);
110
111        // now we know the exact space we need to index
112        if (numOnes > 0) {
113            int k = 0;
114            for (int j = 0; j < columns; j++) {
115                int v = mask[j];
116                for (int i = 0; i < 32 && k < encodings.length; i++, k++) {
117                    if ((v & 1) != 0) {
118                        result.add(encodings[k]);
119                    }
120                    v >>= 1;
121                }
122            }
123        }
124
125        // otherwise, index will remain NULL
126        return result;
127    }
128
129    // internal function to count how many 1's are there in a mask
130    // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
131    private int countOnes(int[] mask, int len) {
132        int totalOnes = 0;
133        for (int i = 0; i < len; ++i) {
134            int ent = mask[i];
135            for (; ent != 0; totalOnes++) {
136                ent &= ent - 1; // clear the least significant bit set
137            }
138        }
139        return totalOnes;
140    }
141
142    /**
143     * Construct a CharsetSelector from a list of charset names.
144     *
145     * @param charsetList
146     *            a list of charset names in the form of strings. If charsetList
147     *            is empty, a selector for all available charset is constructed.
148     * @param excludedCodePoints
149     *            a set of code points to be excluded from consideration.
150     *            Excluded code points appearing in the input CharSequence do
151     *            not change the selection result. It could be empty when no
152     *            code point should be excluded.
153     * @param mappingTypes
154     *            an int which determines whether to consider only roundtrip
155     *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
156     *            CharsetICU.java for the constants that are currently
157     *            supported.
158     * @throws IllegalArgumentException
159     *             if the parameters is invalid.
160     * @throws IllegalCharsetNameException
161     *             If the given charset name is illegal.
162     * @throws UnsupportedCharsetException
163     *             If no support for the named charset is available in this
164     *             instance of the Java virtual machine.
165     * @stable ICU 4.2
166     */
167    public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
168            int mappingTypes) {
169        if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
170                && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
171            throw new IllegalArgumentException("Unsupported mappingTypes");
172        }
173
174        int encodingCount = charsetList.size();
175        if (encodingCount > 0) {
176            encodings = charsetList.toArray(new String[0]);
177        } else {
178            encodings = CharsetProviderICU.getAvailableNames();
179            encodingCount = encodings.length;
180        }
181
182        PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);
183        generateSelectorData(pvec, excludedCodePoints, mappingTypes);
184    }
185
186    /**
187     * Select charsets that can map all characters in a CharSequence, ignoring
188     * the excluded code points.
189     *
190     * @param unicodeText
191     *            a CharSequence. It could be empty.
192     * @return a list that contains charset names in the form of strings. The
193     *         returned encoding names and their order will be the same as
194     *         supplied when building the selector.
195     *
196     * @stable ICU 4.2
197     */
198    public List<String> selectForString(CharSequence unicodeText) {
199        int columns = (encodings.length + 31) / 32;
200        int[] mask = new int[columns];
201        for (int i = 0; i < columns; i++) {
202            mask[i] = - 1; // set each bit to 1
203                           // Note: All integers are signed in Java, assigning
204                           // 2 ^ 32 -1 to mask is wrong!
205        }
206        int index = 0;
207        while (index < unicodeText.length()) {
208            int c = UTF16.charAt(unicodeText, index);
209            int pvIndex = trie.getCodePointValue(c);
210            index += UTF16.getCharCount(c);
211            if (intersectMasks(mask, pvIndex, columns)) {
212                break;
213            }
214        }
215        return selectForMask(mask);
216    }
217}
218