icu/charset/CharsetSelector.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

/*
 * This is a port of the C++ class UConverterSelector.
 *
 * Methods related to serialization are not ported in this version. In addition,
 * the selectForUTF8 method is not going to be ported, as UTF8 is seldom used
 * in Java.
 *
 * @author Shaopeng Jia
 */

package com.ibm.icu.charset;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;

import com.ibm.icu.impl.IntTrie;
import com.ibm.icu.impl.PropsVectors;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Charset Selector
 *
 * A charset selector is built with a list of charset names and given an input
 * CharSequence returns the list of names the corresponding charsets which can
 * convert the CharSequence.
 *
 * @stable ICU 4.2
 */
public final class CharsetSelector {
    private IntTrie trie;
    private int[] pv; // table of bits
    private String[] encodings; // encodings users ask to use

    private void generateSelectorData(PropsVectors pvec,
            UnicodeSet excludedCodePoints, int mappingTypes) {
        int columns = (encodings.length + 31) / 32;

        // set errorValue to all-ones
        for (int col = 0; col < columns; ++col) {
            pvec.setValue(PropsVectors.ERROR_VALUE_CP,
                    PropsVectors.ERROR_VALUE_CP, col, ~0, ~0);
        }

        for (int i = 0; i < encodings.length; ++i) {
            Charset testCharset = CharsetICU.forNameICU(encodings[i]);
            UnicodeSet unicodePointSet = new UnicodeSet(); // empty set
            ((CharsetICU) testCharset).getUnicodeSet(unicodePointSet,
                    mappingTypes);
            int column = i / 32;
            int mask = 1 << (i % 32);
            // now iterate over intervals on set i
            int itemCount = unicodePointSet.getRangeCount();
            for (int j = 0; j < itemCount; ++j) {
                int startChar = unicodePointSet.getRangeStart(j);
                int endChar = unicodePointSet.getRangeEnd(j);
                pvec.setValue(startChar, endChar, column, ~0, mask);
            }
        }

        // handle excluded encodings
        // Simply set their values to all 1's in the pvec
        if (!excludedCodePoints.isEmpty()) {
            int itemCount = excludedCodePoints.getRangeCount();
            for (int j = 0; j < itemCount; ++j) {
                int startChar = excludedCodePoints.getRangeStart(j);
                int endChar = excludedCodePoints.getRangeEnd(j);
                for (int col = 0; col < columns; col++) {
                    pvec.setValue(startChar, endChar, col, ~0, ~0);
                }
            }
        }

        trie = pvec.compactToTrieWithRowIndexes();
        pv = pvec.getCompactedArray();
    }

    // internal function to intersect two sets of masks
    // returns whether the mask has reduced to all zeros. The
    // second set of mask consists of len elements in pv starting from
    // pvIndex
    private boolean intersectMasks(int[] dest, int pvIndex, int len) {
        int oredDest = 0;
        for (int i = 0; i < len; ++i) {
            oredDest |= (dest[i] &= pv[pvIndex + i]);
        }
        return oredDest == 0;
    }

    // internal function
    private List<String> selectForMask(int[] mask) {
        // this is the context we will use. Store a table of indices to which
        // encodings are legit

        List<String> result = new ArrayList<String>();
        int columns = (encodings.length + 31) / 32;
        int numOnes = countOnes(mask, columns);

        // now we know the exact space we need to index
        if (numOnes > 0) {
            int k = 0;
            for (int j = 0; j < columns; j++) {
                int v = mask[j];
                for (int i = 0; i < 32 && k < encodings.length; i++, k++) {
                    if ((v & 1) != 0) {
                        result.add(encodings[k]);
                    }
                    v >>= 1;
                }
            }
        }

        // otherwise, index will remain NULL
        return result;
    }

    // internal function to count how many 1's are there in a mask
    // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html
    private int countOnes(int[] mask, int len) {
        int totalOnes = 0;
        for (int i = 0; i < len; ++i) {
            int ent = mask[i];
            for (; ent != 0; totalOnes++) {
                ent &= ent - 1; // clear the least significant bit set
            }
        }
        return totalOnes;
    }

    /**
     * Construct a CharsetSelector from a list of charset names.
     *
     * @param charsetList
     *            a list of charset names in the form of strings. If charsetList
     *            is empty, a selector for all available charset is constructed.
     * @param excludedCodePoints
     *            a set of code points to be excluded from consideration.
     *            Excluded code points appearing in the input CharSequence do
     *            not change the selection result. It could be empty when no
     *            code point should be excluded.
     * @param mappingTypes
     *            an int which determines whether to consider only roundtrip
     *            mappings or also fallbacks, e.g. CharsetICU.ROUNDTRIP_SET. See
     *            CharsetICU.java for the constants that are currently
     *            supported.
     * @throws IllegalArgumentException
     *             if the parameters is invalid.
     * @throws IllegalCharsetNameException
     *             If the given charset name is illegal.
     * @throws UnsupportedCharsetException
     *             If no support for the named charset is available in this
     *             instance of the Java virtual machine.
     * @stable ICU 4.2
     */
    public CharsetSelector(List<String> charsetList, UnicodeSet excludedCodePoints,
            int mappingTypes) {
        if (mappingTypes != CharsetICU.ROUNDTRIP_AND_FALLBACK_SET
                && mappingTypes != CharsetICU.ROUNDTRIP_SET) {
            throw new IllegalArgumentException("Unsupported mappingTypes");
        }

        int encodingCount = charsetList.size();
        if (encodingCount > 0) {
            encodings = charsetList.toArray(new String[0]);
        } else {
            encodings = CharsetProviderICU.getAvailableNames();
            encodingCount = encodings.length;
        }

        PropsVectors pvec = new PropsVectors((encodingCount + 31) / 32);
        generateSelectorData(pvec, excludedCodePoints, mappingTypes);
    }

    /**
     * Select charsets that can map all characters in a CharSequence, ignoring
     * the excluded code points.
     *
     * @param unicodeText
     *            a CharSequence. It could be empty.
     * @return a list that contains charset names in the form of strings. The
     *         returned encoding names and their order will be the same as
     *         supplied when building the selector.
     *
     * @stable ICU 4.2
     */
    public List<String> selectForString(CharSequence unicodeText) {
        int columns = (encodings.length + 31) / 32;
        int[] mask = new int[columns];
        for (int i = 0; i < columns; i++) {
            mask[i] = - 1; // set each bit to 1
                           // Note: All integers are signed in Java, assigning
                           // 2 ^ 32 -1 to mask is wrong!
        }
        int index = 0;
        while (index < unicodeText.length()) {
            int c = UTF16.charAt(unicodeText, index);
            int pvIndex = trie.getCodePointValue(c);
            index += UTF16.getCharCount(c);
            if (intersectMasks(mask, pvIndex, columns)) {
                break;
            }
        }
        return selectForMask(mask);
    }
}