151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski/*
22c87ad3a45cecf9e344487cad1abfdebe79f2c7cNarayan Kamath * Copyright (C) 2014 The Android Open Source Project
36e42190c7f7d7cf3d8b787c918de0d797c6ddbbaPaul Duffin * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * This code is free software; you can redistribute it and/or modify it
751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * under the terms of the GNU General Public License version 2 only, as
851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * published by the Free Software Foundation.  Oracle designates this
951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * particular file as subject to the "Classpath" exception as provided
1051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * by Oracle in the LICENSE file that accompanied this code.
1151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
1251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * This code is distributed in the hope that it will be useful, but WITHOUT
1351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * version 2 for more details (a copy is included in the LICENSE file that
1651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * accompanied this code).
1751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
1851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * You should have received a copy of the GNU General Public License version
1951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * 2 along with this work; if not, write to the Free Software Foundation,
2051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
2151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
2251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
2351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * or visit www.oracle.com if you need additional information or have any
2451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * questions.
2551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski */
2651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
2751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski/*
2851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *******************************************************************************
2951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
3051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *                                                                             *
3151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * The original version of this source code and documentation is copyrighted   *
3251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * and owned by IBM, These materials are provided under terms of a License     *
3351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Agreement between IBM and Sun. This technology is protected by multiple     *
3451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * US and International patents. This notice and attribution to IBM may not    *
3551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * to removed.                                                                 *
3651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *******************************************************************************
3751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski */
3851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
3951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebskipackage java.text;
4051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
4151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski/**
4251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * This class provides the method <code>normalize</code> which transforms Unicode
4351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * text into an equivalent composed or decomposed form, allowing for easier
4451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * sorting and searching of text.
4551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * The <code>normalize</code> method supports the standard normalization forms
4651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * described in
4751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
4851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
4951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * <p>
5051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Characters with accents or other adornments can be encoded in
5151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * several different ways in Unicode.  For example, take the character A-acute.
5251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * In Unicode, this can be encoded as a single character (the "composed" form):
5351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
546e42190c7f7d7cf3d8b787c918de0d797c6ddbbaPaul Duffin * <pre>
5551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
5651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
5751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * or as two separate characters (the "decomposed" form):
5851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
596e42190c7f7d7cf3d8b787c918de0d797c6ddbbaPaul Duffin * <pre>
6051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+0041    LATIN CAPITAL LETTER A
6151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+0301    COMBINING ACUTE ACCENT</pre>
6251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
6351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * To a user of your program, however, both of these sequences should be
6451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * treated as the same "user-level" character "A with acute accent".  When you
6551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * are searching or comparing text, you must ensure that these two sequences are
6651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * treated as equivalent.  In addition, you must handle characters with more than
6751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * one accent. Sometimes the order of a character's combining accents is
6851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * significant, while in other cases accent sequences in different orders are
6951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * really equivalent.
7051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * <p>
7151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Similarly, the string "ffi" can be encoded as three separate letters:
7251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
736e42190c7f7d7cf3d8b787c918de0d797c6ddbbaPaul Duffin * <pre>
7451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+0066    LATIN SMALL LETTER F
7551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+0066    LATIN SMALL LETTER F
7651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+0069    LATIN SMALL LETTER I</pre>
7751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
7851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * or as the single character
7951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
806e42190c7f7d7cf3d8b787c918de0d797c6ddbbaPaul Duffin * <pre>
8151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
8251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
8351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * The ffi ligature is not a distinct semantic character, and strictly speaking
8451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * it shouldn't be in Unicode at all, but it was included for compatibility
8551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * with existing character sets that already provided it.  The Unicode standard
8651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * identifies such characters by giving them "compatibility" decompositions
8751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * into the corresponding semantic characters.  When sorting and searching, you
8851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * will often want to use these mappings.
8951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * <p>
9051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * The <code>normalize</code> method helps solve these problems by transforming
9151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * text into the canonical composed and decomposed forms as shown in the first
9251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * example above. In addition, you can have it perform compatibility
9351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * decompositions so that you can treat compatibility characters the same as
9451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * their equivalents.
9551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Finally, the <code>normalize</code> method rearranges accents into the
9651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * proper canonical order, so that you do not have to worry about accent
9751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * rearrangement on your own.
9851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * <p>
9951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * The W3C generally recommends to exchange texts in NFC.
10051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * Note also that most legacy character encodings use only precomposed forms and
10151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * often do not encode any combining marks by themselves. For conversion to such
10251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * character encodings the Unicode text needs to be normalized to NFC.
10351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * For more usage examples, see the Unicode Standard Annex.
10451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski *
10551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski * @since 1.6
10651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski */
10751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebskipublic final class Normalizer {
10851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
1094b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera    private Normalizer() {
1104b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera    }
11151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
11251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    /**
11351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * This enum provides constants of the four Unicode normalization forms
11451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * that are described in
11551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * <a href="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
11651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
11751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * and two methods to access them.
11851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     *
11951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * @since 1.6
12051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     */
1214b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera    public enum Form {
12251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
12351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski        /**
12451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         * Canonical decomposition.
12551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         */
1264b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        NFD(android.icu.text.Normalizer.NFD),
12751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
12851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski        /**
12951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         * Canonical decomposition, followed by canonical composition.
13051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         */
1314b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        NFC(android.icu.text.Normalizer.NFC),
13251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
13351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski        /**
13451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         * Compatibility decomposition.
13551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         */
1364b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        NFKD(android.icu.text.Normalizer.NFKD),
13751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
13851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski        /**
13951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         * Compatibility decomposition, followed by canonical composition.
14051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski         */
1414b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        NFKC(android.icu.text.Normalizer.NFKC);
1424b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera
1434b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        private final android.icu.text.Normalizer.Mode icuMode;
1444b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera
1454b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        Form(android.icu.text.Normalizer.Mode icuMode) {
1464b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera            this.icuMode = icuMode;
1474b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        }
14851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    }
14951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
15051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    /**
15151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * Normalize a sequence of char values.
15251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * The sequence will be normalized according to the specified normalization
15351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * from.
1544b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *
1554b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     * @param src  The sequence of char values to normalize.
1564b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     * @param form The normalization form; one of
1574b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFC},
1584b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFD},
1594b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFKC},
1604b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFKD}
16151b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * @return The normalized String
16251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * @throws NullPointerException If <code>src</code> or <code>form</code>
1634b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *                              is null.
16451b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     */
16551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    public static String normalize(CharSequence src, Form form) {
1664b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        return android.icu.text.Normalizer.normalize(src.toString(), form.icuMode);
16751b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    }
16851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski
16951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    /**
17051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * Determines if the given sequence of char values is normalized.
1714b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *
1724b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     * @param src  The sequence of char values to be checked.
1734b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     * @param form The normalization form; one of
1744b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFC},
1754b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFD},
1764b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFKC},
1774b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *             {@link java.text.Normalizer.Form#NFKD}
17851b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * @return true if the sequence of char values is normalized;
17951b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * false otherwise.
18051b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     * @throws NullPointerException If <code>src</code> or <code>form</code>
1814b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera     *                              is null.
18251b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski     */
18351b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    public static boolean isNormalized(CharSequence src, Form form) {
1844b62f17388779be105069b68b55347c5b28a9e49Shubham Ajmera        return android.icu.text.Normalizer.isNormalized(src.toString(), form.icuMode, 0);
18551b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski    }
18651b1b6997fd3f980076b8081f7f1165ccc2a4008Piotr Jastrzebski}
187