// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 * Copyright (C) 1996-2011, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 */
package com.ibm.icu.text;

import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;

147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * A transliterator that converts all letters (as defined by
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * <code>UCharacter.isLetter()</code>) to lower case, except for those
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * letters preceded by non-letters.  The latter are converted to title
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * case using <code>UCharacter.toTitleCase()</code>.
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * @author Alan Liu
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertclass TitlecaseTransliterator extends Transliterator {
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static final String _ID = "Any-Title";
242d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    // TODO: Add variants for tr/az, lt, default = default locale: ICU ticket #12720
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * System registration hook.
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static void register() {
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        Transliterator.registerFactory(_ID, new Transliterator.Factory() {
312d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert            @Override
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            public Transliterator getInstance(String ID) {
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return new TitlecaseTransliterator(ULocale.US);
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        });
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        registerSpecialInverse("Title", "Lower", false);
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
402d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    private final ULocale locale;
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
422d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    private final UCaseProps csp;
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private ReplaceableContextIterator iter;
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private StringBuilder result;
4563cafec8b8cb135e7c06ef6b9fc8c128ed55b140Markus Scherer    private int caseLocale;
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert   /**
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Constructs a transliterator.
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public TitlecaseTransliterator(ULocale loc) {
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        super(_ID, null);
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        locale = loc;
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Need to look back 2 characters in the case of "can't"
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        setMaximumContextLength(2);
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        csp=UCaseProps.INSTANCE;
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        iter=new ReplaceableContextIterator();
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        result = new StringBuilder();
5863cafec8b8cb135e7c06ef6b9fc8c128ed55b140Markus Scherer        caseLocale = UCaseProps.getCaseLocale(locale);
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
602d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Implements {@link Transliterator#handleTransliterate}.
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
642d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    @Override
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected synchronized void handleTransliterate(Replaceable text,
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                       Position offsets, boolean isIncremental) {
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // TODO reimplement, see ustrcase.c
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // using a real word break iterator
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   instead of just looking for a transition between cased and uncased characters
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap)
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // needs to take isIncremental into account because case mappings are context-sensitive
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   also detect when lowercasing function did not finish because of context
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (offsets.start >= offsets.limit) {
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return;
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // case type: >0 cased (UCaseProps.LOWER etc.)  ==0 uncased  <0 case-ignorable
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int type;
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Our mode; we are either converting letter toTitle or
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // toLower.
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean doTitle = true;
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Determine if there is a preceding context of cased case-ignorable*,
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // in which case we want to start in toLower mode.  If the
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // prior context is anything else (including empty) then start
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // in toTitle mode.
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int c, start;
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF16.getCharCount(c)) {
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            c = text.char32At(start);
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            type=csp.getTypeOrIgnorable(c);
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(type>0) { // cased
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                doTitle=false;
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else if(type==0) { // uncased but not ignorable
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // else (type<0) case-ignorable: continue
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Convert things after a cased character toLower; things
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // after a uncased, non-case-ignorable character toTitle.  Case-ignorable
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // characters are copied directly and do not change the mode.
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        iter.setText(text);
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        iter.setIndex(offsets.start);
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        iter.setLimit(offsets.limit);
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        iter.setContextLimits(offsets.contextStart, offsets.contextLimit);
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        result.setLength(0);
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Walk through original string
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // If there is a case change, modify corresponding position in replaceable
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int delta;
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while((c=iter.nextCaseMapCP())>=0) {
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            type=csp.getTypeOrIgnorable(c);
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(type>=0) { // not case-ignorable
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(doTitle) {
12163cafec8b8cb135e7c06ef6b9fc8c128ed55b140Markus Scherer                    c=csp.toFullTitle(c, iter, result, caseLocale);
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
12363cafec8b8cb135e7c06ef6b9fc8c128ed55b140Markus Scherer                    c=csp.toFullLower(c, iter, result, caseLocale);
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                doTitle = type==0; // doTitle=isUncased
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(iter.didReachLimit() && isIncremental) {
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // the case mapping function tried to look beyond the context limit
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // wait for more input
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    offsets.start=iter.getCaseMapCPStart();
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return;
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                /* decode the result */
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(c<0) {
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    /* c mapped to itself, no change */
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    continue;
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    /* replace by the mapping string */
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    delta=iter.replace(result.toString());
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    result.setLength(0);
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    /* replace by single-code point mapping */
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    delta=iter.replace(UTF16.valueOf(c));
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(delta!=0) {
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    offsets.limit += delta;
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    offsets.contextLimit += delta;
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        offsets.start = offsets.limit;
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1552d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // NOTE: normally this would be static, but because the results vary by locale....
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    SourceTargetUtility sourceTargetUtility = null;
1582d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /* (non-Javadoc)
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    @Override
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        synchronized (this) {
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (sourceTargetUtility == null) {
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
1672d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                    @Override
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    public String transform(String source) {
1692d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert                        return UCharacter.toTitleCase(locale, source, null);
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                });
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
177