GraphemeBreak.cpp revision 3d28a3fcebfed4744d1ef0307a8bdc8fc01e364c
13d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien/*
23d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Copyright (C) 2014 The Android Open Source Project
33d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien *
43d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Licensed under the Apache License, Version 2.0 (the "License");
53d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * you may not use this file except in compliance with the License.
63d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * You may obtain a copy of the License at
73d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien *
83d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien *      http://www.apache.org/licenses/LICENSE-2.0
93d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien *
103d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Unless required by applicable law or agreed to in writing, software
113d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * distributed under the License is distributed on an "AS IS" BASIS,
123d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
133d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * See the License for the specific language governing permissions and
143d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * limitations under the License.
153d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien */
163d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
173d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <stdint.h>
183d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <unicode/uchar.h>
193d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <unicode/utf16.h>
203d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
213d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <minikin/GraphemeBreak.h>
223d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
233d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Leviennamespace android {
243d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
253d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levienbool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
263d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        size_t offset) {
273d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // This implementation closely follows Unicode Standard Annex #29 on
283d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
293d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // implementing a tailored version of extended grapheme clusters.
303d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
313d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
323d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB1, sot /; Rule GB2, / eot
333d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (offset <= start || offset >= start + count) {
343d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return true;
353d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
363d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (U16_IS_TRAIL(buf[offset])) {
373d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        // Don't break a surrogate pair
383d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
393d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
403d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    uint32_t c1 = 0;
413d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    uint32_t c2 = 0;
423d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    size_t offset_back = offset;
433d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    U16_PREV(buf, start, offset_back, c1);
443d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    U16_NEXT(buf, offset, count, c2);
453d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    int32_t p1 = u_getIntPropertyValue(c1, UCHAR_GRAPHEME_CLUSTER_BREAK);
463d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    int32_t p2 = u_getIntPropertyValue(c2, UCHAR_GRAPHEME_CLUSTER_BREAK);
473d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB3, CR x LF
483d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
493d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
503d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
513d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB4, (Control | CR | LF) /
523d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
533d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return true;
543d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
553d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB5, / (Control | CR | LF)
563d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
573d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        // exclude zero-width control characters from breaking (tailoring of TR29)
583d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        if (c2 == 0x00ad
593d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien                || (c2 >= 0x200b && c2 <= 0x200f)
603d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien                || (c2 >= 0x2028 && c2 <= 0x202e)
613d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien                || (c2 >= 0x2060 && c2 <= 0x206f)) {
623d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            return false;
633d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
643d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return true;
653d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
663d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB6, L x ( L | V | LV | LVT )
673d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
683d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
693d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
703d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB7, ( LV | V ) x ( V | T )
713d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
723d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
733d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
743d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB8, ( LVT | T ) x T
753d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if ((p1 == U_GCB_L || p1 == U_GCB_T) && p2 == U_GCB_T) {
763d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
773d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
783d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB8a, Regional_Indicator x Regional_Indicator
793d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
803d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
813d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
823d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB9, x Extend; Rule GB9a, x SpacingMark
833d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK) {
843d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        if (c2 == 0xe33) {
853d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            // most other implementations break THAI CHARACTER SARA AM
863d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            // (tailoring of TR29)
873d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            return true;
883d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
893d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
903d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
913d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Cluster indic syllables togeter (tailoring of TR29)
923d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
933d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
943d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        return false;
953d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
963d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    // Rule GB10, Any / Any
973d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    return true;
983d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien}
993d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
1003d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Leviensize_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
1013d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        size_t offset, MoveOpt opt) {
1023d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    switch (opt) {
1033d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    case AFTER:
1043d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        if (offset < start + count) {
1053d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            offset++;
1063d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
1073d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        // fall through
1083d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    case AT_OR_AFTER:
1093d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        while (!isGraphemeBreak(buf, start, count, offset)) {
1103d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            offset++;
1113d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
1123d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        break;
1133d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    case BEFORE:
1143d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        if (offset > start) {
1153d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            offset--;
1163d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
1173d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        // fall through
1183d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    case AT_OR_BEFORE:
1193d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        while (!isGraphemeBreak(buf, start, count, offset)) {
1203d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            offset--;
1213d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
1223d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        break;
1233d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    case AT:
1243d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        if (!isGraphemeBreak(buf, start, count, offset)) {
1253d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien            offset = (size_t)-1;
1263d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        }
1273d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien        break;
1283d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    }
1293d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien    return offset;
1303d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien}
1313d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien
1323d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien}  // namespace android
133