GraphemeBreak.cpp revision 3d28a3fcebfed4744d1ef0307a8bdc8fc01e364c
13d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien/* 23d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Copyright (C) 2014 The Android Open Source Project 33d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * 43d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 53d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * you may not use this file except in compliance with the License. 63d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * You may obtain a copy of the License at 73d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * 83d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 93d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * 103d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * Unless required by applicable law or agreed to in writing, software 113d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 123d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 133d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * See the License for the specific language governing permissions and 143d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien * limitations under the License. 153d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien */ 163d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 173d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <stdint.h> 183d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <unicode/uchar.h> 193d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <unicode/utf16.h> 203d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 213d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien#include <minikin/GraphemeBreak.h> 223d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 233d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Leviennamespace android { 243d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 253d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levienbool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, 263d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien size_t offset) { 273d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // This implementation closely follows Unicode Standard Annex #29 on 283d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), 293d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // implementing a tailored version of extended grapheme clusters. 303d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. 313d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 323d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB1, sot /; Rule GB2, / eot 333d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (offset <= start || offset >= start + count) { 343d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return true; 353d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 363d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (U16_IS_TRAIL(buf[offset])) { 373d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Don't break a surrogate pair 383d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 393d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 403d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien uint32_t c1 = 0; 413d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien uint32_t c2 = 0; 423d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien size_t offset_back = offset; 433d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien U16_PREV(buf, start, offset_back, c1); 443d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien U16_NEXT(buf, offset, count, c2); 453d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien int32_t p1 = u_getIntPropertyValue(c1, UCHAR_GRAPHEME_CLUSTER_BREAK); 463d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien int32_t p2 = u_getIntPropertyValue(c2, UCHAR_GRAPHEME_CLUSTER_BREAK); 473d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB3, CR x LF 483d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p1 == U_GCB_CR && p2 == U_GCB_LF) { 493d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 503d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 513d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB4, (Control | CR | LF) / 523d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { 533d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return true; 543d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 553d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB5, / (Control | CR | LF) 563d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { 573d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // exclude zero-width control characters from breaking (tailoring of TR29) 583d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (c2 == 0x00ad 593d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien || (c2 >= 0x200b && c2 <= 0x200f) 603d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien || (c2 >= 0x2028 && c2 <= 0x202e) 613d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien || (c2 >= 0x2060 && c2 <= 0x206f)) { 623d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 633d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 643d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return true; 653d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 663d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB6, L x ( L | V | LV | LVT ) 673d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { 683d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 693d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 703d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB7, ( LV | V ) x ( V | T ) 713d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { 723d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 733d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 743d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB8, ( LVT | T ) x T 753d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if ((p1 == U_GCB_L || p1 == U_GCB_T) && p2 == U_GCB_T) { 763d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 773d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 783d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB8a, Regional_Indicator x Regional_Indicator 793d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { 803d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 813d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 823d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB9, x Extend; Rule GB9a, x SpacingMark 833d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK) { 843d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (c2 == 0xe33) { 853d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // most other implementations break THAI CHARACTER SARA AM 863d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // (tailoring of TR29) 873d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return true; 883d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 893d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 903d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 913d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Cluster indic syllables togeter (tailoring of TR29) 923d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama 933d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { 943d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return false; 953d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 963d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // Rule GB10, Any / Any 973d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return true; 983d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien} 993d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 1003d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Leviensize_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count, 1013d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien size_t offset, MoveOpt opt) { 1023d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien switch (opt) { 1033d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien case AFTER: 1043d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (offset < start + count) { 1053d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien offset++; 1063d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1073d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // fall through 1083d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien case AT_OR_AFTER: 1093d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien while (!isGraphemeBreak(buf, start, count, offset)) { 1103d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien offset++; 1113d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1123d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien break; 1133d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien case BEFORE: 1143d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (offset > start) { 1153d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien offset--; 1163d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1173d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien // fall through 1183d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien case AT_OR_BEFORE: 1193d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien while (!isGraphemeBreak(buf, start, count, offset)) { 1203d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien offset--; 1213d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1223d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien break; 1233d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien case AT: 1243d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien if (!isGraphemeBreak(buf, start, count, offset)) { 1253d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien offset = (size_t)-1; 1263d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1273d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien break; 1283d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien } 1293d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien return offset; 1303d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien} 1313d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien 1323d28a3fcebfed4744d1ef0307a8bdc8fc01e364cRaph Levien} // namespace android 133