// WordBreaker.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "minikin/WordBreaker.h" 2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h> 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android { 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienconst uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) { 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // TODO: handle failure status 3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mText != nullptr) { 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = true; 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) { 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = data; 
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mTextSize = size; 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = 0; 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = 0; 459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mScanOffset = 0; 469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mSuppressHyphen = false; 4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_openUChars(&mUText, data, size, &status); 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->first(); 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const { 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 579c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levienenum ScanState { 589c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien START, 599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_AT, 609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON, 619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH, 629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH_SLASH, 639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}; 649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 6557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() { 6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = mCurrent; 679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan forward from current ICU 
position for email address or URL 699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mLast >= mScanOffset) { 709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ScanState state = START; 719c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien size_t i; 729c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien for (i = mLast; i < mTextSize; i++) { 739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t c = mText[i]; 749c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan only ASCII characters, stop at space 759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (!(' ' < c && c <= 0x007E)) { 769c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien break; 779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == START && c == '@') { 799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_AT; 809c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == START && c == ':') { 819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_COLON; 829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (c == '/') { 849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = static_cast<ScanState>((int)state + 1); // next state adds a slash 859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = START; 879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // no line breaks in entire email address or url 929c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // TODO: refine this according to Chicago Manual of Style 
rules 939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien while (i < mTextSize && mText[i] == ' ') { 949c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien i++; 959c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mCurrent = i; 979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mSuppressHyphen = true; 989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // Setting mIteratorWasReset will cause next break to be computed following 999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // mCurrent, rather than following the current ICU iterator location. 1009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mIteratorWasReset = true; 1019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mBreakIterator->isBoundary(mCurrent)) { 1029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mCurrent; 1039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mScanOffset = i; 1069c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mSuppressHyphen = false; 1079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien int32_t result; 11157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien do { 11257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mIteratorWasReset) { 11357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->following(mCurrent); 11457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 11557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } else { 11657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->next(); 11757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 11857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } while (result != 
icu::BreakIterator::DONE && (size_t)result != mTextSize 11957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien && mText[result - 1] == CHAR_SOFT_HYPHEN); 12057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = (ssize_t)result; 12157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 12257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 12357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 12457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const { 1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mSuppressHyphen) { 1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 12857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mLast; 12957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result < mCurrent) { 13057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 13157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 13257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_NEXT(mText, ix, mCurrent, c); 13357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 13457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip leading punctuation, defined as OP and QU line breaking classes, 13557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // see UAX #14 13657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 13757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 13857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 13957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 14057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 14157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 14257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 14357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 
14457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const { 1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mSuppressHyphen) { 1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 14857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mCurrent; 14957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result > mLast) { 15057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 15157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 15257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_PREV(mText, mLast, ix, c); 15357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t gc_mask = U_GET_GC_MASK(c); 15457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip trailing space and punctuation 15557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 15657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 15757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 15857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 15957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 16057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 16157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 16257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 16357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() { 16457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = nullptr; 16557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // Note: calling utext_close multiply is safe 16657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_close(&mUText); 16757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 16857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 16957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} // namespace android 170