// WordBreaker.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define LOG_TAG "Minikin" 18#include <cutils/log.h> 19 20#include "minikin/WordBreaker.h" 21 22#include <unicode/uchar.h> 23#include <unicode/utf16.h> 24 25namespace android { 26 27const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 28 29void WordBreaker::setLocale(const icu::Locale& locale) { 30 UErrorCode status = U_ZERO_ERROR; 31 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 32 // TODO: handle failure status 33 if (mText != nullptr) { 34 mBreakIterator->setText(&mUText, status); 35 } 36 mIteratorWasReset = true; 37} 38 39void WordBreaker::setText(const uint16_t* data, size_t size) { 40 mText = data; 41 mTextSize = size; 42 mIteratorWasReset = false; 43 mLast = 0; 44 mCurrent = 0; 45 mScanOffset = 0; 46 mSuppressHyphen = false; 47 UErrorCode status = U_ZERO_ERROR; 48 utext_openUChars(&mUText, data, size, &status); 49 mBreakIterator->setText(&mUText, status); 50 mBreakIterator->first(); 51} 52 53ssize_t WordBreaker::current() const { 54 return mCurrent; 55} 56 57enum ScanState { 58 START, 59 SAW_AT, 60 SAW_COLON, 61 SAW_COLON_SLASH, 62 SAW_COLON_SLASH_SLASH, 63}; 64 65ssize_t WordBreaker::next() { 66 mLast = mCurrent; 67 68 // scan forward from current ICU position for email address or URL 69 if (mLast >= mScanOffset) { 70 ScanState state = START; 71 size_t i; 72 for (i = mLast; i < mTextSize; i++) { 73 uint16_t c 
= mText[i]; 74 // scan only ASCII characters, stop at space 75 if (!(' ' < c && c <= 0x007E)) { 76 break; 77 } 78 if (state == START && c == '@') { 79 state = SAW_AT; 80 } else if (state == START && c == ':') { 81 state = SAW_COLON; 82 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 83 if (c == '/') { 84 state = static_cast<ScanState>((int)state + 1); // next state adds a slash 85 } else { 86 state = START; 87 } 88 } 89 } 90 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 91 // no line breaks in entire email address or url 92 // TODO: refine this according to Chicago Manual of Style rules 93 while (i < mTextSize && mText[i] == ' ') { 94 i++; 95 } 96 mCurrent = i; 97 mSuppressHyphen = true; 98 // Setting mIteratorWasReset will cause next break to be computed following 99 // mCurrent, rather than following the current ICU iterator location. 100 mIteratorWasReset = true; 101 if (mBreakIterator->isBoundary(mCurrent)) { 102 return mCurrent; 103 } 104 } else { 105 mScanOffset = i; 106 mSuppressHyphen = false; 107 } 108 } 109 110 int32_t result; 111 do { 112 if (mIteratorWasReset) { 113 result = mBreakIterator->following(mCurrent); 114 mIteratorWasReset = false; 115 } else { 116 result = mBreakIterator->next(); 117 } 118 } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 119 && mText[result - 1] == CHAR_SOFT_HYPHEN); 120 mCurrent = (ssize_t)result; 121 return mCurrent; 122} 123 124ssize_t WordBreaker::wordStart() const { 125 if (mSuppressHyphen) { 126 return mLast; 127 } 128 ssize_t result = mLast; 129 while (result < mCurrent) { 130 UChar32 c; 131 ssize_t ix = result; 132 U16_NEXT(mText, ix, mCurrent, c); 133 int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 134 // strip leading punctuation, defined as OP and QU line breaking classes, 135 // see UAX #14 136 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 137 break; 138 } 139 result = ix; 140 } 141 return result; 142} 143 144ssize_t WordBreaker::wordEnd() 
const { 145 if (mSuppressHyphen) { 146 return mLast; 147 } 148 ssize_t result = mCurrent; 149 while (result > mLast) { 150 UChar32 c; 151 ssize_t ix = result; 152 U16_PREV(mText, mLast, ix, c); 153 int32_t gc_mask = U_GET_GC_MASK(c); 154 // strip trailing space and punctuation 155 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 156 break; 157 } 158 result = ix; 159 } 160 return result; 161} 162 163void WordBreaker::finish() { 164 mText = nullptr; 165 // Note: calling utext_close multiply is safe 166 utext_close(&mUText); 167} 168 169} // namespace android 170