WordBreakerTests.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h>
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h"
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h"
2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h>
2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h>
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h>
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// Element count of a true array. Only meaningful when x is an actual array: for a
// pointer the expression degenerates to sizeof(pointer) / sizeof(element).
// The #ifndef guard keeps any NELEM that an earlier header already defined.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android;
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest;
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) {
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.next());  // after "hello "
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hello"
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(5, breaker.wordEnd());
4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.current());
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordStart());  // "world"
4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.wordEnd());
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.current());
5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) {
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordEnd());
6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordStart());  // "world"
6357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(12, breaker.wordEnd());
6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
6557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
6657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) {
6757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        '!', '!'};
6957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
7057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
7157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
7257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
7357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
7457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(2, breaker.wordStart());  // "hello"
7557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordEnd());
7657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
7757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.wordStart());  // "world"
7857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(14, breaker.wordEnd());
7957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
809c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) {
829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ' ', 'x'};
849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.next());  // after "foo@example.com "
899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordStart());  // "x"
929c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordEnd());
939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
949c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
959c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) {
969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(23, breaker.next());  // after "mailto:foo@example.com "
1039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
1049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(23, breaker.wordStart());  // "x"
1069c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(24, breaker.wordEnd());
1079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) {
1109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x4E00};
1129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1169c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(15, breaker.next());  // after "foo@example.com"
1179c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
1189c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1199c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(15, breaker.wordStart());  // "一"
1209c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordEnd());
1219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) {
1249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x0303, ' ', 'x'};
1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.next());  // after "foo@example.com̃"
1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordStart());  // "x"
1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(18, breaker.wordEnd());
1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) {
1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        '.', 'c', 'o', 'm', ' ', 'x'};
1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(19, breaker.next());  // after "http://example.com "
1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(19, breaker.wordStart());  // "x"
1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(20, breaker.wordEnd());
1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
150