// WordBreakerTests.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h> 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h" 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h" 2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h> 2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h> 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h> 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#ifndef NELEM 2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#endif 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android; 3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest; 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) { 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.next()); // after "hello " 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hello" 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(5, breaker.wordEnd()); 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.current()); 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordStart()); // "world" 4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.wordEnd()); 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.current()); 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5257b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) { 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo " 5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" 6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordEnd()); 6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 
6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordStart()); // "world" 6357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(12, breaker.wordEnd()); 6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 6557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 6657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) { 6757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', 6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien '!', '!'}; 6957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 7057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 7157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 7257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 7357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.next()); // after "¡¡hello, " 7457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(2, breaker.wordStart()); // "hello" 7557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordEnd()); 7657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 7757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.wordStart()); // "world" 7857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(14, breaker.wordEnd()); 7957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 809c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) { 829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ' ', 'x'}; 849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 
WordBreaker breaker; 859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.next()); // after "foo@example.com " 899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordStart()); // "x" 929c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordEnd()); 939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 949c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 959c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) { 969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; 989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(23, breaker.next()); // after "mailto:foo@example.com " 1039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 1049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph 
Levien EXPECT_EQ(23, breaker.wordStart()); // "x" 1069c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(24, breaker.wordEnd()); 1079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) { 1109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x4E00}; 1129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1169c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(15, breaker.next()); // after "foo@example.com" 1179c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 1189c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1199c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(15, breaker.wordStart()); // "一" 1209c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordEnd()); 1219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) { 1249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x0303, ' ', 'x'}; 1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 
breaker.setLocale(icu::Locale::getEnglish()); 1289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.next()); // after "foo@example.com̃" 1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordStart()); // "x" 1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(18, breaker.wordEnd()); 1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) { 1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien '.', 'c', 'o', 'm', ' ', 'x'}; 1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(19, breaker.next()); // after "http://example.com " 1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(19, breaker.wordStart()); // "x" 
1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(20, breaker.wordEnd()); 1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 150