1/*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "platform/text/UnicodeUtilities.h"
29
30#include "wtf/text/StringBuffer.h"
31#include "wtf/unicode/CharacterNames.h"
32#include <unicode/unorm.h>
33
34using namespace WTF::Unicode;
35
36namespace blink {
37
38enum VoicedSoundMarkType {
39    NoVoicedSoundMark,
40    VoicedSoundMark,
41    SemiVoicedSoundMark
42};
43
44template <typename CharType>
45static inline CharType foldQuoteMarkOrSoftHyphen(CharType c)
46{
47    switch (static_cast<UChar>(c)) {
48    case hebrewPunctuationGershayim:
49    case leftDoubleQuotationMark:
50    case rightDoubleQuotationMark:
51        return '"';
52    case hebrewPunctuationGeresh:
53    case leftSingleQuotationMark:
54    case rightSingleQuotationMark:
55        return '\'';
56    case softHyphen:
57        // Replace soft hyphen with an ignorable character so that their presence or absence will
58        // not affect string comparison.
59        return 0;
60    default:
61        return c;
62    }
63}
64
65void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
66{
67    for (size_t i = 0; i < length; ++i)
68        data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
69}
70
71void foldQuoteMarksAndSoftHyphens(String& s)
72{
73    s.replace(hebrewPunctuationGeresh, '\'');
74    s.replace(hebrewPunctuationGershayim, '"');
75    s.replace(leftDoubleQuotationMark, '"');
76    s.replace(leftSingleQuotationMark, '\'');
77    s.replace(rightDoubleQuotationMark, '"');
78    s.replace(rightSingleQuotationMark, '\'');
79    // Replace soft hyphen with an ignorable character so that their presence or absence will
80    // not affect string comparison.
81    s.replace(softHyphen, 0);
82}
83
84static bool isNonLatin1Separator(UChar32 character)
85{
86    ASSERT_ARG(character, character >= 256);
87
88    return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
89}
90
91bool isSeparator(UChar32 character)
92{
93    static const bool latin1SeparatorTable[256] = {
94        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
97        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
98        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
99        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
100        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
101        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
102        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
105        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
106        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
108        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
109        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
110    };
111
112    if (character < 256)
113        return latin1SeparatorTable[character];
114
115    return isNonLatin1Separator(character);
116}
117
118// ICU's search ignores the distinction between small kana letters and ones
119// that are not small, and also characters that differ only in the voicing
120// marks when considering only primary collation strength differences.
121// This is not helpful for end users, since these differences make words
122// distinct, so for our purposes we need these to be considered.
123// The Unicode folks do not think the collation algorithm should be
124// changed. To work around this, we would like to tailor the ICU searcher,
125// but we can't get that to work yet. So instead, we check for cases where
126// these differences occur, and skip those matches.
127
128// We refer to the above technique as the "kana workaround". The next few
129// functions are helper functinos for the kana workaround.
130
131bool isKanaLetter(UChar character)
132{
133    // Hiragana letters.
134    if (character >= 0x3041 && character <= 0x3096)
135        return true;
136
137    // Katakana letters.
138    if (character >= 0x30A1 && character <= 0x30FA)
139        return true;
140    if (character >= 0x31F0 && character <= 0x31FF)
141        return true;
142
143    // Halfwidth katakana letters.
144    if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
145        return true;
146
147    return false;
148}
149
150bool isSmallKanaLetter(UChar character)
151{
152    ASSERT(isKanaLetter(character));
153
154    switch (character) {
155    case 0x3041: // HIRAGANA LETTER SMALL A
156    case 0x3043: // HIRAGANA LETTER SMALL I
157    case 0x3045: // HIRAGANA LETTER SMALL U
158    case 0x3047: // HIRAGANA LETTER SMALL E
159    case 0x3049: // HIRAGANA LETTER SMALL O
160    case 0x3063: // HIRAGANA LETTER SMALL TU
161    case 0x3083: // HIRAGANA LETTER SMALL YA
162    case 0x3085: // HIRAGANA LETTER SMALL YU
163    case 0x3087: // HIRAGANA LETTER SMALL YO
164    case 0x308E: // HIRAGANA LETTER SMALL WA
165    case 0x3095: // HIRAGANA LETTER SMALL KA
166    case 0x3096: // HIRAGANA LETTER SMALL KE
167    case 0x30A1: // KATAKANA LETTER SMALL A
168    case 0x30A3: // KATAKANA LETTER SMALL I
169    case 0x30A5: // KATAKANA LETTER SMALL U
170    case 0x30A7: // KATAKANA LETTER SMALL E
171    case 0x30A9: // KATAKANA LETTER SMALL O
172    case 0x30C3: // KATAKANA LETTER SMALL TU
173    case 0x30E3: // KATAKANA LETTER SMALL YA
174    case 0x30E5: // KATAKANA LETTER SMALL YU
175    case 0x30E7: // KATAKANA LETTER SMALL YO
176    case 0x30EE: // KATAKANA LETTER SMALL WA
177    case 0x30F5: // KATAKANA LETTER SMALL KA
178    case 0x30F6: // KATAKANA LETTER SMALL KE
179    case 0x31F0: // KATAKANA LETTER SMALL KU
180    case 0x31F1: // KATAKANA LETTER SMALL SI
181    case 0x31F2: // KATAKANA LETTER SMALL SU
182    case 0x31F3: // KATAKANA LETTER SMALL TO
183    case 0x31F4: // KATAKANA LETTER SMALL NU
184    case 0x31F5: // KATAKANA LETTER SMALL HA
185    case 0x31F6: // KATAKANA LETTER SMALL HI
186    case 0x31F7: // KATAKANA LETTER SMALL HU
187    case 0x31F8: // KATAKANA LETTER SMALL HE
188    case 0x31F9: // KATAKANA LETTER SMALL HO
189    case 0x31FA: // KATAKANA LETTER SMALL MU
190    case 0x31FB: // KATAKANA LETTER SMALL RA
191    case 0x31FC: // KATAKANA LETTER SMALL RI
192    case 0x31FD: // KATAKANA LETTER SMALL RU
193    case 0x31FE: // KATAKANA LETTER SMALL RE
194    case 0x31FF: // KATAKANA LETTER SMALL RO
195    case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
196    case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
197    case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
198    case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
199    case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
200    case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
201    case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
202    case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
203    case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
204        return true;
205    }
206    return false;
207}
208
209static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
210{
211    ASSERT(isKanaLetter(character));
212
213    switch (character) {
214    case 0x304C: // HIRAGANA LETTER GA
215    case 0x304E: // HIRAGANA LETTER GI
216    case 0x3050: // HIRAGANA LETTER GU
217    case 0x3052: // HIRAGANA LETTER GE
218    case 0x3054: // HIRAGANA LETTER GO
219    case 0x3056: // HIRAGANA LETTER ZA
220    case 0x3058: // HIRAGANA LETTER ZI
221    case 0x305A: // HIRAGANA LETTER ZU
222    case 0x305C: // HIRAGANA LETTER ZE
223    case 0x305E: // HIRAGANA LETTER ZO
224    case 0x3060: // HIRAGANA LETTER DA
225    case 0x3062: // HIRAGANA LETTER DI
226    case 0x3065: // HIRAGANA LETTER DU
227    case 0x3067: // HIRAGANA LETTER DE
228    case 0x3069: // HIRAGANA LETTER DO
229    case 0x3070: // HIRAGANA LETTER BA
230    case 0x3073: // HIRAGANA LETTER BI
231    case 0x3076: // HIRAGANA LETTER BU
232    case 0x3079: // HIRAGANA LETTER BE
233    case 0x307C: // HIRAGANA LETTER BO
234    case 0x3094: // HIRAGANA LETTER VU
235    case 0x30AC: // KATAKANA LETTER GA
236    case 0x30AE: // KATAKANA LETTER GI
237    case 0x30B0: // KATAKANA LETTER GU
238    case 0x30B2: // KATAKANA LETTER GE
239    case 0x30B4: // KATAKANA LETTER GO
240    case 0x30B6: // KATAKANA LETTER ZA
241    case 0x30B8: // KATAKANA LETTER ZI
242    case 0x30BA: // KATAKANA LETTER ZU
243    case 0x30BC: // KATAKANA LETTER ZE
244    case 0x30BE: // KATAKANA LETTER ZO
245    case 0x30C0: // KATAKANA LETTER DA
246    case 0x30C2: // KATAKANA LETTER DI
247    case 0x30C5: // KATAKANA LETTER DU
248    case 0x30C7: // KATAKANA LETTER DE
249    case 0x30C9: // KATAKANA LETTER DO
250    case 0x30D0: // KATAKANA LETTER BA
251    case 0x30D3: // KATAKANA LETTER BI
252    case 0x30D6: // KATAKANA LETTER BU
253    case 0x30D9: // KATAKANA LETTER BE
254    case 0x30DC: // KATAKANA LETTER BO
255    case 0x30F4: // KATAKANA LETTER VU
256    case 0x30F7: // KATAKANA LETTER VA
257    case 0x30F8: // KATAKANA LETTER VI
258    case 0x30F9: // KATAKANA LETTER VE
259    case 0x30FA: // KATAKANA LETTER VO
260        return VoicedSoundMark;
261    case 0x3071: // HIRAGANA LETTER PA
262    case 0x3074: // HIRAGANA LETTER PI
263    case 0x3077: // HIRAGANA LETTER PU
264    case 0x307A: // HIRAGANA LETTER PE
265    case 0x307D: // HIRAGANA LETTER PO
266    case 0x30D1: // KATAKANA LETTER PA
267    case 0x30D4: // KATAKANA LETTER PI
268    case 0x30D7: // KATAKANA LETTER PU
269    case 0x30DA: // KATAKANA LETTER PE
270    case 0x30DD: // KATAKANA LETTER PO
271        return SemiVoicedSoundMark;
272    }
273    return NoVoicedSoundMark;
274}
275
276static inline bool isCombiningVoicedSoundMark(UChar character)
277{
278    switch (character) {
279    case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
280    case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
281        return true;
282    }
283    return false;
284}
285
286bool containsKanaLetters(const String& pattern)
287{
288    const unsigned length = pattern.length();
289    for (unsigned i = 0; i < length; ++i) {
290        if (isKanaLetter(pattern[i]))
291            return true;
292    }
293    return false;
294}
295
296void normalizeCharactersIntoNFCForm(const UChar* characters, unsigned length, Vector<UChar>& buffer)
297{
298    ASSERT(length);
299
300    buffer.resize(length);
301
302    UErrorCode status = U_ZERO_ERROR;
303    size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
304    ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
305    ASSERT(bufferSize);
306
307    buffer.resize(bufferSize);
308
309    if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
310        return;
311
312    status = U_ZERO_ERROR;
313    unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
314    ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
315}
316
317// This function returns kNotFound if |first| and |second| contain different Kana letters.
318// If |first| and |second| contain the same Kana letter
319// then function returns offset in characters from |first|.
320// Pointers to both strings increase simultaneously so so it is possible to use one offset value.
321static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(const UChar* first, const UChar* firstEnd, const UChar* second, const UChar* secondEnd)
322{
323    const UChar* start = first;
324    // Check for differences in the kana letter character itself.
325    if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second))
326        return kNotFound;
327    if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second))
328        return kNotFound;
329    ++first;
330    ++second;
331
332    // Check for differences in combining voiced sound marks found after the letter.
333    while (true) {
334        const bool secondIsNotSoundMark = second == secondEnd || !isCombiningVoicedSoundMark(*second);
335        if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) {
336            return secondIsNotSoundMark ? first - start : kNotFound;
337        }
338        if (secondIsNotSoundMark)
339            return kNotFound;
340        if (*first != *second)
341            return kNotFound;
342        ++first;
343        ++second;
344    }
345}
346
347bool checkOnlyKanaLettersInStrings(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength)
348{
349    const UChar* a = firstData;
350    const UChar* aEnd = firstData + firstLength;
351
352    const UChar* b = secondData;
353    const UChar* bEnd = secondData + secondLength;
354    while (true) {
355        // Skip runs of non-kana-letter characters. This is necessary so we can
356        // correctly handle strings where the |firstData| and |secondData| have different-length
357        // runs of characters that match, while still double checking the correctness
358        // of matches of kana letters with other kana letters.
359        while (a != aEnd && !isKanaLetter(*a))
360            ++a;
361        while (b != bEnd && !isKanaLetter(*b))
362            ++b;
363
364        // If we reached the end of either the target or the match, we should have
365        // reached the end of both; both should have the same number of kana letters.
366        if (a == aEnd || b == bEnd) {
367            return a == aEnd && b == bEnd;
368        }
369
370        // Check that single Kana letters in |a| and |b| are the same.
371        const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
372        if (offset == kNotFound)
373            return false;
374
375        // Update values of |a| and |b| after comparing.
376        a += offset;
377        b += offset;
378    }
379}
380
381bool checkKanaStringsEqual(const UChar* firstData, unsigned firstLength, const UChar* secondData, unsigned secondLength)
382{
383    const UChar* a = firstData;
384    const UChar* aEnd = firstData + firstLength;
385
386    const UChar* b = secondData;
387    const UChar* bEnd = secondData + secondLength;
388    while (true) {
389        // Check for non-kana-letter characters.
390        while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) {
391            if (*a++ != *b++)
392                return false;
393        }
394
395        // If we reached the end of either the target or the match, we should have
396        // reached the end of both; both should have the same number of kana letters.
397        if (a == aEnd || b == bEnd) {
398            return a == aEnd && b == bEnd;
399        }
400
401        if (isKanaLetter(*a) != isKanaLetter(*b))
402            return false;
403
404        // Check that single Kana letters in |a| and |b| are the same.
405        const size_t offset = compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
406        if (offset == kNotFound)
407            return false;
408
409        // Update values of |a| and |b| after comparing.
410        a += offset;
411        b += offset;
412    }
413}
414
415}
416