15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)#include "ui/gfx/utf16_indexing.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/third_party/icu/icu_utf.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)namespace gfx { 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool IsValidCodePointIndex(const base::string16& s, size_t index) { 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return index == 0 || index == s.length() || 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) !(CBU16_IS_TRAIL(s[index]) && CBU16_IS_LEAD(s[index - 1])); 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)ptrdiff_t UTF16IndexToOffset(const base::string16& s, size_t base, size_t pos) { 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The indices point between UTF-16 words (range 0 to s.length() inclusive). 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In order to consistently handle indices that point to the middle of a 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // surrogate pair, we count the first word in that surrogate pair and not 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the second. The test "s[i] is not the second half of a surrogate pair" is 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "IsValidCodePointIndex(s, i)". 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(base, s.length()); 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(pos, s.length()); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ptrdiff_t delta = 0; 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (base < pos) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delta += IsValidCodePointIndex(s, base++) ? 1 : 0; 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (pos < base) 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delta -= IsValidCodePointIndex(s, pos++) ? 1 : 0; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return delta; 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 33a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)size_t UTF16OffsetToIndex(const base::string16& s, 34a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) size_t base, 35a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) ptrdiff_t offset) { 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(base, s.length()); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // As in UTF16IndexToOffset, we count the first half of a surrogate pair, not 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the second. When stepping from pos to pos+1 we check s[pos:pos+1] == s[pos] 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (Python syntax), hence pos++. When stepping from pos to pos-1 we check 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // s[pos-1], hence --pos. 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t pos = base; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (offset > 0 && pos < s.length()) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) offset -= IsValidCodePointIndex(s, pos++) ? 1 : 0; 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (offset < 0 && pos > 0) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) offset += IsValidCodePointIndex(s, --pos) ? 1 : 0; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If offset != 0 then we ran off the edge of the string, which is a contract 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // violation but is handled anyway (by clamping) in release for safety. 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_EQ(offset, 0); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Since the second half of a surrogate pair has "length" zero, there is an 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ambiguity in the returned position. Resolve it by always returning a valid 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // index. 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!IsValidCodePointIndex(s, pos)) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++pos; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return pos; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 57d0247b1b59f9c528cb6df88b4f2b9afaf80d181eTorne (Richard Coles)} // namespace gfx 58