1/* 2 * Copyright (C) 2005 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <utils/String8.h> 18 19#include <utils/Log.h> 20#include <utils/String16.h> 21#include <utils/TextOutput.h> 22#include <utils/threads.h> 23 24#include <private/utils/Static.h> 25 26#include <ctype.h> 27 28/* 29 * Functions outside android is below the namespace android, since they use 30 * functions and constants in android namespace. 31 */ 32 33// --------------------------------------------------------------------------- 34 35namespace android { 36 37static const char32_t kByteMask = 0x000000BF; 38static const char32_t kByteMark = 0x00000080; 39 40// Surrogates aren't valid for UTF-32 characters, so define some 41// constants that will let us screen them out. 42static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; 43static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; 44static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; 45static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; 46static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; 47static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; 48static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; 49 50// Mask used to set appropriate bits in first byte of UTF-8 sequence, 51// indexed by number of bytes in the sequence. 52// 0xxxxxxx 53// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 54// 110yyyyx 10xxxxxx 55// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 56// 1110yyyy 10yxxxxx 10xxxxxx 57// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 58// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 59// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 60static const char32_t kFirstByteMark[] = { 61 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 62}; 63 64// Separator used by resource paths. This is not platform dependent contrary 65// to OS_PATH_SEPARATOR. 66#define RES_PATH_SEPARATOR '/' 67 68// Return number of utf8 bytes required for the character. 69static size_t utf32_to_utf8_bytes(char32_t srcChar) 70{ 71 size_t bytesToWrite; 72 73 // Figure out how many bytes the result will require. 74 if (srcChar < 0x00000080) 75 { 76 bytesToWrite = 1; 77 } 78 else if (srcChar < 0x00000800) 79 { 80 bytesToWrite = 2; 81 } 82 else if (srcChar < 0x00010000) 83 { 84 if ((srcChar < kUnicodeSurrogateStart) 85 || (srcChar > kUnicodeSurrogateEnd)) 86 { 87 bytesToWrite = 3; 88 } 89 else 90 { 91 // Surrogates are invalid UTF-32 characters. 92 return 0; 93 } 94 } 95 // Max code point for Unicode is 0x0010FFFF. 96 else if (srcChar <= kUnicodeMaxCodepoint) 97 { 98 bytesToWrite = 4; 99 } 100 else 101 { 102 // Invalid UTF-32 character. 103 return 0; 104 } 105 106 return bytesToWrite; 107} 108 109// Write out the source character to <dstP>. 110 111static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) 112{ 113 dstP += bytes; 114 switch (bytes) 115 { /* note: everything falls through. */ 116 case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 117 case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 118 case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; 119 case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); 120 } 121} 122 123// --------------------------------------------------------------------------- 124 125static SharedBuffer* gEmptyStringBuf = NULL; 126static char* gEmptyString = NULL; 127 128extern int gDarwinCantLoadAllObjects; 129int gDarwinIsReallyAnnoying; 130 131static inline char* getEmptyString() 132{ 133 gEmptyStringBuf->acquire(); 134 return gEmptyString; 135} 136 137void initialize_string8() 138{ 139#ifdef LIBUTILS_NATIVE 140 // Bite me, Darwin! 141 gDarwinIsReallyAnnoying = gDarwinCantLoadAllObjects; 142#endif 143 144 SharedBuffer* buf = SharedBuffer::alloc(1); 145 char* str = (char*)buf->data(); 146 *str = 0; 147 gEmptyStringBuf = buf; 148 gEmptyString = str; 149} 150 151void terminate_string8() 152{ 153 SharedBuffer::bufferFromData(gEmptyString)->release(); 154 gEmptyStringBuf = NULL; 155 gEmptyString = NULL; 156} 157 158// --------------------------------------------------------------------------- 159 160static char* allocFromUTF8(const char* in, size_t len) 161{ 162 if (len > 0) { 163 SharedBuffer* buf = SharedBuffer::alloc(len+1); 164 LOG_ASSERT(buf, "Unable to allocate shared buffer"); 165 if (buf) { 166 char* str = (char*)buf->data(); 167 memcpy(str, in, len); 168 str[len] = 0; 169 return str; 170 } 171 return NULL; 172 } 173 174 return getEmptyString(); 175} 176 177template<typename T, typename L> 178static char* allocFromUTF16OrUTF32(const T* in, L len) 179{ 180 if (len == 0) return getEmptyString(); 181 182 size_t bytes = 0; 183 const T* end = in+len; 184 const T* p = in; 185 186 while (p < end) { 187 bytes += utf32_to_utf8_bytes(*p); 188 p++; 189 } 190 191 SharedBuffer* buf = SharedBuffer::alloc(bytes+1); 192 LOG_ASSERT(buf, "Unable to allocate shared buffer"); 193 if (buf) { 194 p = in; 195 char* str = (char*)buf->data(); 196 char* d = str; 197 while (p < end) { 198 const T c = *p++; 199 size_t len = utf32_to_utf8_bytes(c); 200 utf32_to_utf8((uint8_t*)d, c, len); 201 d += len; 202 } 203 *d = 0; 204 205 return str; 206 } 207 208 return getEmptyString(); 209} 210 211// Note: not dealing with expanding surrogate pairs. 212static char* allocFromUTF16(const char16_t* in, size_t len) 213{ 214 return allocFromUTF16OrUTF32<char16_t, size_t>(in, len); 215} 216 217static char* allocFromUTF32(const char32_t* in, size_t len) 218{ 219 return allocFromUTF16OrUTF32<char32_t, size_t>(in, len); 220} 221 222// --------------------------------------------------------------------------- 223 224String8::String8() 225 : mString(getEmptyString()) 226{ 227} 228 229String8::String8(const String8& o) 230 : mString(o.mString) 231{ 232 SharedBuffer::bufferFromData(mString)->acquire(); 233} 234 235String8::String8(const char* o) 236 : mString(allocFromUTF8(o, strlen(o))) 237{ 238 if (mString == NULL) { 239 mString = getEmptyString(); 240 } 241} 242 243String8::String8(const char* o, size_t len) 244 : mString(allocFromUTF8(o, len)) 245{ 246 if (mString == NULL) { 247 mString = getEmptyString(); 248 } 249} 250 251String8::String8(const String16& o) 252 : mString(allocFromUTF16(o.string(), o.size())) 253{ 254} 255 256String8::String8(const char16_t* o) 257 : mString(allocFromUTF16(o, strlen16(o))) 258{ 259} 260 261String8::String8(const char16_t* o, size_t len) 262 : mString(allocFromUTF16(o, len)) 263{ 264} 265 266String8::String8(const char32_t* o) 267 : mString(allocFromUTF32(o, strlen32(o))) 268{ 269} 270 271String8::String8(const char32_t* o, size_t len) 272 : mString(allocFromUTF32(o, len)) 273{ 274} 275 276String8::~String8() 277{ 278 SharedBuffer::bufferFromData(mString)->release(); 279} 280 281void String8::setTo(const String8& other) 282{ 283 SharedBuffer::bufferFromData(other.mString)->acquire(); 284 SharedBuffer::bufferFromData(mString)->release(); 285 mString = other.mString; 286} 287 288status_t String8::setTo(const char* other) 289{ 290 SharedBuffer::bufferFromData(mString)->release(); 291 mString = allocFromUTF8(other, strlen(other)); 292 if (mString) return NO_ERROR; 293 294 mString = getEmptyString(); 295 return NO_MEMORY; 296} 297 298status_t String8::setTo(const char* other, size_t len) 299{ 300 SharedBuffer::bufferFromData(mString)->release(); 301 mString = allocFromUTF8(other, len); 302 if (mString) return NO_ERROR; 303 304 mString = getEmptyString(); 305 return NO_MEMORY; 306} 307 308status_t String8::setTo(const char16_t* other, size_t len) 309{ 310 SharedBuffer::bufferFromData(mString)->release(); 311 mString = allocFromUTF16(other, len); 312 if (mString) return NO_ERROR; 313 314 mString = getEmptyString(); 315 return NO_MEMORY; 316} 317 318status_t String8::setTo(const char32_t* other, size_t len) 319{ 320 SharedBuffer::bufferFromData(mString)->release(); 321 mString = allocFromUTF32(other, len); 322 if (mString) return NO_ERROR; 323 324 mString = getEmptyString(); 325 return NO_MEMORY; 326} 327 328status_t String8::append(const String8& other) 329{ 330 const size_t otherLen = other.bytes(); 331 if (bytes() == 0) { 332 setTo(other); 333 return NO_ERROR; 334 } else if (otherLen == 0) { 335 return NO_ERROR; 336 } 337 338 return real_append(other.string(), otherLen); 339} 340 341status_t String8::append(const char* other) 342{ 343 return append(other, strlen(other)); 344} 345 346status_t String8::append(const char* other, size_t otherLen) 347{ 348 if (bytes() == 0) { 349 return setTo(other, otherLen); 350 } else if (otherLen == 0) { 351 return NO_ERROR; 352 } 353 354 return real_append(other, otherLen); 355} 356 357status_t String8::real_append(const char* other, size_t otherLen) 358{ 359 const size_t myLen = bytes(); 360 361 SharedBuffer* buf = SharedBuffer::bufferFromData(mString) 362 ->editResize(myLen+otherLen+1); 363 if (buf) { 364 char* str = (char*)buf->data(); 365 mString = str; 366 str += myLen; 367 memcpy(str, other, otherLen); 368 str[otherLen] = '\0'; 369 return NO_ERROR; 370 } 371 return NO_MEMORY; 372} 373 374char* String8::lockBuffer(size_t size) 375{ 376 SharedBuffer* buf = SharedBuffer::bufferFromData(mString) 377 ->editResize(size+1); 378 if (buf) { 379 char* str = (char*)buf->data(); 380 mString = str; 381 return str; 382 } 383 return NULL; 384} 385 386void String8::unlockBuffer() 387{ 388 unlockBuffer(strlen(mString)); 389} 390 391status_t String8::unlockBuffer(size_t size) 392{ 393 if (size != this->size()) { 394 SharedBuffer* buf = SharedBuffer::bufferFromData(mString) 395 ->editResize(size+1); 396 if (buf) { 397 char* str = (char*)buf->data(); 398 str[size] = 0; 399 mString = str; 400 return NO_ERROR; 401 } 402 } 403 404 return NO_MEMORY; 405} 406 407ssize_t String8::find(const char* other, size_t start) const 408{ 409 size_t len = size(); 410 if (start >= len) { 411 return -1; 412 } 413 const char* s = mString+start; 414 const char* p = strstr(s, other); 415 return p ? p-mString : -1; 416} 417 418void String8::toLower() 419{ 420 toLower(0, size()); 421} 422 423void String8::toLower(size_t start, size_t length) 424{ 425 const size_t len = size(); 426 if (start >= len) { 427 return; 428 } 429 if (start+length > len) { 430 length = len-start; 431 } 432 char* buf = lockBuffer(len); 433 buf += start; 434 while (length > 0) { 435 *buf = tolower(*buf); 436 buf++; 437 length--; 438 } 439 unlockBuffer(len); 440} 441 442void String8::toUpper() 443{ 444 toUpper(0, size()); 445} 446 447void String8::toUpper(size_t start, size_t length) 448{ 449 const size_t len = size(); 450 if (start >= len) { 451 return; 452 } 453 if (start+length > len) { 454 length = len-start; 455 } 456 char* buf = lockBuffer(len); 457 buf += start; 458 while (length > 0) { 459 *buf = toupper(*buf); 460 buf++; 461 length--; 462 } 463 unlockBuffer(len); 464} 465 466size_t String8::getUtf32Length() const 467{ 468 return utf32_length(mString, length()); 469} 470 471int32_t String8::getUtf32At(size_t index, size_t *next_index) const 472{ 473 return utf32_at(mString, length(), index, next_index); 474} 475 476size_t String8::getUtf32(char32_t* dst, size_t dst_len) const 477{ 478 return utf8_to_utf32(mString, length(), dst, dst_len); 479} 480 481TextOutput& operator<<(TextOutput& to, const String8& val) 482{ 483 to << val.string(); 484 return to; 485} 486 487// --------------------------------------------------------------------------- 488// Path functions 489 490void String8::setPathName(const char* name) 491{ 492 setPathName(name, strlen(name)); 493} 494 495void String8::setPathName(const char* name, size_t len) 496{ 497 char* buf = lockBuffer(len); 498 499 memcpy(buf, name, len); 500 501 // remove trailing path separator, if present 502 if (len > 0 && buf[len-1] == OS_PATH_SEPARATOR) 503 len--; 504 505 buf[len] = '\0'; 506 507 unlockBuffer(len); 508} 509 510String8 String8::getPathLeaf(void) const 511{ 512 const char* cp; 513 const char*const buf = mString; 514 515 cp = strrchr(buf, OS_PATH_SEPARATOR); 516 if (cp == NULL) 517 return String8(*this); 518 else 519 return String8(cp+1); 520} 521 522String8 String8::getPathDir(void) const 523{ 524 const char* cp; 525 const char*const str = mString; 526 527 cp = strrchr(str, OS_PATH_SEPARATOR); 528 if (cp == NULL) 529 return String8(""); 530 else 531 return String8(str, cp - str); 532} 533 534String8 String8::walkPath(String8* outRemains) const 535{ 536 const char* cp; 537 const char*const str = mString; 538 const char* buf = str; 539 540 cp = strchr(buf, OS_PATH_SEPARATOR); 541 if (cp == buf) { 542 // don't include a leading '/'. 543 buf = buf+1; 544 cp = strchr(buf, OS_PATH_SEPARATOR); 545 } 546 547 if (cp == NULL) { 548 String8 res = buf != str ? String8(buf) : *this; 549 if (outRemains) *outRemains = String8(""); 550 return res; 551 } 552 553 String8 res(buf, cp-buf); 554 if (outRemains) *outRemains = String8(cp+1); 555 return res; 556} 557 558/* 559 * Helper function for finding the start of an extension in a pathname. 560 * 561 * Returns a pointer inside mString, or NULL if no extension was found. 562 */ 563char* String8::find_extension(void) const 564{ 565 const char* lastSlash; 566 const char* lastDot; 567 int extLen; 568 const char* const str = mString; 569 570 // only look at the filename 571 lastSlash = strrchr(str, OS_PATH_SEPARATOR); 572 if (lastSlash == NULL) 573 lastSlash = str; 574 else 575 lastSlash++; 576 577 // find the last dot 578 lastDot = strrchr(lastSlash, '.'); 579 if (lastDot == NULL) 580 return NULL; 581 582 // looks good, ship it 583 return const_cast<char*>(lastDot); 584} 585 586String8 String8::getPathExtension(void) const 587{ 588 char* ext; 589 590 ext = find_extension(); 591 if (ext != NULL) 592 return String8(ext); 593 else 594 return String8(""); 595} 596 597String8 String8::getBasePath(void) const 598{ 599 char* ext; 600 const char* const str = mString; 601 602 ext = find_extension(); 603 if (ext == NULL) 604 return String8(*this); 605 else 606 return String8(str, ext - str); 607} 608 609String8& String8::appendPath(const char* name) 610{ 611 // TODO: The test below will fail for Win32 paths. Fix later or ignore. 612 if (name[0] != OS_PATH_SEPARATOR) { 613 if (*name == '\0') { 614 // nothing to do 615 return *this; 616 } 617 618 size_t len = length(); 619 if (len == 0) { 620 // no existing filename, just use the new one 621 setPathName(name); 622 return *this; 623 } 624 625 // make room for oldPath + '/' + newPath 626 int newlen = strlen(name); 627 628 char* buf = lockBuffer(len+1+newlen); 629 630 // insert a '/' if needed 631 if (buf[len-1] != OS_PATH_SEPARATOR) 632 buf[len++] = OS_PATH_SEPARATOR; 633 634 memcpy(buf+len, name, newlen+1); 635 len += newlen; 636 637 unlockBuffer(len); 638 639 return *this; 640 } else { 641 setPathName(name); 642 return *this; 643 } 644} 645 646String8& String8::convertToResPath() 647{ 648#if OS_PATH_SEPARATOR != RES_PATH_SEPARATOR 649 size_t len = length(); 650 if (len > 0) { 651 char * buf = lockBuffer(len); 652 for (char * end = buf + len; buf < end; ++buf) { 653 if (*buf == OS_PATH_SEPARATOR) 654 *buf = RES_PATH_SEPARATOR; 655 } 656 unlockBuffer(len); 657 } 658#endif 659 return *this; 660} 661 662}; // namespace android 663 664// --------------------------------------------------------------------------- 665 666size_t strlen32(const char32_t *s) 667{ 668 const char32_t *ss = s; 669 while ( *ss ) 670 ss++; 671 return ss-s; 672} 673 674size_t strnlen32(const char32_t *s, size_t maxlen) 675{ 676 const char32_t *ss = s; 677 while ((maxlen > 0) && *ss) { 678 ss++; 679 maxlen--; 680 } 681 return ss-s; 682} 683 684size_t utf8_length(const char *src) 685{ 686 const char *cur = src; 687 size_t ret = 0; 688 while (*cur != '\0') { 689 const char first_char = *cur++; 690 if ((first_char & 0x80) == 0) { // ASCII 691 ret += 1; 692 continue; 693 } 694 // (UTF-8's character must not be like 10xxxxxx, 695 // but 110xxxxx, 1110xxxx, ... or 1111110x) 696 if ((first_char & 0x40) == 0) { 697 return 0; 698 } 699 700 int32_t mask, to_ignore_mask; 701 size_t num_to_read = 0; 702 char32_t utf32 = 0; 703 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; 704 num_to_read < 5 && (first_char & mask); 705 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 706 if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx 707 return 0; 708 } 709 // 0x3F == 00111111 710 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 711 } 712 // "first_char" must be (110xxxxx - 11110xxx) 713 if (num_to_read == 5) { 714 return 0; 715 } 716 to_ignore_mask |= mask; 717 utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); 718 if (utf32 > android::kUnicodeMaxCodepoint) { 719 return 0; 720 } 721 722 ret += num_to_read; 723 } 724 return ret; 725} 726 727size_t utf32_length(const char *src, size_t src_len) 728{ 729 if (src == NULL || src_len == 0) { 730 return 0; 731 } 732 size_t ret = 0; 733 const char* cur; 734 const char* end; 735 size_t num_to_skip; 736 for (cur = src, end = src + src_len, num_to_skip = 1; 737 cur < end; 738 cur += num_to_skip, ret++) { 739 const char first_char = *cur; 740 num_to_skip = 1; 741 if ((first_char & 0x80) == 0) { // ASCII 742 continue; 743 } 744 int32_t mask; 745 746 for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { 747 } 748 } 749 return ret; 750} 751 752size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) 753{ 754 if (src == NULL || src_len == 0) { 755 return 0; 756 } 757 size_t ret = 0; 758 const char32_t *end = src + src_len; 759 while (src < end) { 760 ret += android::utf32_to_utf8_bytes(*src++); 761 } 762 return ret; 763} 764 765static int32_t utf32_at_internal(const char* cur, size_t *num_read) 766{ 767 const char first_char = *cur; 768 if ((first_char & 0x80) == 0) { // ASCII 769 *num_read = 1; 770 return *cur; 771 } 772 cur++; 773 char32_t mask, to_ignore_mask; 774 size_t num_to_read = 0; 775 char32_t utf32 = first_char; 776 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; 777 (first_char & mask); 778 num_to_read++, to_ignore_mask |= mask, mask >>= 1) { 779 // 0x3F == 00111111 780 utf32 = (utf32 << 6) + (*cur++ & 0x3F); 781 } 782 to_ignore_mask |= mask; 783 utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); 784 785 *num_read = num_to_read; 786 return static_cast<int32_t>(utf32); 787} 788 789int32_t utf32_at(const char *src, size_t src_len, 790 size_t index, size_t *next_index) 791{ 792 if (index >= src_len) { 793 return -1; 794 } 795 size_t dummy_index; 796 if (next_index == NULL) { 797 next_index = &dummy_index; 798 } 799 size_t num_read; 800 int32_t ret = utf32_at_internal(src + index, &num_read); 801 if (ret >= 0) { 802 *next_index = index + num_read; 803 } 804 805 return ret; 806} 807 808size_t utf8_to_utf32(const char* src, size_t src_len, 809 char32_t* dst, size_t dst_len) 810{ 811 if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { 812 return 0; 813 } 814 815 const char* cur = src; 816 const char* end = src + src_len; 817 char32_t* cur_utf32 = dst; 818 const char32_t* end_utf32 = dst + dst_len; 819 while (cur_utf32 < end_utf32 && cur < end) { 820 size_t num_read; 821 *cur_utf32++ = 822 static_cast<char32_t>(utf32_at_internal(cur, &num_read)); 823 cur += num_read; 824 } 825 if (cur_utf32 < end_utf32) { 826 *cur_utf32 = 0; 827 } 828 return static_cast<size_t>(cur_utf32 - dst); 829} 830 831size_t utf32_to_utf8(const char32_t* src, size_t src_len, 832 char* dst, size_t dst_len) 833{ 834 if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { 835 return 0; 836 } 837 const char32_t *cur_utf32 = src; 838 const char32_t *end_utf32 = src + src_len; 839 char *cur = dst; 840 const char *end = dst + dst_len; 841 while (cur_utf32 < end_utf32 && cur < end) { 842 size_t len = android::utf32_to_utf8_bytes(*cur_utf32); 843 android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len); 844 cur += len; 845 } 846 if (cur < end) { 847 *cur = '\0'; 848 } 849 return cur - dst; 850} 851