1/* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "descriptors_names.h" 18 19#include "android-base/stringprintf.h" 20#include "android-base/strings.h" 21 22#include "dex/utf-inl.h" 23 24namespace art { 25 26using android::base::StringAppendF; 27using android::base::StringPrintf; 28 29void AppendPrettyDescriptor(const char* descriptor, std::string* result) { 30 // Count the number of '['s to get the dimensionality. 31 const char* c = descriptor; 32 size_t dim = 0; 33 while (*c == '[') { 34 dim++; 35 c++; 36 } 37 38 // Reference or primitive? 39 if (*c == 'L') { 40 // "[[La/b/C;" -> "a.b.C[][]". 41 c++; // Skip the 'L'. 42 } else { 43 // "[[B" -> "byte[][]". 44 // To make life easier, we make primitives look like unqualified 45 // reference types. 46 switch (*c) { 47 case 'B': c = "byte;"; break; 48 case 'C': c = "char;"; break; 49 case 'D': c = "double;"; break; 50 case 'F': c = "float;"; break; 51 case 'I': c = "int;"; break; 52 case 'J': c = "long;"; break; 53 case 'S': c = "short;"; break; 54 case 'Z': c = "boolean;"; break; 55 case 'V': c = "void;"; break; // Used when decoding return types. 56 default: result->append(descriptor); return; 57 } 58 } 59 60 // At this point, 'c' is a string of the form "fully/qualified/Type;" 61 // or "primitive;". Rewrite the type with '.' instead of '/': 62 const char* p = c; 63 while (*p != ';') { 64 char ch = *p++; 65 if (ch == '/') { 66 ch = '.'; 67 } 68 result->push_back(ch); 69 } 70 // ...and replace the semicolon with 'dim' "[]" pairs: 71 for (size_t i = 0; i < dim; ++i) { 72 result->append("[]"); 73 } 74} 75 76std::string PrettyDescriptor(const char* descriptor) { 77 std::string result; 78 AppendPrettyDescriptor(descriptor, &result); 79 return result; 80} 81 82std::string GetJniShortName(const std::string& class_descriptor, const std::string& method) { 83 // Remove the leading 'L' and trailing ';'... 84 std::string class_name(class_descriptor); 85 CHECK_EQ(class_name[0], 'L') << class_name; 86 CHECK_EQ(class_name[class_name.size() - 1], ';') << class_name; 87 class_name.erase(0, 1); 88 class_name.erase(class_name.size() - 1, 1); 89 90 std::string short_name; 91 short_name += "Java_"; 92 short_name += MangleForJni(class_name); 93 short_name += "_"; 94 short_name += MangleForJni(method); 95 return short_name; 96} 97 98// See http://java.sun.com/j2se/1.5.0/docs/guide/jni/spec/design.html#wp615 for the full rules. 99std::string MangleForJni(const std::string& s) { 100 std::string result; 101 size_t char_count = CountModifiedUtf8Chars(s.c_str()); 102 const char* cp = &s[0]; 103 for (size_t i = 0; i < char_count; ++i) { 104 uint32_t ch = GetUtf16FromUtf8(&cp); 105 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) { 106 result.push_back(ch); 107 } else if (ch == '.' || ch == '/') { 108 result += "_"; 109 } else if (ch == '_') { 110 result += "_1"; 111 } else if (ch == ';') { 112 result += "_2"; 113 } else if (ch == '[') { 114 result += "_3"; 115 } else { 116 const uint16_t leading = GetLeadingUtf16Char(ch); 117 const uint32_t trailing = GetTrailingUtf16Char(ch); 118 119 StringAppendF(&result, "_0%04x", leading); 120 if (trailing != 0) { 121 StringAppendF(&result, "_0%04x", trailing); 122 } 123 } 124 } 125 return result; 126} 127 128std::string DotToDescriptor(const char* class_name) { 129 std::string descriptor(class_name); 130 std::replace(descriptor.begin(), descriptor.end(), '.', '/'); 131 if (descriptor.length() > 0 && descriptor[0] != '[') { 132 descriptor = "L" + descriptor + ";"; 133 } 134 return descriptor; 135} 136 137std::string DescriptorToDot(const char* descriptor) { 138 size_t length = strlen(descriptor); 139 if (length > 1) { 140 if (descriptor[0] == 'L' && descriptor[length - 1] == ';') { 141 // Descriptors have the leading 'L' and trailing ';' stripped. 142 std::string result(descriptor + 1, length - 2); 143 std::replace(result.begin(), result.end(), '/', '.'); 144 return result; 145 } else { 146 // For arrays the 'L' and ';' remain intact. 147 std::string result(descriptor); 148 std::replace(result.begin(), result.end(), '/', '.'); 149 return result; 150 } 151 } 152 // Do nothing for non-class/array descriptors. 153 return descriptor; 154} 155 156std::string DescriptorToName(const char* descriptor) { 157 size_t length = strlen(descriptor); 158 if (descriptor[0] == 'L' && descriptor[length - 1] == ';') { 159 std::string result(descriptor + 1, length - 2); 160 return result; 161 } 162 return descriptor; 163} 164 165// Helper for IsValidPartOfMemberNameUtf8(), a bit vector indicating valid low ascii. 166static uint32_t DEX_MEMBER_VALID_LOW_ASCII[4] = { 167 0x00000000, // 00..1f low control characters; nothing valid 168 0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-' 169 0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_' 170 0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z' 171}; 172 173// Helper for IsValidPartOfMemberNameUtf8(); do not call directly. 174static bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { 175 /* 176 * It's a multibyte encoded character. Decode it and analyze. We 177 * accept anything that isn't (a) an improperly encoded low value, 178 * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high 179 * control character, or (e) a high space, layout, or special 180 * character (U+00a0, U+2000..U+200f, U+2028..U+202f, 181 * U+fff0..U+ffff). This is all specified in the dex format 182 * document. 183 */ 184 185 const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); 186 const uint16_t leading = GetLeadingUtf16Char(pair); 187 188 // We have a surrogate pair resulting from a valid 4 byte UTF sequence. 189 // No further checks are necessary because 4 byte sequences span code 190 // points [U+10000, U+1FFFFF], which are valid codepoints in a dex 191 // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of 192 // the surrogate halves are valid and well formed in this instance. 193 if (GetTrailingUtf16Char(pair) != 0) { 194 return true; 195 } 196 197 198 // We've encountered a one, two or three byte UTF-8 sequence. The 199 // three byte UTF-8 sequence could be one half of a surrogate pair. 200 switch (leading >> 8) { 201 case 0x00: 202 // It's only valid if it's above the ISO-8859-1 high space (0xa0). 203 return (leading > 0x00a0); 204 case 0xd8: 205 case 0xd9: 206 case 0xda: 207 case 0xdb: 208 { 209 // We found a three byte sequence encoding one half of a surrogate. 210 // Look for the other half. 211 const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr); 212 const uint16_t trailing = GetLeadingUtf16Char(pair2); 213 214 return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff); 215 } 216 case 0xdc: 217 case 0xdd: 218 case 0xde: 219 case 0xdf: 220 // It's a trailing surrogate, which is not valid at this point. 221 return false; 222 case 0x20: 223 case 0xff: 224 // It's in the range that has spaces, controls, and specials. 225 switch (leading & 0xfff8) { 226 case 0x2000: 227 case 0x2008: 228 case 0x2028: 229 case 0xfff0: 230 case 0xfff8: 231 return false; 232 } 233 return true; 234 default: 235 return true; 236 } 237 238 UNREACHABLE(); 239} 240 241/* Return whether the pointed-at modified-UTF-8 encoded character is 242 * valid as part of a member name, updating the pointer to point past 243 * the consumed character. This will consume two encoded UTF-16 code 244 * points if the character is encoded as a surrogate pair. Also, if 245 * this function returns false, then the given pointer may only have 246 * been partially advanced. 247 */ 248static bool IsValidPartOfMemberNameUtf8(const char** pUtf8Ptr) { 249 uint8_t c = (uint8_t) **pUtf8Ptr; 250 if (LIKELY(c <= 0x7f)) { 251 // It's low-ascii, so check the table. 252 uint32_t wordIdx = c >> 5; 253 uint32_t bitIdx = c & 0x1f; 254 (*pUtf8Ptr)++; 255 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0; 256 } 257 258 // It's a multibyte encoded character. Call a non-inline function 259 // for the heavy lifting. 260 return IsValidPartOfMemberNameUtf8Slow(pUtf8Ptr); 261} 262 263bool IsValidMemberName(const char* s) { 264 bool angle_name = false; 265 266 switch (*s) { 267 case '\0': 268 // The empty string is not a valid name. 269 return false; 270 case '<': 271 angle_name = true; 272 s++; 273 break; 274 } 275 276 while (true) { 277 switch (*s) { 278 case '\0': 279 return !angle_name; 280 case '>': 281 return angle_name && s[1] == '\0'; 282 } 283 284 if (!IsValidPartOfMemberNameUtf8(&s)) { 285 return false; 286 } 287 } 288} 289 290enum ClassNameType { kName, kDescriptor }; 291template<ClassNameType kType, char kSeparator> 292static bool IsValidClassName(const char* s) { 293 int arrayCount = 0; 294 while (*s == '[') { 295 arrayCount++; 296 s++; 297 } 298 299 if (arrayCount > 255) { 300 // Arrays may have no more than 255 dimensions. 301 return false; 302 } 303 304 ClassNameType type = kType; 305 if (type != kDescriptor && arrayCount != 0) { 306 /* 307 * If we're looking at an array of some sort, then it doesn't 308 * matter if what is being asked for is a class name; the 309 * format looks the same as a type descriptor in that case, so 310 * treat it as such. 311 */ 312 type = kDescriptor; 313 } 314 315 if (type == kDescriptor) { 316 /* 317 * We are looking for a descriptor. Either validate it as a 318 * single-character primitive type, or continue on to check the 319 * embedded class name (bracketed by "L" and ";"). 320 */ 321 switch (*(s++)) { 322 case 'B': 323 case 'C': 324 case 'D': 325 case 'F': 326 case 'I': 327 case 'J': 328 case 'S': 329 case 'Z': 330 // These are all single-character descriptors for primitive types. 331 return (*s == '\0'); 332 case 'V': 333 // Non-array void is valid, but you can't have an array of void. 334 return (arrayCount == 0) && (*s == '\0'); 335 case 'L': 336 // Class name: Break out and continue below. 337 break; 338 default: 339 // Oddball descriptor character. 340 return false; 341 } 342 } 343 344 /* 345 * We just consumed the 'L' that introduces a class name as part 346 * of a type descriptor, or we are looking for an unadorned class 347 * name. 348 */ 349 350 bool sepOrFirst = true; // first character or just encountered a separator. 351 for (;;) { 352 uint8_t c = (uint8_t) *s; 353 switch (c) { 354 case '\0': 355 /* 356 * Premature end for a type descriptor, but valid for 357 * a class name as long as we haven't encountered an 358 * empty component (including the degenerate case of 359 * the empty string ""). 360 */ 361 return (type == kName) && !sepOrFirst; 362 case ';': 363 /* 364 * Invalid character for a class name, but the 365 * legitimate end of a type descriptor. In the latter 366 * case, make sure that this is the end of the string 367 * and that it doesn't end with an empty component 368 * (including the degenerate case of "L;"). 369 */ 370 return (type == kDescriptor) && !sepOrFirst && (s[1] == '\0'); 371 case '/': 372 case '.': 373 if (c != kSeparator) { 374 // The wrong separator character. 375 return false; 376 } 377 if (sepOrFirst) { 378 // Separator at start or two separators in a row. 379 return false; 380 } 381 sepOrFirst = true; 382 s++; 383 break; 384 default: 385 if (!IsValidPartOfMemberNameUtf8(&s)) { 386 return false; 387 } 388 sepOrFirst = false; 389 break; 390 } 391 } 392} 393 394bool IsValidBinaryClassName(const char* s) { 395 return IsValidClassName<kName, '.'>(s); 396} 397 398bool IsValidJniClassName(const char* s) { 399 return IsValidClassName<kName, '/'>(s); 400} 401 402bool IsValidDescriptor(const char* s) { 403 return IsValidClassName<kDescriptor, '/'>(s); 404} 405 406void Split(const std::string& s, char separator, std::vector<std::string>* result) { 407 const char* p = s.data(); 408 const char* end = p + s.size(); 409 while (p != end) { 410 if (*p == separator) { 411 ++p; 412 } else { 413 const char* start = p; 414 while (++p != end && *p != separator) { 415 // Skip to the next occurrence of the separator. 416 } 417 result->push_back(std::string(start, p - start)); 418 } 419 } 420} 421 422std::string PrettyDescriptor(Primitive::Type type) { 423 return PrettyDescriptor(Primitive::Descriptor(type)); 424} 425 426} // namespace art 427