1// Copyright 2012 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp-macro-assembler.h" 6 7#include "src/assembler.h" 8#include "src/isolate-inl.h" 9#include "src/regexp/regexp-stack.h" 10#include "src/simulator.h" 11 12#ifdef V8_I18N_SUPPORT 13#include "unicode/uchar.h" 14#endif // V8_I18N_SUPPORT 15 16namespace v8 { 17namespace internal { 18 19RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) 20 : slow_safe_compiler_(false), 21 global_mode_(NOT_GLOBAL), 22 isolate_(isolate), 23 zone_(zone) {} 24 25 26RegExpMacroAssembler::~RegExpMacroAssembler() { 27} 28 29 30int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, 31 Address byte_offset2, 32 size_t byte_length, 33 Isolate* isolate) { 34 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 35 isolate->regexp_macro_assembler_canonicalize(); 36 // This function is not allowed to cause a garbage collection. 37 // A GC might move the calling generated code and invalidate the 38 // return address on the stack. 39 DCHECK(byte_length % 2 == 0); 40 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1); 41 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2); 42 size_t length = byte_length >> 1; 43 44#ifdef V8_I18N_SUPPORT 45 if (isolate == nullptr) { 46 for (size_t i = 0; i < length; i++) { 47 uc32 c1 = substring1[i]; 48 uc32 c2 = substring2[i]; 49 if (unibrow::Utf16::IsLeadSurrogate(c1)) { 50 // Non-BMP characters do not have case-equivalents in the BMP. 51 // Both have to be non-BMP for them to be able to match. 52 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0; 53 if (i + 1 < length) { 54 uc16 c1t = substring1[i + 1]; 55 uc16 c2t = substring2[i + 1]; 56 if (unibrow::Utf16::IsTrailSurrogate(c1t) && 57 unibrow::Utf16::IsTrailSurrogate(c2t)) { 58 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t); 59 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t); 60 i++; 61 } 62 } 63 } 64 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT); 65 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT); 66 if (c1 != c2) return 0; 67 } 68 return 1; 69 } 70#endif // V8_I18N_SUPPORT 71 DCHECK_NOT_NULL(isolate); 72 for (size_t i = 0; i < length; i++) { 73 unibrow::uchar c1 = substring1[i]; 74 unibrow::uchar c2 = substring2[i]; 75 if (c1 != c2) { 76 unibrow::uchar s1[1] = {c1}; 77 canonicalize->get(c1, '\0', s1); 78 if (s1[0] != c2) { 79 unibrow::uchar s2[1] = {c2}; 80 canonicalize->get(c2, '\0', s2); 81 if (s1[0] != s2[0]) { 82 return 0; 83 } 84 } 85 } 86 } 87 return 1; 88} 89 90 91void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, 92 Label* on_failure) { 93 Label ok; 94 // Check that current character is not a trail surrogate. 95 LoadCurrentCharacter(cp_offset, &ok); 96 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok); 97 // Check that previous character is not a lead surrogate. 98 LoadCurrentCharacter(cp_offset - 1, &ok); 99 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure); 100 Bind(&ok); 101} 102 103void RegExpMacroAssembler::CheckPosition(int cp_offset, 104 Label* on_outside_input) { 105 LoadCurrentCharacter(cp_offset, on_outside_input, true); 106} 107 108bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, 109 Label* on_no_match) { 110 return false; 111} 112 113#ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM. 114 115NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, 116 Zone* zone) 117 : RegExpMacroAssembler(isolate, zone) {} 118 119 120NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() { 121} 122 123 124bool NativeRegExpMacroAssembler::CanReadUnaligned() { 125 return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); 126} 127 128const byte* NativeRegExpMacroAssembler::StringCharacterPosition( 129 String* subject, 130 int start_index) { 131 if (subject->IsConsString()) { 132 subject = ConsString::cast(subject)->first(); 133 } else if (subject->IsSlicedString()) { 134 start_index += SlicedString::cast(subject)->offset(); 135 subject = SlicedString::cast(subject)->parent(); 136 } 137 if (subject->IsThinString()) { 138 subject = ThinString::cast(subject)->actual(); 139 } 140 DCHECK(start_index >= 0); 141 DCHECK(start_index <= subject->length()); 142 if (subject->IsSeqOneByteString()) { 143 return reinterpret_cast<const byte*>( 144 SeqOneByteString::cast(subject)->GetChars() + start_index); 145 } else if (subject->IsSeqTwoByteString()) { 146 return reinterpret_cast<const byte*>( 147 SeqTwoByteString::cast(subject)->GetChars() + start_index); 148 } else if (subject->IsExternalOneByteString()) { 149 return reinterpret_cast<const byte*>( 150 ExternalOneByteString::cast(subject)->GetChars() + start_index); 151 } else { 152 DCHECK(subject->IsExternalTwoByteString()); 153 return reinterpret_cast<const byte*>( 154 ExternalTwoByteString::cast(subject)->GetChars() + start_index); 155 } 156} 157 158 159int NativeRegExpMacroAssembler::CheckStackGuardState( 160 Isolate* isolate, int start_index, bool is_direct_call, 161 Address* return_address, Code* re_code, String** subject, 162 const byte** input_start, const byte** input_end) { 163 DCHECK(re_code->instruction_start() <= *return_address); 164 DCHECK(*return_address <= re_code->instruction_end()); 165 int return_value = 0; 166 // Prepare for possible GC. 167 HandleScope handles(isolate); 168 Handle<Code> code_handle(re_code); 169 Handle<String> subject_handle(*subject); 170 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath(); 171 172 StackLimitCheck check(isolate); 173 if (check.JsHasOverflowed()) { 174 isolate->StackOverflow(); 175 return_value = EXCEPTION; 176 } else if (is_direct_call) { 177 // If not real stack overflow the stack guard was used to interrupt 178 // execution for another purpose. If this is a direct call from JavaScript 179 // retry the RegExp forcing the call through the runtime system. 180 // Currently the direct call cannot handle a GC. 181 return_value = RETRY; 182 } else { 183 Object* result = isolate->stack_guard()->HandleInterrupts(); 184 if (result->IsException(isolate)) return_value = EXCEPTION; 185 } 186 187 DisallowHeapAllocation no_gc; 188 189 if (*code_handle != re_code) { // Return address no longer valid 190 intptr_t delta = code_handle->address() - re_code->address(); 191 // Overwrite the return address on the stack. 192 *return_address += delta; 193 } 194 195 // If we continue, we need to update the subject string addresses. 196 if (return_value == 0) { 197 // String encoding might have changed. 198 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) { 199 // If we changed between an LATIN1 and an UC16 string, the specialized 200 // code cannot be used, and we need to restart regexp matching from 201 // scratch (including, potentially, compiling a new version of the code). 202 return_value = RETRY; 203 } else { 204 *subject = *subject_handle; 205 intptr_t byte_length = *input_end - *input_start; 206 *input_start = StringCharacterPosition(*subject, start_index); 207 *input_end = *input_start + byte_length; 208 } 209 } 210 return return_value; 211} 212 213 214NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match( 215 Handle<Code> regexp_code, 216 Handle<String> subject, 217 int* offsets_vector, 218 int offsets_vector_length, 219 int previous_index, 220 Isolate* isolate) { 221 222 DCHECK(subject->IsFlat()); 223 DCHECK(previous_index >= 0); 224 DCHECK(previous_index <= subject->length()); 225 226 // No allocations before calling the regexp, but we can't use 227 // DisallowHeapAllocation, since regexps might be preempted, and another 228 // thread might do allocation anyway. 229 230 String* subject_ptr = *subject; 231 // Character offsets into string. 232 int start_offset = previous_index; 233 int char_length = subject_ptr->length() - start_offset; 234 int slice_offset = 0; 235 236 // The string has been flattened, so if it is a cons string it contains the 237 // full string in the first part. 238 if (StringShape(subject_ptr).IsCons()) { 239 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length()); 240 subject_ptr = ConsString::cast(subject_ptr)->first(); 241 } else if (StringShape(subject_ptr).IsSliced()) { 242 SlicedString* slice = SlicedString::cast(subject_ptr); 243 subject_ptr = slice->parent(); 244 slice_offset = slice->offset(); 245 } 246 if (StringShape(subject_ptr).IsThin()) { 247 subject_ptr = ThinString::cast(subject_ptr)->actual(); 248 } 249 // Ensure that an underlying string has the same representation. 250 bool is_one_byte = subject_ptr->IsOneByteRepresentation(); 251 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString()); 252 // String is now either Sequential or External 253 int char_size_shift = is_one_byte ? 0 : 1; 254 255 const byte* input_start = 256 StringCharacterPosition(subject_ptr, start_offset + slice_offset); 257 int byte_length = char_length << char_size_shift; 258 const byte* input_end = input_start + byte_length; 259 Result res = Execute(*regexp_code, 260 *subject, 261 start_offset, 262 input_start, 263 input_end, 264 offsets_vector, 265 offsets_vector_length, 266 isolate); 267 return res; 268} 269 270 271NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute( 272 Code* code, 273 String* input, // This needs to be the unpacked (sliced, cons) string. 274 int start_offset, 275 const byte* input_start, 276 const byte* input_end, 277 int* output, 278 int output_size, 279 Isolate* isolate) { 280 // Ensure that the minimum stack has been allocated. 281 RegExpStackScope stack_scope(isolate); 282 Address stack_base = stack_scope.stack()->stack_base(); 283 284 int direct_call = 0; 285 int result = CALL_GENERATED_REGEXP_CODE( 286 isolate, code->entry(), input, start_offset, input_start, input_end, 287 output, output_size, stack_base, direct_call, isolate); 288 DCHECK(result >= RETRY); 289 290 if (result == EXCEPTION && !isolate->has_pending_exception()) { 291 // We detected a stack overflow (on the backtrack stack) in RegExp code, 292 // but haven't created the exception yet. 293 isolate->StackOverflow(); 294 } 295 return static_cast<Result>(result); 296} 297 298 299const byte NativeRegExpMacroAssembler::word_character_map[] = { 300 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 301 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 302 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 303 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 304 305 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 306 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 307 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7' 308 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9' 309 310 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G' 311 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O' 312 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W' 313 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_' 314 315 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g' 316 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o' 317 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w' 318 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' 319 // Latin-1 range 320 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 321 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 322 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 323 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 324 325 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 326 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 327 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 328 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 329 330 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 334 335 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 336 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 339}; 340 341 342Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, 343 Address* stack_base, 344 Isolate* isolate) { 345 RegExpStack* regexp_stack = isolate->regexp_stack(); 346 size_t size = regexp_stack->stack_capacity(); 347 Address old_stack_base = regexp_stack->stack_base(); 348 DCHECK(old_stack_base == *stack_base); 349 DCHECK(stack_pointer <= old_stack_base); 350 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size); 351 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2); 352 if (new_stack_base == NULL) { 353 return NULL; 354 } 355 *stack_base = new_stack_base; 356 intptr_t stack_content_size = old_stack_base - stack_pointer; 357 return new_stack_base - stack_content_size; 358} 359 360#endif // V8_INTERPRETED_REGEXP 361 362} // namespace internal 363} // namespace v8 364