1// Copyright 2012 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/jsregexp.h" 6 7#include "src/ast/ast.h" 8#include "src/base/platform/platform.h" 9#include "src/compilation-cache.h" 10#include "src/compiler.h" 11#include "src/execution.h" 12#include "src/factory.h" 13#include "src/isolate-inl.h" 14#include "src/messages.h" 15#include "src/ostreams.h" 16#include "src/regexp/interpreter-irregexp.h" 17#include "src/regexp/jsregexp-inl.h" 18#include "src/regexp/regexp-macro-assembler.h" 19#include "src/regexp/regexp-macro-assembler-irregexp.h" 20#include "src/regexp/regexp-macro-assembler-tracer.h" 21#include "src/regexp/regexp-parser.h" 22#include "src/regexp/regexp-stack.h" 23#include "src/runtime/runtime.h" 24#include "src/splay-tree-inl.h" 25#include "src/string-search.h" 26#include "src/unicode-decoder.h" 27 28#ifdef V8_I18N_SUPPORT 29#include "unicode/uset.h" 30#include "unicode/utypes.h" 31#endif // V8_I18N_SUPPORT 32 33#ifndef V8_INTERPRETED_REGEXP 34#if V8_TARGET_ARCH_IA32 35#include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 36#elif V8_TARGET_ARCH_X64 37#include "src/regexp/x64/regexp-macro-assembler-x64.h" 38#elif V8_TARGET_ARCH_ARM64 39#include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 40#elif V8_TARGET_ARCH_ARM 41#include "src/regexp/arm/regexp-macro-assembler-arm.h" 42#elif V8_TARGET_ARCH_PPC 43#include "src/regexp/ppc/regexp-macro-assembler-ppc.h" 44#elif V8_TARGET_ARCH_S390 45#include "src/regexp/s390/regexp-macro-assembler-s390.h" 46#elif V8_TARGET_ARCH_MIPS 47#include "src/regexp/mips/regexp-macro-assembler-mips.h" 48#elif V8_TARGET_ARCH_MIPS64 49#include "src/regexp/mips64/regexp-macro-assembler-mips64.h" 50#elif V8_TARGET_ARCH_X87 51#include "src/regexp/x87/regexp-macro-assembler-x87.h" 52#else 53#error Unsupported target architecture. 54#endif 55#endif 56 57 58namespace v8 { 59namespace internal { 60 61MUST_USE_RESULT 62static inline MaybeHandle<Object> ThrowRegExpException( 63 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) { 64 Isolate* isolate = re->GetIsolate(); 65 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, 66 pattern, error_text), 67 Object); 68} 69 70 71inline void ThrowRegExpException(Handle<JSRegExp> re, 72 Handle<String> error_text) { 73 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text)); 74} 75 76 77ContainedInLattice AddRange(ContainedInLattice containment, 78 const int* ranges, 79 int ranges_length, 80 Interval new_range) { 81 DCHECK((ranges_length & 1) == 1); 82 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1); 83 if (containment == kLatticeUnknown) return containment; 84 bool inside = false; 85 int last = 0; 86 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { 87 // Consider the range from last to ranges[i]. 88 // We haven't got to the new range yet. 89 if (ranges[i] <= new_range.from()) continue; 90 // New range is wholly inside last-ranges[i]. Note that new_range.to() is 91 // inclusive, but the values in ranges are not. 92 if (last <= new_range.from() && new_range.to() < ranges[i]) { 93 return Combine(containment, inside ? kLatticeIn : kLatticeOut); 94 } 95 return kLatticeUnknown; 96 } 97 return containment; 98} 99 100 101// More makes code generation slower, less makes V8 benchmark score lower. 102const int kMaxLookaheadForBoyerMoore = 8; 103// In a 3-character pattern you can maximally step forwards 3 characters 104// at a time, which is not always enough to pay for the extra logic. 105const int kPatternTooShortForBoyerMoore = 2; 106 107 108// Identifies the sort of regexps where the regexp engine is faster 109// than the code used for atom matches. 110static bool HasFewDifferentCharacters(Handle<String> pattern) { 111 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length()); 112 if (length <= kPatternTooShortForBoyerMoore) return false; 113 const int kMod = 128; 114 bool character_found[kMod]; 115 int different = 0; 116 memset(&character_found[0], 0, sizeof(character_found)); 117 for (int i = 0; i < length; i++) { 118 int ch = (pattern->Get(i) & (kMod - 1)); 119 if (!character_found[ch]) { 120 character_found[ch] = true; 121 different++; 122 // We declare a regexp low-alphabet if it has at least 3 times as many 123 // characters as it has different characters. 124 if (different * 3 > length) return false; 125 } 126 } 127 return true; 128} 129 130 131// Generic RegExp methods. Dispatches to implementation specific methods. 132 133 134MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 135 Handle<String> pattern, 136 JSRegExp::Flags flags) { 137 Isolate* isolate = re->GetIsolate(); 138 Zone zone(isolate->allocator()); 139 CompilationCache* compilation_cache = isolate->compilation_cache(); 140 MaybeHandle<FixedArray> maybe_cached = 141 compilation_cache->LookupRegExp(pattern, flags); 142 Handle<FixedArray> cached; 143 bool in_cache = maybe_cached.ToHandle(&cached); 144 LOG(isolate, RegExpCompileEvent(re, in_cache)); 145 146 Handle<Object> result; 147 if (in_cache) { 148 re->set_data(*cached); 149 return re; 150 } 151 pattern = String::Flatten(pattern); 152 PostponeInterruptsScope postpone(isolate); 153 RegExpCompileData parse_result; 154 FlatStringReader reader(isolate, pattern); 155 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags, 156 &parse_result)) { 157 // Throw an exception if we fail to parse the pattern. 158 return ThrowRegExpException(re, pattern, parse_result.error); 159 } 160 161 bool has_been_compiled = false; 162 163 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) && 164 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) { 165 // Parse-tree is a single atom that is equal to the pattern. 166 AtomCompile(re, pattern, flags, pattern); 167 has_been_compiled = true; 168 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) && 169 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) { 170 RegExpAtom* atom = parse_result.tree->AsAtom(); 171 Vector<const uc16> atom_pattern = atom->data(); 172 Handle<String> atom_string; 173 ASSIGN_RETURN_ON_EXCEPTION( 174 isolate, atom_string, 175 isolate->factory()->NewStringFromTwoByte(atom_pattern), 176 Object); 177 if (!HasFewDifferentCharacters(atom_string)) { 178 AtomCompile(re, pattern, flags, atom_string); 179 has_been_compiled = true; 180 } 181 } 182 if (!has_been_compiled) { 183 IrregexpInitialize(re, pattern, flags, parse_result.capture_count); 184 } 185 DCHECK(re->data()->IsFixedArray()); 186 // Compilation succeeded so the data is set on the regexp 187 // and we can store it in the cache. 188 Handle<FixedArray> data(FixedArray::cast(re->data())); 189 compilation_cache->PutRegExp(pattern, flags, data); 190 191 return re; 192} 193 194 195MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 196 Handle<String> subject, 197 int index, 198 Handle<JSArray> last_match_info) { 199 switch (regexp->TypeTag()) { 200 case JSRegExp::ATOM: 201 return AtomExec(regexp, subject, index, last_match_info); 202 case JSRegExp::IRREGEXP: { 203 return IrregexpExec(regexp, subject, index, last_match_info); 204 } 205 default: 206 UNREACHABLE(); 207 return MaybeHandle<Object>(); 208 } 209} 210 211 212// RegExp Atom implementation: Simple string search using indexOf. 213 214 215void RegExpImpl::AtomCompile(Handle<JSRegExp> re, 216 Handle<String> pattern, 217 JSRegExp::Flags flags, 218 Handle<String> match_pattern) { 219 re->GetIsolate()->factory()->SetRegExpAtomData(re, 220 JSRegExp::ATOM, 221 pattern, 222 flags, 223 match_pattern); 224} 225 226 227static void SetAtomLastCapture(FixedArray* array, 228 String* subject, 229 int from, 230 int to) { 231 SealHandleScope shs(array->GetIsolate()); 232 RegExpImpl::SetLastCaptureCount(array, 2); 233 RegExpImpl::SetLastSubject(array, subject); 234 RegExpImpl::SetLastInput(array, subject); 235 RegExpImpl::SetCapture(array, 0, from); 236 RegExpImpl::SetCapture(array, 1, to); 237} 238 239 240int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp, 241 Handle<String> subject, 242 int index, 243 int32_t* output, 244 int output_size) { 245 Isolate* isolate = regexp->GetIsolate(); 246 247 DCHECK(0 <= index); 248 DCHECK(index <= subject->length()); 249 250 subject = String::Flatten(subject); 251 DisallowHeapAllocation no_gc; // ensure vectors stay valid 252 253 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex)); 254 int needle_len = needle->length(); 255 DCHECK(needle->IsFlat()); 256 DCHECK_LT(0, needle_len); 257 258 if (index + needle_len > subject->length()) { 259 return RegExpImpl::RE_FAILURE; 260 } 261 262 for (int i = 0; i < output_size; i += 2) { 263 String::FlatContent needle_content = needle->GetFlatContent(); 264 String::FlatContent subject_content = subject->GetFlatContent(); 265 DCHECK(needle_content.IsFlat()); 266 DCHECK(subject_content.IsFlat()); 267 // dispatch on type of strings 268 index = 269 (needle_content.IsOneByte() 270 ? (subject_content.IsOneByte() 271 ? SearchString(isolate, subject_content.ToOneByteVector(), 272 needle_content.ToOneByteVector(), index) 273 : SearchString(isolate, subject_content.ToUC16Vector(), 274 needle_content.ToOneByteVector(), index)) 275 : (subject_content.IsOneByte() 276 ? SearchString(isolate, subject_content.ToOneByteVector(), 277 needle_content.ToUC16Vector(), index) 278 : SearchString(isolate, subject_content.ToUC16Vector(), 279 needle_content.ToUC16Vector(), index))); 280 if (index == -1) { 281 return i / 2; // Return number of matches. 282 } else { 283 output[i] = index; 284 output[i+1] = index + needle_len; 285 index += needle_len; 286 } 287 } 288 return output_size / 2; 289} 290 291 292Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 293 Handle<String> subject, 294 int index, 295 Handle<JSArray> last_match_info) { 296 Isolate* isolate = re->GetIsolate(); 297 298 static const int kNumRegisters = 2; 299 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize); 300 int32_t* output_registers = isolate->jsregexp_static_offsets_vector(); 301 302 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters); 303 304 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value(); 305 306 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS); 307 SealHandleScope shs(isolate); 308 FixedArray* array = FixedArray::cast(last_match_info->elements()); 309 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); 310 return last_match_info; 311} 312 313 314// Irregexp implementation. 315 316// Ensures that the regexp object contains a compiled version of the 317// source for either one-byte or two-byte subject strings. 318// If the compiled version doesn't already exist, it is compiled 319// from the source pattern. 320// If compilation fails, an exception is thrown and this function 321// returns false. 322bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, 323 Handle<String> sample_subject, 324 bool is_one_byte) { 325 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); 326#ifdef V8_INTERPRETED_REGEXP 327 if (compiled_code->IsByteArray()) return true; 328#else // V8_INTERPRETED_REGEXP (RegExp native code) 329 if (compiled_code->IsCode()) return true; 330#endif 331 // We could potentially have marked this as flushable, but have kept 332 // a saved version if we did not flush it yet. 333 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 334 if (saved_code->IsCode()) { 335 // Reinstate the code in the original place. 336 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); 337 DCHECK(compiled_code->IsSmi()); 338 return true; 339 } 340 return CompileIrregexp(re, sample_subject, is_one_byte); 341} 342 343 344bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, 345 Handle<String> sample_subject, 346 bool is_one_byte) { 347 // Compile the RegExp. 348 Isolate* isolate = re->GetIsolate(); 349 Zone zone(isolate->allocator()); 350 PostponeInterruptsScope postpone(isolate); 351 // If we had a compilation error the last time this is saved at the 352 // saved code index. 353 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); 354 // When arriving here entry can only be a smi, either representing an 355 // uncompiled regexp, a previous compilation error, or code that has 356 // been flushed. 357 DCHECK(entry->IsSmi()); 358 int entry_value = Smi::cast(entry)->value(); 359 DCHECK(entry_value == JSRegExp::kUninitializedValue || 360 entry_value == JSRegExp::kCompilationErrorValue || 361 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); 362 363 if (entry_value == JSRegExp::kCompilationErrorValue) { 364 // A previous compilation failed and threw an error which we store in 365 // the saved code index (we store the error message, not the actual 366 // error). Recreate the error object and throw it. 367 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); 368 DCHECK(error_string->IsString()); 369 Handle<String> error_message(String::cast(error_string)); 370 ThrowRegExpException(re, error_message); 371 return false; 372 } 373 374 JSRegExp::Flags flags = re->GetFlags(); 375 376 Handle<String> pattern(re->Pattern()); 377 pattern = String::Flatten(pattern); 378 RegExpCompileData compile_data; 379 FlatStringReader reader(isolate, pattern); 380 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags, 381 &compile_data)) { 382 // Throw an exception if we fail to parse the pattern. 383 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. 384 USE(ThrowRegExpException(re, pattern, compile_data.error)); 385 return false; 386 } 387 RegExpEngine::CompilationResult result = 388 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern, 389 sample_subject, is_one_byte); 390 if (result.error_message != NULL) { 391 // Unable to compile regexp. 392 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( 393 CStrVector(result.error_message)).ToHandleChecked(); 394 ThrowRegExpException(re, error_message); 395 return false; 396 } 397 398 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 399 data->set(JSRegExp::code_index(is_one_byte), result.code); 400 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map); 401 int register_max = IrregexpMaxRegisterCount(*data); 402 if (result.num_registers > register_max) { 403 SetIrregexpMaxRegisterCount(*data, result.num_registers); 404 } 405 406 return true; 407} 408 409 410int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { 411 return Smi::cast( 412 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 413} 414 415 416void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) { 417 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value)); 418} 419 420void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re, 421 Handle<FixedArray> value) { 422 if (value.is_null()) { 423 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0)); 424 } else { 425 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value); 426 } 427} 428 429int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { 430 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); 431} 432 433 434int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { 435 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); 436} 437 438 439ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { 440 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); 441} 442 443 444Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { 445 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); 446} 447 448 449void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, 450 Handle<String> pattern, 451 JSRegExp::Flags flags, 452 int capture_count) { 453 // Initialize compiled code entries to null. 454 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, 455 JSRegExp::IRREGEXP, 456 pattern, 457 flags, 458 capture_count); 459} 460 461 462int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 463 Handle<String> subject) { 464 subject = String::Flatten(subject); 465 466 // Check representation of the underlying storage. 467 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 468 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; 469 470#ifdef V8_INTERPRETED_REGEXP 471 // Byte-code regexp needs space allocated for all its registers. 472 // The result captures are copied to the start of the registers array 473 // if the match succeeds. This way those registers are not clobbered 474 // when we set the last match info from last successful match. 475 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + 476 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 477#else // V8_INTERPRETED_REGEXP 478 // Native regexp only needs room to output captures. Registers are handled 479 // internally. 480 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 481#endif // V8_INTERPRETED_REGEXP 482} 483 484 485int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, 486 Handle<String> subject, 487 int index, 488 int32_t* output, 489 int output_size) { 490 Isolate* isolate = regexp->GetIsolate(); 491 492 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); 493 494 DCHECK(index >= 0); 495 DCHECK(index <= subject->length()); 496 DCHECK(subject->IsFlat()); 497 498 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); 499 500#ifndef V8_INTERPRETED_REGEXP 501 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 502 do { 503 EnsureCompiledIrregexp(regexp, subject, is_one_byte); 504 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); 505 // The stack is used to allocate registers for the compiled regexp code. 506 // This means that in case of failure, the output registers array is left 507 // untouched and contains the capture results from the previous successful 508 // match. We can use that to set the last match info lazily. 509 NativeRegExpMacroAssembler::Result res = 510 NativeRegExpMacroAssembler::Match(code, 511 subject, 512 output, 513 output_size, 514 index, 515 isolate); 516 if (res != NativeRegExpMacroAssembler::RETRY) { 517 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || 518 isolate->has_pending_exception()); 519 STATIC_ASSERT( 520 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); 521 STATIC_ASSERT( 522 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); 523 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) 524 == RE_EXCEPTION); 525 return static_cast<IrregexpResult>(res); 526 } 527 // If result is RETRY, the string has changed representation, and we 528 // must restart from scratch. 529 // In this case, it means we must make sure we are prepared to handle 530 // the, potentially, different subject (the string can switch between 531 // being internal and external, and even between being Latin1 and UC16, 532 // but the characters are always the same). 533 IrregexpPrepare(regexp, subject); 534 is_one_byte = subject->IsOneByteRepresentationUnderneath(); 535 } while (true); 536 UNREACHABLE(); 537 return RE_EXCEPTION; 538#else // V8_INTERPRETED_REGEXP 539 540 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); 541 // We must have done EnsureCompiledIrregexp, so we can get the number of 542 // registers. 543 int number_of_capture_registers = 544 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; 545 int32_t* raw_output = &output[number_of_capture_registers]; 546 // We do not touch the actual capture result registers until we know there 547 // has been a match so that we can use those capture results to set the 548 // last match info. 549 for (int i = number_of_capture_registers - 1; i >= 0; i--) { 550 raw_output[i] = -1; 551 } 552 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), 553 isolate); 554 555 IrregexpResult result = IrregexpInterpreter::Match(isolate, 556 byte_codes, 557 subject, 558 raw_output, 559 index); 560 if (result == RE_SUCCESS) { 561 // Copy capture results to the start of the registers array. 562 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); 563 } 564 if (result == RE_EXCEPTION) { 565 DCHECK(!isolate->has_pending_exception()); 566 isolate->StackOverflow(); 567 } 568 return result; 569#endif // V8_INTERPRETED_REGEXP 570} 571 572 573MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, 574 Handle<String> subject, 575 int previous_index, 576 Handle<JSArray> last_match_info) { 577 Isolate* isolate = regexp->GetIsolate(); 578 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); 579 580 // Prepare space for the return values. 581#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG) 582 if (FLAG_trace_regexp_bytecodes) { 583 String* pattern = regexp->Pattern(); 584 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get()); 585 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get()); 586 } 587#endif 588 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject); 589 if (required_registers < 0) { 590 // Compiling failed with an exception. 591 DCHECK(isolate->has_pending_exception()); 592 return MaybeHandle<Object>(); 593 } 594 595 int32_t* output_registers = NULL; 596 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) { 597 output_registers = NewArray<int32_t>(required_registers); 598 } 599 base::SmartArrayPointer<int32_t> auto_release(output_registers); 600 if (output_registers == NULL) { 601 output_registers = isolate->jsregexp_static_offsets_vector(); 602 } 603 604 int res = RegExpImpl::IrregexpExecRaw( 605 regexp, subject, previous_index, output_registers, required_registers); 606 if (res == RE_SUCCESS) { 607 int capture_count = 608 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())); 609 return SetLastMatchInfo( 610 last_match_info, subject, capture_count, output_registers); 611 } 612 if (res == RE_EXCEPTION) { 613 DCHECK(isolate->has_pending_exception()); 614 return MaybeHandle<Object>(); 615 } 616 DCHECK(res == RE_FAILURE); 617 return isolate->factory()->null_value(); 618} 619 620 621static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) { 622 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) { 623 JSArray::SetLength(array, minimum_size); 624 } 625} 626 627 628Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info, 629 Handle<String> subject, 630 int capture_count, 631 int32_t* match) { 632 DCHECK(last_match_info->HasFastObjectElements()); 633 int capture_register_count = (capture_count + 1) * 2; 634 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead); 635 DisallowHeapAllocation no_allocation; 636 FixedArray* array = FixedArray::cast(last_match_info->elements()); 637 if (match != NULL) { 638 for (int i = 0; i < capture_register_count; i += 2) { 639 SetCapture(array, i, match[i]); 640 SetCapture(array, i + 1, match[i + 1]); 641 } 642 } 643 SetLastCaptureCount(array, capture_register_count); 644 SetLastSubject(array, *subject); 645 SetLastInput(array, *subject); 646 return last_match_info; 647} 648 649 650RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp, 651 Handle<String> subject, 652 Isolate* isolate) 653 : register_array_(NULL), 654 register_array_size_(0), 655 regexp_(regexp), 656 subject_(subject) { 657#ifdef V8_INTERPRETED_REGEXP 658 bool interpreted = true; 659#else 660 bool interpreted = false; 661#endif // V8_INTERPRETED_REGEXP 662 663 if (regexp_->TypeTag() == JSRegExp::ATOM) { 664 static const int kAtomRegistersPerMatch = 2; 665 registers_per_match_ = kAtomRegistersPerMatch; 666 // There is no distinction between interpreted and native for atom regexps. 667 interpreted = false; 668 } else { 669 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_); 670 if (registers_per_match_ < 0) { 671 num_matches_ = -1; // Signal exception. 672 return; 673 } 674 } 675 676 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal); 677 if (!interpreted) { 678 register_array_size_ = 679 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize); 680 max_matches_ = register_array_size_ / registers_per_match_; 681 } else { 682 // Global loop in interpreted regexp is not implemented. We choose 683 // the size of the offsets vector so that it can only store one match. 684 register_array_size_ = registers_per_match_; 685 max_matches_ = 1; 686 } 687 688 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { 689 register_array_ = NewArray<int32_t>(register_array_size_); 690 } else { 691 register_array_ = isolate->jsregexp_static_offsets_vector(); 692 } 693 694 // Set state so that fetching the results the first time triggers a call 695 // to the compiled regexp. 696 current_match_index_ = max_matches_ - 1; 697 num_matches_ = max_matches_; 698 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture. 699 DCHECK_GE(register_array_size_, registers_per_match_); 700 int32_t* last_match = 701 ®ister_array_[current_match_index_ * registers_per_match_]; 702 last_match[0] = -1; 703 last_match[1] = 0; 704} 705 706int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) { 707 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 && 708 last_index + 1 < subject_->length() && 709 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) && 710 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) { 711 // Advance over the surrogate pair. 712 return last_index + 2; 713 } 714 return last_index + 1; 715} 716 717// ------------------------------------------------------------------- 718// Implementation of the Irregexp regular expression engine. 719// 720// The Irregexp regular expression engine is intended to be a complete 721// implementation of ECMAScript regular expressions. It generates either 722// bytecodes or native code. 723 724// The Irregexp regexp engine is structured in three steps. 725// 1) The parser generates an abstract syntax tree. See ast.cc. 726// 2) From the AST a node network is created. The nodes are all 727// subclasses of RegExpNode. The nodes represent states when 728// executing a regular expression. Several optimizations are 729// performed on the node network. 730// 3) From the nodes we generate either byte codes or native code 731// that can actually execute the regular expression (perform 732// the search). The code generation step is described in more 733// detail below. 734 735// Code generation. 736// 737// The nodes are divided into four main categories. 738// * Choice nodes 739// These represent places where the regular expression can 740// match in more than one way. For example on entry to an 741// alternation (foo|bar) or a repetition (*, +, ? or {}). 742// * Action nodes 743// These represent places where some action should be 744// performed. Examples include recording the current position 745// in the input string to a register (in order to implement 746// captures) or other actions on register for example in order 747// to implement the counters needed for {} repetitions. 748// * Matching nodes 749// These attempt to match some element part of the input string. 750// Examples of elements include character classes, plain strings 751// or back references. 752// * End nodes 753// These are used to implement the actions required on finding 754// a successful match or failing to find a match. 755// 756// The code generated (whether as byte codes or native code) maintains 757// some state as it runs. This consists of the following elements: 758// 759// * The capture registers. Used for string captures. 760// * Other registers. Used for counters etc. 761// * The current position. 762// * The stack of backtracking information. Used when a matching node 763// fails to find a match and needs to try an alternative. 764// 765// Conceptual regular expression execution model: 766// 767// There is a simple conceptual model of regular expression execution 768// which will be presented first. The actual code generated is a more 769// efficient simulation of the simple conceptual model: 770// 771// * Choice nodes are implemented as follows: 772// For each choice except the last { 773// push current position 774// push backtrack code location 775// <generate code to test for choice> 776// backtrack code location: 777// pop current position 778// } 779// <generate code to test for last choice> 780// 781// * Actions nodes are generated as follows 782// <push affected registers on backtrack stack> 783// <generate code to perform action> 784// push backtrack code location 785// <generate code to test for following nodes> 786// backtrack code location: 787// <pop affected registers to restore their state> 788// <pop backtrack location from stack and go to it> 789// 790// * Matching nodes are generated as follows: 791// if input string matches at current position 792// update current position 793// <generate code to test for following nodes> 794// else 795// <pop backtrack location from stack and go to it> 796// 797// Thus it can be seen that the current position is saved and restored 798// by the choice nodes, whereas the registers are saved and restored by 799// by the action nodes that manipulate them. 800// 801// The other interesting aspect of this model is that nodes are generated 802// at the point where they are needed by a recursive call to Emit(). If 803// the node has already been code generated then the Emit() call will 804// generate a jump to the previously generated code instead. In order to 805// limit recursion it is possible for the Emit() function to put the node 806// on a work list for later generation and instead generate a jump. The 807// destination of the jump is resolved later when the code is generated. 808// 809// Actual regular expression code generation. 810// 811// Code generation is actually more complicated than the above. In order 812// to improve the efficiency of the generated code some optimizations are 813// performed 814// 815// * Choice nodes have 1-character lookahead. 816// A choice node looks at the following character and eliminates some of 817// the choices immediately based on that character. This is not yet 818// implemented. 819// * Simple greedy loops store reduced backtracking information. 820// A quantifier like /.*foo/m will greedily match the whole input. It will 821// then need to backtrack to a point where it can match "foo". The naive 822// implementation of this would push each character position onto the 823// backtracking stack, then pop them off one by one. This would use space 824// proportional to the length of the input string. However since the "." 825// can only match in one way and always has a constant length (in this case 826// of 1) it suffices to store the current position on the top of the stack 827// once. Matching now becomes merely incrementing the current position and 828// backtracking becomes decrementing the current position and checking the 829// result against the stored current position. This is faster and saves 830// space. 831// * The current state is virtualized. 832// This is used to defer expensive operations until it is clear that they 833// are needed and to generate code for a node more than once, allowing 834// specialized an efficient versions of the code to be created. This is 835// explained in the section below. 836// 837// Execution state virtualization. 838// 839// Instead of emitting code, nodes that manipulate the state can record their 840// manipulation in an object called the Trace. The Trace object can record a 841// current position offset, an optional backtrack code location on the top of 842// the virtualized backtrack stack and some register changes. When a node is 843// to be emitted it can flush the Trace or update it. Flushing the Trace 844// will emit code to bring the actual state into line with the virtual state. 845// Avoiding flushing the state can postpone some work (e.g. updates of capture 846// registers). Postponing work can save time when executing the regular 847// expression since it may be found that the work never has to be done as a 848// failure to match can occur. In addition it is much faster to jump to a 849// known backtrack code location than it is to pop an unknown backtrack 850// location from the stack and jump there. 851// 852// The virtual state found in the Trace affects code generation. For example 853// the virtual state contains the difference between the actual current 854// position and the virtual current position, and matching code needs to use 855// this offset to attempt a match in the correct location of the input 856// string. Therefore code generated for a non-trivial trace is specialized 857// to that trace. The code generator therefore has the ability to generate 858// code for each node several times. In order to limit the size of the 859// generated code there is an arbitrary limit on how many specialized sets of 860// code may be generated for a given node. If the limit is reached, the 861// trace is flushed and a generic version of the code for a node is emitted. 862// This is subsequently used for that node. The code emitted for non-generic 863// trace is not recorded in the node and so it cannot currently be reused in 864// the event that code generation is requested for an identical trace. 865 866 867void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { 868 UNREACHABLE(); 869} 870 871 872void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { 873 text->AddElement(TextElement::Atom(this), zone); 874} 875 876 877void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) { 878 text->AddElement(TextElement::CharClass(this), zone); 879} 880 881 882void RegExpText::AppendToText(RegExpText* text, Zone* zone) { 883 for (int i = 0; i < elements()->length(); i++) 884 text->AddElement(elements()->at(i), zone); 885} 886 887 888TextElement TextElement::Atom(RegExpAtom* atom) { 889 return TextElement(ATOM, atom); 890} 891 892 893TextElement TextElement::CharClass(RegExpCharacterClass* char_class) { 894 return TextElement(CHAR_CLASS, char_class); 895} 896 897 898int TextElement::length() const { 899 switch (text_type()) { 900 case ATOM: 901 return atom()->length(); 902 903 case CHAR_CLASS: 904 return 1; 905 } 906 UNREACHABLE(); 907 return 0; 908} 909 910 911DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 912 if (table_ == NULL) { 913 table_ = new(zone()) DispatchTable(zone()); 914 DispatchTableConstructor cons(table_, ignore_case, zone()); 915 cons.BuildTable(this); 916 } 917 return table_; 918} 919 920 921class FrequencyCollator { 922 public: 923 FrequencyCollator() : total_samples_(0) { 924 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { 925 frequencies_[i] = CharacterFrequency(i); 926 } 927 } 928 929 void CountCharacter(int character) { 930 int index = (character & RegExpMacroAssembler::kTableMask); 931 frequencies_[index].Increment(); 932 total_samples_++; 933 } 934 935 // Does not measure in percent, but rather per-128 (the table size from the 936 // regexp macro assembler). 937 int Frequency(int in_character) { 938 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); 939 if (total_samples_ < 1) return 1; // Division by zero. 940 int freq_in_per128 = 941 (frequencies_[in_character].counter() * 128) / total_samples_; 942 return freq_in_per128; 943 } 944 945 private: 946 class CharacterFrequency { 947 public: 948 CharacterFrequency() : counter_(0), character_(-1) { } 949 explicit CharacterFrequency(int character) 950 : counter_(0), character_(character) { } 951 952 void Increment() { counter_++; } 953 int counter() { return counter_; } 954 int character() { return character_; } 955 956 private: 957 int counter_; 958 int character_; 959 }; 960 961 962 private: 963 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; 964 int total_samples_; 965}; 966 967 968class RegExpCompiler { 969 public: 970 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 971 JSRegExp::Flags flags, bool is_one_byte); 972 973 int AllocateRegister() { 974 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 975 reg_exp_too_big_ = true; 976 return next_register_; 977 } 978 return next_register_++; 979 } 980 981 // Lookarounds to match lone surrogates for unicode character class matches 982 // are never nested. We can therefore reuse registers. 983 int UnicodeLookaroundStackRegister() { 984 if (unicode_lookaround_stack_register_ == kNoRegister) { 985 unicode_lookaround_stack_register_ = AllocateRegister(); 986 } 987 return unicode_lookaround_stack_register_; 988 } 989 990 int UnicodeLookaroundPositionRegister() { 991 if (unicode_lookaround_position_register_ == kNoRegister) { 992 unicode_lookaround_position_register_ = AllocateRegister(); 993 } 994 return unicode_lookaround_position_register_; 995 } 996 997 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler, 998 RegExpNode* start, 999 int capture_count, 1000 Handle<String> pattern); 1001 1002 inline void AddWork(RegExpNode* node) { 1003 if (!node->on_work_list() && !node->label()->is_bound()) { 1004 node->set_on_work_list(true); 1005 work_list_->Add(node); 1006 } 1007 } 1008 1009 static const int kImplementationOffset = 0; 1010 static const int kNumberOfRegistersOffset = 0; 1011 static const int kCodeOffset = 1; 1012 1013 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } 1014 EndNode* accept() { return accept_; } 1015 1016 static const int kMaxRecursion = 100; 1017 inline int recursion_depth() { return recursion_depth_; } 1018 inline void IncrementRecursionDepth() { recursion_depth_++; } 1019 inline void DecrementRecursionDepth() { recursion_depth_--; } 1020 1021 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 1022 1023 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } 1024 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } 1025 inline bool one_byte() { return one_byte_; } 1026 inline bool optimize() { return optimize_; } 1027 inline void set_optimize(bool value) { optimize_ = value; } 1028 inline bool limiting_recursion() { return limiting_recursion_; } 1029 inline void set_limiting_recursion(bool value) { 1030 limiting_recursion_ = value; 1031 } 1032 bool read_backward() { return read_backward_; } 1033 void set_read_backward(bool value) { read_backward_ = value; } 1034 FrequencyCollator* frequency_collator() { return &frequency_collator_; } 1035 1036 int current_expansion_factor() { return current_expansion_factor_; } 1037 void set_current_expansion_factor(int value) { 1038 current_expansion_factor_ = value; 1039 } 1040 1041 Isolate* isolate() const { return isolate_; } 1042 Zone* zone() const { return zone_; } 1043 1044 static const int kNoRegister = -1; 1045 1046 private: 1047 EndNode* accept_; 1048 int next_register_; 1049 int unicode_lookaround_stack_register_; 1050 int unicode_lookaround_position_register_; 1051 List<RegExpNode*>* work_list_; 1052 int recursion_depth_; 1053 RegExpMacroAssembler* macro_assembler_; 1054 JSRegExp::Flags flags_; 1055 bool one_byte_; 1056 bool reg_exp_too_big_; 1057 bool limiting_recursion_; 1058 bool optimize_; 1059 bool read_backward_; 1060 int current_expansion_factor_; 1061 FrequencyCollator frequency_collator_; 1062 Isolate* isolate_; 1063 Zone* zone_; 1064}; 1065 1066 1067class RecursionCheck { 1068 public: 1069 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 1070 compiler->IncrementRecursionDepth(); 1071 } 1072 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 1073 private: 1074 RegExpCompiler* compiler_; 1075}; 1076 1077 1078static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { 1079 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); 1080} 1081 1082 1083// Attempts to compile the regexp using an Irregexp code generator. Returns 1084// a fixed array or a null handle depending on whether it succeeded. 1085RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, 1086 JSRegExp::Flags flags, bool one_byte) 1087 : next_register_(2 * (capture_count + 1)), 1088 unicode_lookaround_stack_register_(kNoRegister), 1089 unicode_lookaround_position_register_(kNoRegister), 1090 work_list_(NULL), 1091 recursion_depth_(0), 1092 flags_(flags), 1093 one_byte_(one_byte), 1094 reg_exp_too_big_(false), 1095 limiting_recursion_(false), 1096 optimize_(FLAG_regexp_optimization), 1097 read_backward_(false), 1098 current_expansion_factor_(1), 1099 frequency_collator_(), 1100 isolate_(isolate), 1101 zone_(zone) { 1102 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); 1103 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 1104} 1105 1106 1107RegExpEngine::CompilationResult RegExpCompiler::Assemble( 1108 RegExpMacroAssembler* macro_assembler, 1109 RegExpNode* start, 1110 int capture_count, 1111 Handle<String> pattern) { 1112 Heap* heap = pattern->GetHeap(); 1113 1114#ifdef DEBUG 1115 if (FLAG_trace_regexp_assembler) 1116 macro_assembler_ = 1117 new RegExpMacroAssemblerTracer(isolate(), macro_assembler); 1118 else 1119#endif 1120 macro_assembler_ = macro_assembler; 1121 1122 List <RegExpNode*> work_list(0); 1123 work_list_ = &work_list; 1124 Label fail; 1125 macro_assembler_->PushBacktrack(&fail); 1126 Trace new_trace; 1127 start->Emit(this, &new_trace); 1128 macro_assembler_->Bind(&fail); 1129 macro_assembler_->Fail(); 1130 while (!work_list.is_empty()) { 1131 RegExpNode* node = work_list.RemoveLast(); 1132 node->set_on_work_list(false); 1133 if (!node->label()->is_bound()) node->Emit(this, &new_trace); 1134 } 1135 if (reg_exp_too_big_) { 1136 macro_assembler_->AbortedCodeGeneration(); 1137 return IrregexpRegExpTooBig(isolate_); 1138 } 1139 1140 Handle<HeapObject> code = macro_assembler_->GetCode(pattern); 1141 heap->IncreaseTotalRegexpCodeGenerated(code->Size()); 1142 work_list_ = NULL; 1143#ifdef ENABLE_DISASSEMBLER 1144 if (FLAG_print_code) { 1145 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer()); 1146 OFStream os(trace_scope.file()); 1147 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os); 1148 } 1149#endif 1150#ifdef DEBUG 1151 if (FLAG_trace_regexp_assembler) { 1152 delete macro_assembler_; 1153 } 1154#endif 1155 return RegExpEngine::CompilationResult(*code, next_register_); 1156} 1157 1158 1159bool Trace::DeferredAction::Mentions(int that) { 1160 if (action_type() == ActionNode::CLEAR_CAPTURES) { 1161 Interval range = static_cast<DeferredClearCaptures*>(this)->range(); 1162 return range.Contains(that); 1163 } else { 1164 return reg() == that; 1165 } 1166} 1167 1168 1169bool Trace::mentions_reg(int reg) { 1170 for (DeferredAction* action = actions_; 1171 action != NULL; 1172 action = action->next()) { 1173 if (action->Mentions(reg)) 1174 return true; 1175 } 1176 return false; 1177} 1178 1179 1180bool Trace::GetStoredPosition(int reg, int* cp_offset) { 1181 DCHECK_EQ(0, *cp_offset); 1182 for (DeferredAction* action = actions_; 1183 action != NULL; 1184 action = action->next()) { 1185 if (action->Mentions(reg)) { 1186 if (action->action_type() == ActionNode::STORE_POSITION) { 1187 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset(); 1188 return true; 1189 } else { 1190 return false; 1191 } 1192 } 1193 } 1194 return false; 1195} 1196 1197 1198int Trace::FindAffectedRegisters(OutSet* affected_registers, 1199 Zone* zone) { 1200 int max_register = RegExpCompiler::kNoRegister; 1201 for (DeferredAction* action = actions_; 1202 action != NULL; 1203 action = action->next()) { 1204 if (action->action_type() == ActionNode::CLEAR_CAPTURES) { 1205 Interval range = static_cast<DeferredClearCaptures*>(action)->range(); 1206 for (int i = range.from(); i <= range.to(); i++) 1207 affected_registers->Set(i, zone); 1208 if (range.to() > max_register) max_register = range.to(); 1209 } else { 1210 affected_registers->Set(action->reg(), zone); 1211 if (action->reg() > max_register) max_register = action->reg(); 1212 } 1213 } 1214 return max_register; 1215} 1216 1217 1218void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler, 1219 int max_register, 1220 const OutSet& registers_to_pop, 1221 const OutSet& registers_to_clear) { 1222 for (int reg = max_register; reg >= 0; reg--) { 1223 if (registers_to_pop.Get(reg)) { 1224 assembler->PopRegister(reg); 1225 } else if (registers_to_clear.Get(reg)) { 1226 int clear_to = reg; 1227 while (reg > 0 && registers_to_clear.Get(reg - 1)) { 1228 reg--; 1229 } 1230 assembler->ClearRegisters(reg, clear_to); 1231 } 1232 } 1233} 1234 1235 1236void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, 1237 int max_register, 1238 const OutSet& affected_registers, 1239 OutSet* registers_to_pop, 1240 OutSet* registers_to_clear, 1241 Zone* zone) { 1242 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1. 1243 const int push_limit = (assembler->stack_limit_slack() + 1) / 2; 1244 1245 // Count pushes performed to force a stack limit check occasionally. 1246 int pushes = 0; 1247 1248 for (int reg = 0; reg <= max_register; reg++) { 1249 if (!affected_registers.Get(reg)) { 1250 continue; 1251 } 1252 1253 // The chronologically first deferred action in the trace 1254 // is used to infer the action needed to restore a register 1255 // to its previous state (or not, if it's safe to ignore it). 1256 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR }; 1257 DeferredActionUndoType undo_action = IGNORE; 1258 1259 int value = 0; 1260 bool absolute = false; 1261 bool clear = false; 1262 static const int kNoStore = kMinInt; 1263 int store_position = kNoStore; 1264 // This is a little tricky because we are scanning the actions in reverse 1265 // historical order (newest first). 1266 for (DeferredAction* action = actions_; 1267 action != NULL; 1268 action = action->next()) { 1269 if (action->Mentions(reg)) { 1270 switch (action->action_type()) { 1271 case ActionNode::SET_REGISTER: { 1272 Trace::DeferredSetRegister* psr = 1273 static_cast<Trace::DeferredSetRegister*>(action); 1274 if (!absolute) { 1275 value += psr->value(); 1276 absolute = true; 1277 } 1278 // SET_REGISTER is currently only used for newly introduced loop 1279 // counters. They can have a significant previous value if they 1280 // occour in a loop. TODO(lrn): Propagate this information, so 1281 // we can set undo_action to IGNORE if we know there is no value to 1282 // restore. 1283 undo_action = RESTORE; 1284 DCHECK_EQ(store_position, kNoStore); 1285 DCHECK(!clear); 1286 break; 1287 } 1288 case ActionNode::INCREMENT_REGISTER: 1289 if (!absolute) { 1290 value++; 1291 } 1292 DCHECK_EQ(store_position, kNoStore); 1293 DCHECK(!clear); 1294 undo_action = RESTORE; 1295 break; 1296 case ActionNode::STORE_POSITION: { 1297 Trace::DeferredCapture* pc = 1298 static_cast<Trace::DeferredCapture*>(action); 1299 if (!clear && store_position == kNoStore) { 1300 store_position = pc->cp_offset(); 1301 } 1302 1303 // For captures we know that stores and clears alternate. 1304 // Other register, are never cleared, and if the occur 1305 // inside a loop, they might be assigned more than once. 1306 if (reg <= 1) { 1307 // Registers zero and one, aka "capture zero", is 1308 // always set correctly if we succeed. There is no 1309 // need to undo a setting on backtrack, because we 1310 // will set it again or fail. 1311 undo_action = IGNORE; 1312 } else { 1313 undo_action = pc->is_capture() ? CLEAR : RESTORE; 1314 } 1315 DCHECK(!absolute); 1316 DCHECK_EQ(value, 0); 1317 break; 1318 } 1319 case ActionNode::CLEAR_CAPTURES: { 1320 // Since we're scanning in reverse order, if we've already 1321 // set the position we have to ignore historically earlier 1322 // clearing operations. 1323 if (store_position == kNoStore) { 1324 clear = true; 1325 } 1326 undo_action = RESTORE; 1327 DCHECK(!absolute); 1328 DCHECK_EQ(value, 0); 1329 break; 1330 } 1331 default: 1332 UNREACHABLE(); 1333 break; 1334 } 1335 } 1336 } 1337 // Prepare for the undo-action (e.g., push if it's going to be popped). 1338 if (undo_action == RESTORE) { 1339 pushes++; 1340 RegExpMacroAssembler::StackCheckFlag stack_check = 1341 RegExpMacroAssembler::kNoStackLimitCheck; 1342 if (pushes == push_limit) { 1343 stack_check = RegExpMacroAssembler::kCheckStackLimit; 1344 pushes = 0; 1345 } 1346 1347 assembler->PushRegister(reg, stack_check); 1348 registers_to_pop->Set(reg, zone); 1349 } else if (undo_action == CLEAR) { 1350 registers_to_clear->Set(reg, zone); 1351 } 1352 // Perform the chronologically last action (or accumulated increment) 1353 // for the register. 1354 if (store_position != kNoStore) { 1355 assembler->WriteCurrentPositionToRegister(reg, store_position); 1356 } else if (clear) { 1357 assembler->ClearRegisters(reg, reg); 1358 } else if (absolute) { 1359 assembler->SetRegister(reg, value); 1360 } else if (value != 0) { 1361 assembler->AdvanceRegister(reg, value); 1362 } 1363 } 1364} 1365 1366 1367// This is called as we come into a loop choice node and some other tricky 1368// nodes. It normalizes the state of the code generator to ensure we can 1369// generate generic code. 1370void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { 1371 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1372 1373 DCHECK(!is_trivial()); 1374 1375 if (actions_ == NULL && backtrack() == NULL) { 1376 // Here we just have some deferred cp advances to fix and we are back to 1377 // a normal situation. We may also have to forget some information gained 1378 // through a quick check that was already performed. 1379 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_); 1380 // Create a new trivial state and generate the node with that. 1381 Trace new_state; 1382 successor->Emit(compiler, &new_state); 1383 return; 1384 } 1385 1386 // Generate deferred actions here along with code to undo them again. 1387 OutSet affected_registers; 1388 1389 if (backtrack() != NULL) { 1390 // Here we have a concrete backtrack location. These are set up by choice 1391 // nodes and so they indicate that we have a deferred save of the current 1392 // position which we may need to emit here. 1393 assembler->PushCurrentPosition(); 1394 } 1395 1396 int max_register = FindAffectedRegisters(&affected_registers, 1397 compiler->zone()); 1398 OutSet registers_to_pop; 1399 OutSet registers_to_clear; 1400 PerformDeferredActions(assembler, 1401 max_register, 1402 affected_registers, 1403 ®isters_to_pop, 1404 ®isters_to_clear, 1405 compiler->zone()); 1406 if (cp_offset_ != 0) { 1407 assembler->AdvanceCurrentPosition(cp_offset_); 1408 } 1409 1410 // Create a new trivial state and generate the node with that. 1411 Label undo; 1412 assembler->PushBacktrack(&undo); 1413 if (successor->KeepRecursing(compiler)) { 1414 Trace new_state; 1415 successor->Emit(compiler, &new_state); 1416 } else { 1417 compiler->AddWork(successor); 1418 assembler->GoTo(successor->label()); 1419 } 1420 1421 // On backtrack we need to restore state. 1422 assembler->Bind(&undo); 1423 RestoreAffectedRegisters(assembler, 1424 max_register, 1425 registers_to_pop, 1426 registers_to_clear); 1427 if (backtrack() == NULL) { 1428 assembler->Backtrack(); 1429 } else { 1430 assembler->PopCurrentPosition(); 1431 assembler->GoTo(backtrack()); 1432 } 1433} 1434 1435 1436void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) { 1437 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1438 1439 // Omit flushing the trace. We discard the entire stack frame anyway. 1440 1441 if (!label()->is_bound()) { 1442 // We are completely independent of the trace, since we ignore it, 1443 // so this code can be used as the generic version. 1444 assembler->Bind(label()); 1445 } 1446 1447 // Throw away everything on the backtrack stack since the start 1448 // of the negative submatch and restore the character position. 1449 assembler->ReadCurrentPositionFromRegister(current_position_register_); 1450 assembler->ReadStackPointerFromRegister(stack_pointer_register_); 1451 if (clear_capture_count_ > 0) { 1452 // Clear any captures that might have been performed during the success 1453 // of the body of the negative look-ahead. 1454 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1; 1455 assembler->ClearRegisters(clear_capture_start_, clear_capture_end); 1456 } 1457 // Now that we have unwound the stack we find at the top of the stack the 1458 // backtrack that the BeginSubmatch node got. 1459 assembler->Backtrack(); 1460} 1461 1462 1463void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) { 1464 if (!trace->is_trivial()) { 1465 trace->Flush(compiler, this); 1466 return; 1467 } 1468 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1469 if (!label()->is_bound()) { 1470 assembler->Bind(label()); 1471 } 1472 switch (action_) { 1473 case ACCEPT: 1474 assembler->Succeed(); 1475 return; 1476 case BACKTRACK: 1477 assembler->GoTo(trace->backtrack()); 1478 return; 1479 case NEGATIVE_SUBMATCH_SUCCESS: 1480 // This case is handled in a different virtual method. 1481 UNREACHABLE(); 1482 } 1483 UNIMPLEMENTED(); 1484} 1485 1486 1487void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) { 1488 if (guards_ == NULL) 1489 guards_ = new(zone) ZoneList<Guard*>(1, zone); 1490 guards_->Add(guard, zone); 1491} 1492 1493 1494ActionNode* ActionNode::SetRegister(int reg, 1495 int val, 1496 RegExpNode* on_success) { 1497 ActionNode* result = 1498 new(on_success->zone()) ActionNode(SET_REGISTER, on_success); 1499 result->data_.u_store_register.reg = reg; 1500 result->data_.u_store_register.value = val; 1501 return result; 1502} 1503 1504 1505ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { 1506 ActionNode* result = 1507 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success); 1508 result->data_.u_increment_register.reg = reg; 1509 return result; 1510} 1511 1512 1513ActionNode* ActionNode::StorePosition(int reg, 1514 bool is_capture, 1515 RegExpNode* on_success) { 1516 ActionNode* result = 1517 new(on_success->zone()) ActionNode(STORE_POSITION, on_success); 1518 result->data_.u_position_register.reg = reg; 1519 result->data_.u_position_register.is_capture = is_capture; 1520 return result; 1521} 1522 1523 1524ActionNode* ActionNode::ClearCaptures(Interval range, 1525 RegExpNode* on_success) { 1526 ActionNode* result = 1527 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success); 1528 result->data_.u_clear_captures.range_from = range.from(); 1529 result->data_.u_clear_captures.range_to = range.to(); 1530 return result; 1531} 1532 1533 1534ActionNode* ActionNode::BeginSubmatch(int stack_reg, 1535 int position_reg, 1536 RegExpNode* on_success) { 1537 ActionNode* result = 1538 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success); 1539 result->data_.u_submatch.stack_pointer_register = stack_reg; 1540 result->data_.u_submatch.current_position_register = position_reg; 1541 return result; 1542} 1543 1544 1545ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, 1546 int position_reg, 1547 int clear_register_count, 1548 int clear_register_from, 1549 RegExpNode* on_success) { 1550 ActionNode* result = 1551 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success); 1552 result->data_.u_submatch.stack_pointer_register = stack_reg; 1553 result->data_.u_submatch.current_position_register = position_reg; 1554 result->data_.u_submatch.clear_register_count = clear_register_count; 1555 result->data_.u_submatch.clear_register_from = clear_register_from; 1556 return result; 1557} 1558 1559 1560ActionNode* ActionNode::EmptyMatchCheck(int start_register, 1561 int repetition_register, 1562 int repetition_limit, 1563 RegExpNode* on_success) { 1564 ActionNode* result = 1565 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success); 1566 result->data_.u_empty_match_check.start_register = start_register; 1567 result->data_.u_empty_match_check.repetition_register = repetition_register; 1568 result->data_.u_empty_match_check.repetition_limit = repetition_limit; 1569 return result; 1570} 1571 1572 1573#define DEFINE_ACCEPT(Type) \ 1574 void Type##Node::Accept(NodeVisitor* visitor) { \ 1575 visitor->Visit##Type(this); \ 1576 } 1577FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) 1578#undef DEFINE_ACCEPT 1579 1580 1581void LoopChoiceNode::Accept(NodeVisitor* visitor) { 1582 visitor->VisitLoopChoice(this); 1583} 1584 1585 1586// ------------------------------------------------------------------- 1587// Emit code. 1588 1589 1590void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, 1591 Guard* guard, 1592 Trace* trace) { 1593 switch (guard->op()) { 1594 case Guard::LT: 1595 DCHECK(!trace->mentions_reg(guard->reg())); 1596 macro_assembler->IfRegisterGE(guard->reg(), 1597 guard->value(), 1598 trace->backtrack()); 1599 break; 1600 case Guard::GEQ: 1601 DCHECK(!trace->mentions_reg(guard->reg())); 1602 macro_assembler->IfRegisterLT(guard->reg(), 1603 guard->value(), 1604 trace->backtrack()); 1605 break; 1606 } 1607} 1608 1609 1610// Returns the number of characters in the equivalence class, omitting those 1611// that cannot occur in the source string because it is Latin1. 1612static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1613 bool one_byte_subject, 1614 unibrow::uchar* letters) { 1615 int length = 1616 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1617 // Unibrow returns 0 or 1 for characters where case independence is 1618 // trivial. 1619 if (length == 0) { 1620 letters[0] = character; 1621 length = 1; 1622 } 1623 1624 if (one_byte_subject) { 1625 int new_length = 0; 1626 for (int i = 0; i < length; i++) { 1627 if (letters[i] <= String::kMaxOneByteCharCode) { 1628 letters[new_length++] = letters[i]; 1629 } 1630 } 1631 length = new_length; 1632 } 1633 1634 return length; 1635} 1636 1637 1638static inline bool EmitSimpleCharacter(Isolate* isolate, 1639 RegExpCompiler* compiler, 1640 uc16 c, 1641 Label* on_failure, 1642 int cp_offset, 1643 bool check, 1644 bool preloaded) { 1645 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1646 bool bound_checked = false; 1647 if (!preloaded) { 1648 assembler->LoadCurrentCharacter( 1649 cp_offset, 1650 on_failure, 1651 check); 1652 bound_checked = true; 1653 } 1654 assembler->CheckNotCharacter(c, on_failure); 1655 return bound_checked; 1656} 1657 1658 1659// Only emits non-letters (things that don't have case). Only used for case 1660// independent matches. 1661static inline bool EmitAtomNonLetter(Isolate* isolate, 1662 RegExpCompiler* compiler, 1663 uc16 c, 1664 Label* on_failure, 1665 int cp_offset, 1666 bool check, 1667 bool preloaded) { 1668 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1669 bool one_byte = compiler->one_byte(); 1670 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1671 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1672 if (length < 1) { 1673 // This can't match. Must be an one-byte subject and a non-one-byte 1674 // character. We do not need to do anything since the one-byte pass 1675 // already handled this. 1676 return false; // Bounds not checked. 1677 } 1678 bool checked = false; 1679 // We handle the length > 1 case in a later pass. 1680 if (length == 1) { 1681 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1682 // Can't match - see above. 1683 return false; // Bounds not checked. 1684 } 1685 if (!preloaded) { 1686 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1687 checked = check; 1688 } 1689 macro_assembler->CheckNotCharacter(c, on_failure); 1690 } 1691 return checked; 1692} 1693 1694 1695static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, 1696 bool one_byte, uc16 c1, uc16 c2, 1697 Label* on_failure) { 1698 uc16 char_mask; 1699 if (one_byte) { 1700 char_mask = String::kMaxOneByteCharCode; 1701 } else { 1702 char_mask = String::kMaxUtf16CodeUnit; 1703 } 1704 uc16 exor = c1 ^ c2; 1705 // Check whether exor has only one bit set. 1706 if (((exor - 1) & exor) == 0) { 1707 // If c1 and c2 differ only by one bit. 1708 // Ecma262UnCanonicalize always gives the highest number last. 1709 DCHECK(c2 > c1); 1710 uc16 mask = char_mask ^ exor; 1711 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); 1712 return true; 1713 } 1714 DCHECK(c2 > c1); 1715 uc16 diff = c2 - c1; 1716 if (((diff - 1) & diff) == 0 && c1 >= diff) { 1717 // If the characters differ by 2^n but don't differ by one bit then 1718 // subtract the difference from the found character, then do the or 1719 // trick. We avoid the theoretical case where negative numbers are 1720 // involved in order to simplify code generation. 1721 uc16 mask = char_mask ^ diff; 1722 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1723 diff, 1724 mask, 1725 on_failure); 1726 return true; 1727 } 1728 return false; 1729} 1730 1731 1732typedef bool EmitCharacterFunction(Isolate* isolate, 1733 RegExpCompiler* compiler, 1734 uc16 c, 1735 Label* on_failure, 1736 int cp_offset, 1737 bool check, 1738 bool preloaded); 1739 1740// Only emits letters (things that have case). Only used for case independent 1741// matches. 1742static inline bool EmitAtomLetter(Isolate* isolate, 1743 RegExpCompiler* compiler, 1744 uc16 c, 1745 Label* on_failure, 1746 int cp_offset, 1747 bool check, 1748 bool preloaded) { 1749 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1750 bool one_byte = compiler->one_byte(); 1751 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1752 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1753 if (length <= 1) return false; 1754 // We may not need to check against the end of the input string 1755 // if this character lies before a character that matched. 1756 if (!preloaded) { 1757 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1758 } 1759 Label ok; 1760 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1761 switch (length) { 1762 case 2: { 1763 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1764 chars[1], on_failure)) { 1765 } else { 1766 macro_assembler->CheckCharacter(chars[0], &ok); 1767 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1768 macro_assembler->Bind(&ok); 1769 } 1770 break; 1771 } 1772 case 4: 1773 macro_assembler->CheckCharacter(chars[3], &ok); 1774 // Fall through! 1775 case 3: 1776 macro_assembler->CheckCharacter(chars[0], &ok); 1777 macro_assembler->CheckCharacter(chars[1], &ok); 1778 macro_assembler->CheckNotCharacter(chars[2], on_failure); 1779 macro_assembler->Bind(&ok); 1780 break; 1781 default: 1782 UNREACHABLE(); 1783 break; 1784 } 1785 return true; 1786} 1787 1788 1789static void EmitBoundaryTest(RegExpMacroAssembler* masm, 1790 int border, 1791 Label* fall_through, 1792 Label* above_or_equal, 1793 Label* below) { 1794 if (below != fall_through) { 1795 masm->CheckCharacterLT(border, below); 1796 if (above_or_equal != fall_through) masm->GoTo(above_or_equal); 1797 } else { 1798 masm->CheckCharacterGT(border - 1, above_or_equal); 1799 } 1800} 1801 1802 1803static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, 1804 int first, 1805 int last, 1806 Label* fall_through, 1807 Label* in_range, 1808 Label* out_of_range) { 1809 if (in_range == fall_through) { 1810 if (first == last) { 1811 masm->CheckNotCharacter(first, out_of_range); 1812 } else { 1813 masm->CheckCharacterNotInRange(first, last, out_of_range); 1814 } 1815 } else { 1816 if (first == last) { 1817 masm->CheckCharacter(first, in_range); 1818 } else { 1819 masm->CheckCharacterInRange(first, last, in_range); 1820 } 1821 if (out_of_range != fall_through) masm->GoTo(out_of_range); 1822 } 1823} 1824 1825 1826// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. 1827// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 1828static void EmitUseLookupTable( 1829 RegExpMacroAssembler* masm, 1830 ZoneList<int>* ranges, 1831 int start_index, 1832 int end_index, 1833 int min_char, 1834 Label* fall_through, 1835 Label* even_label, 1836 Label* odd_label) { 1837 static const int kSize = RegExpMacroAssembler::kTableSize; 1838 static const int kMask = RegExpMacroAssembler::kTableMask; 1839 1840 int base = (min_char & ~kMask); 1841 USE(base); 1842 1843 // Assert that everything is on one kTableSize page. 1844 for (int i = start_index; i <= end_index; i++) { 1845 DCHECK_EQ(ranges->at(i) & ~kMask, base); 1846 } 1847 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base); 1848 1849 char templ[kSize]; 1850 Label* on_bit_set; 1851 Label* on_bit_clear; 1852 int bit; 1853 if (even_label == fall_through) { 1854 on_bit_set = odd_label; 1855 on_bit_clear = even_label; 1856 bit = 1; 1857 } else { 1858 on_bit_set = even_label; 1859 on_bit_clear = odd_label; 1860 bit = 0; 1861 } 1862 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) { 1863 templ[i] = bit; 1864 } 1865 int j = 0; 1866 bit ^= 1; 1867 for (int i = start_index; i < end_index; i++) { 1868 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) { 1869 templ[j] = bit; 1870 } 1871 bit ^= 1; 1872 } 1873 for (int i = j; i < kSize; i++) { 1874 templ[i] = bit; 1875 } 1876 Factory* factory = masm->isolate()->factory(); 1877 // TODO(erikcorry): Cache these. 1878 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED); 1879 for (int i = 0; i < kSize; i++) { 1880 ba->set(i, templ[i]); 1881 } 1882 masm->CheckBitInTable(ba, on_bit_set); 1883 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); 1884} 1885 1886 1887static void CutOutRange(RegExpMacroAssembler* masm, 1888 ZoneList<int>* ranges, 1889 int start_index, 1890 int end_index, 1891 int cut_index, 1892 Label* even_label, 1893 Label* odd_label) { 1894 bool odd = (((cut_index - start_index) & 1) == 1); 1895 Label* in_range_label = odd ? odd_label : even_label; 1896 Label dummy; 1897 EmitDoubleBoundaryTest(masm, 1898 ranges->at(cut_index), 1899 ranges->at(cut_index + 1) - 1, 1900 &dummy, 1901 in_range_label, 1902 &dummy); 1903 DCHECK(!dummy.is_linked()); 1904 // Cut out the single range by rewriting the array. This creates a new 1905 // range that is a merger of the two ranges on either side of the one we 1906 // are cutting out. The oddity of the labels is preserved. 1907 for (int j = cut_index; j > start_index; j--) { 1908 ranges->at(j) = ranges->at(j - 1); 1909 } 1910 for (int j = cut_index + 1; j < end_index; j++) { 1911 ranges->at(j) = ranges->at(j + 1); 1912 } 1913} 1914 1915 1916// Unicode case. Split the search space into kSize spaces that are handled 1917// with recursion. 1918static void SplitSearchSpace(ZoneList<int>* ranges, 1919 int start_index, 1920 int end_index, 1921 int* new_start_index, 1922 int* new_end_index, 1923 int* border) { 1924 static const int kSize = RegExpMacroAssembler::kTableSize; 1925 static const int kMask = RegExpMacroAssembler::kTableMask; 1926 1927 int first = ranges->at(start_index); 1928 int last = ranges->at(end_index) - 1; 1929 1930 *new_start_index = start_index; 1931 *border = (ranges->at(start_index) & ~kMask) + kSize; 1932 while (*new_start_index < end_index) { 1933 if (ranges->at(*new_start_index) > *border) break; 1934 (*new_start_index)++; 1935 } 1936 // new_start_index is the index of the first edge that is beyond the 1937 // current kSize space. 1938 1939 // For very large search spaces we do a binary chop search of the non-Latin1 1940 // space instead of just going to the end of the current kSize space. The 1941 // heuristics are complicated a little by the fact that any 128-character 1942 // encoding space can be quickly tested with a table lookup, so we don't 1943 // wish to do binary chop search at a smaller granularity than that. A 1944 // 128-character space can take up a lot of space in the ranges array if, 1945 // for example, we only want to match every second character (eg. the lower 1946 // case characters on some Unicode pages). 1947 int binary_chop_index = (end_index + start_index) / 2; 1948 // The first test ensures that we get to the code that handles the Latin1 1949 // range with a single not-taken branch, speeding up this important 1950 // character range (even non-Latin1 charset-based text has spaces and 1951 // punctuation). 1952 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. 1953 end_index - start_index > (*new_start_index - start_index) * 2 && 1954 last - first > kSize * 2 && binary_chop_index > *new_start_index && 1955 ranges->at(binary_chop_index) >= first + 2 * kSize) { 1956 int scan_forward_for_section_border = binary_chop_index;; 1957 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; 1958 1959 while (scan_forward_for_section_border < end_index) { 1960 if (ranges->at(scan_forward_for_section_border) > new_border) { 1961 *new_start_index = scan_forward_for_section_border; 1962 *border = new_border; 1963 break; 1964 } 1965 scan_forward_for_section_border++; 1966 } 1967 } 1968 1969 DCHECK(*new_start_index > start_index); 1970 *new_end_index = *new_start_index - 1; 1971 if (ranges->at(*new_end_index) == *border) { 1972 (*new_end_index)--; 1973 } 1974 if (*border >= ranges->at(end_index)) { 1975 *border = ranges->at(end_index); 1976 *new_start_index = end_index; // Won't be used. 1977 *new_end_index = end_index - 1; 1978 } 1979} 1980 1981 1982// Gets a series of segment boundaries representing a character class. If the 1983// character is in the range between an even and an odd boundary (counting from 1984// start_index) then go to even_label, otherwise go to odd_label. We already 1985// know that the character is in the range of min_char to max_char inclusive. 1986// Either label can be NULL indicating backtracking. Either label can also be 1987// equal to the fall_through label. 1988static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges, 1989 int start_index, int end_index, uc32 min_char, 1990 uc32 max_char, Label* fall_through, 1991 Label* even_label, Label* odd_label) { 1992 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); 1993 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); 1994 1995 int first = ranges->at(start_index); 1996 int last = ranges->at(end_index) - 1; 1997 1998 DCHECK_LT(min_char, first); 1999 2000 // Just need to test if the character is before or on-or-after 2001 // a particular character. 2002 if (start_index == end_index) { 2003 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label); 2004 return; 2005 } 2006 2007 // Another almost trivial case: There is one interval in the middle that is 2008 // different from the end intervals. 2009 if (start_index + 1 == end_index) { 2010 EmitDoubleBoundaryTest( 2011 masm, first, last, fall_through, even_label, odd_label); 2012 return; 2013 } 2014 2015 // It's not worth using table lookup if there are very few intervals in the 2016 // character class. 2017 if (end_index - start_index <= 6) { 2018 // It is faster to test for individual characters, so we look for those 2019 // first, then try arbitrary ranges in the second round. 2020 static int kNoCutIndex = -1; 2021 int cut = kNoCutIndex; 2022 for (int i = start_index; i < end_index; i++) { 2023 if (ranges->at(i) == ranges->at(i + 1) - 1) { 2024 cut = i; 2025 break; 2026 } 2027 } 2028 if (cut == kNoCutIndex) cut = start_index; 2029 CutOutRange( 2030 masm, ranges, start_index, end_index, cut, even_label, odd_label); 2031 DCHECK_GE(end_index - start_index, 2); 2032 GenerateBranches(masm, 2033 ranges, 2034 start_index + 1, 2035 end_index - 1, 2036 min_char, 2037 max_char, 2038 fall_through, 2039 even_label, 2040 odd_label); 2041 return; 2042 } 2043 2044 // If there are a lot of intervals in the regexp, then we will use tables to 2045 // determine whether the character is inside or outside the character class. 2046 static const int kBits = RegExpMacroAssembler::kTableSizeBits; 2047 2048 if ((max_char >> kBits) == (min_char >> kBits)) { 2049 EmitUseLookupTable(masm, 2050 ranges, 2051 start_index, 2052 end_index, 2053 min_char, 2054 fall_through, 2055 even_label, 2056 odd_label); 2057 return; 2058 } 2059 2060 if ((min_char >> kBits) != (first >> kBits)) { 2061 masm->CheckCharacterLT(first, odd_label); 2062 GenerateBranches(masm, 2063 ranges, 2064 start_index + 1, 2065 end_index, 2066 first, 2067 max_char, 2068 fall_through, 2069 odd_label, 2070 even_label); 2071 return; 2072 } 2073 2074 int new_start_index = 0; 2075 int new_end_index = 0; 2076 int border = 0; 2077 2078 SplitSearchSpace(ranges, 2079 start_index, 2080 end_index, 2081 &new_start_index, 2082 &new_end_index, 2083 &border); 2084 2085 Label handle_rest; 2086 Label* above = &handle_rest; 2087 if (border == last + 1) { 2088 // We didn't find any section that started after the limit, so everything 2089 // above the border is one of the terminal labels. 2090 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label; 2091 DCHECK(new_end_index == end_index - 1); 2092 } 2093 2094 DCHECK_LE(start_index, new_end_index); 2095 DCHECK_LE(new_start_index, end_index); 2096 DCHECK_LT(start_index, new_start_index); 2097 DCHECK_LT(new_end_index, end_index); 2098 DCHECK(new_end_index + 1 == new_start_index || 2099 (new_end_index + 2 == new_start_index && 2100 border == ranges->at(new_end_index + 1))); 2101 DCHECK_LT(min_char, border - 1); 2102 DCHECK_LT(border, max_char); 2103 DCHECK_LT(ranges->at(new_end_index), border); 2104 DCHECK(border < ranges->at(new_start_index) || 2105 (border == ranges->at(new_start_index) && 2106 new_start_index == end_index && 2107 new_end_index == end_index - 1 && 2108 border == last + 1)); 2109 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1)); 2110 2111 masm->CheckCharacterGT(border - 1, above); 2112 Label dummy; 2113 GenerateBranches(masm, 2114 ranges, 2115 start_index, 2116 new_end_index, 2117 min_char, 2118 border - 1, 2119 &dummy, 2120 even_label, 2121 odd_label); 2122 if (handle_rest.is_linked()) { 2123 masm->Bind(&handle_rest); 2124 bool flip = (new_start_index & 1) != (start_index & 1); 2125 GenerateBranches(masm, 2126 ranges, 2127 new_start_index, 2128 end_index, 2129 border, 2130 max_char, 2131 &dummy, 2132 flip ? odd_label : even_label, 2133 flip ? even_label : odd_label); 2134 } 2135} 2136 2137 2138static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 2139 RegExpCharacterClass* cc, bool one_byte, 2140 Label* on_failure, int cp_offset, bool check_offset, 2141 bool preloaded, Zone* zone) { 2142 ZoneList<CharacterRange>* ranges = cc->ranges(zone); 2143 CharacterRange::Canonicalize(ranges); 2144 2145 int max_char; 2146 if (one_byte) { 2147 max_char = String::kMaxOneByteCharCode; 2148 } else { 2149 max_char = String::kMaxUtf16CodeUnit; 2150 } 2151 2152 int range_count = ranges->length(); 2153 2154 int last_valid_range = range_count - 1; 2155 while (last_valid_range >= 0) { 2156 CharacterRange& range = ranges->at(last_valid_range); 2157 if (range.from() <= max_char) { 2158 break; 2159 } 2160 last_valid_range--; 2161 } 2162 2163 if (last_valid_range < 0) { 2164 if (!cc->is_negated()) { 2165 macro_assembler->GoTo(on_failure); 2166 } 2167 if (check_offset) { 2168 macro_assembler->CheckPosition(cp_offset, on_failure); 2169 } 2170 return; 2171 } 2172 2173 if (last_valid_range == 0 && 2174 ranges->at(0).IsEverything(max_char)) { 2175 if (cc->is_negated()) { 2176 macro_assembler->GoTo(on_failure); 2177 } else { 2178 // This is a common case hit by non-anchored expressions. 2179 if (check_offset) { 2180 macro_assembler->CheckPosition(cp_offset, on_failure); 2181 } 2182 } 2183 return; 2184 } 2185 2186 if (!preloaded) { 2187 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); 2188 } 2189 2190 if (cc->is_standard(zone) && 2191 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(), 2192 on_failure)) { 2193 return; 2194 } 2195 2196 2197 // A new list with ascending entries. Each entry is a code unit 2198 // where there is a boundary between code units that are part of 2199 // the class and code units that are not. Normally we insert an 2200 // entry at zero which goes to the failure label, but if there 2201 // was already one there we fall through for success on that entry. 2202 // Subsequent entries have alternating meaning (success/failure). 2203 ZoneList<int>* range_boundaries = 2204 new(zone) ZoneList<int>(last_valid_range, zone); 2205 2206 bool zeroth_entry_is_failure = !cc->is_negated(); 2207 2208 for (int i = 0; i <= last_valid_range; i++) { 2209 CharacterRange& range = ranges->at(i); 2210 if (range.from() == 0) { 2211 DCHECK_EQ(i, 0); 2212 zeroth_entry_is_failure = !zeroth_entry_is_failure; 2213 } else { 2214 range_boundaries->Add(range.from(), zone); 2215 } 2216 range_boundaries->Add(range.to() + 1, zone); 2217 } 2218 int end_index = range_boundaries->length() - 1; 2219 if (range_boundaries->at(end_index) > max_char) { 2220 end_index--; 2221 } 2222 2223 Label fall_through; 2224 GenerateBranches(macro_assembler, 2225 range_boundaries, 2226 0, // start_index. 2227 end_index, 2228 0, // min_char. 2229 max_char, 2230 &fall_through, 2231 zeroth_entry_is_failure ? &fall_through : on_failure, 2232 zeroth_entry_is_failure ? on_failure : &fall_through); 2233 macro_assembler->Bind(&fall_through); 2234} 2235 2236 2237RegExpNode::~RegExpNode() { 2238} 2239 2240 2241RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, 2242 Trace* trace) { 2243 // If we are generating a greedy loop then don't stop and don't reuse code. 2244 if (trace->stop_node() != NULL) { 2245 return CONTINUE; 2246 } 2247 2248 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 2249 if (trace->is_trivial()) { 2250 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) { 2251 // If a generic version is already scheduled to be generated or we have 2252 // recursed too deeply then just generate a jump to that code. 2253 macro_assembler->GoTo(&label_); 2254 // This will queue it up for generation of a generic version if it hasn't 2255 // already been queued. 2256 compiler->AddWork(this); 2257 return DONE; 2258 } 2259 // Generate generic version of the node and bind the label for later use. 2260 macro_assembler->Bind(&label_); 2261 return CONTINUE; 2262 } 2263 2264 // We are being asked to make a non-generic version. Keep track of how many 2265 // non-generic versions we generate so as not to overdo it. 2266 trace_count_++; 2267 if (KeepRecursing(compiler) && compiler->optimize() && 2268 trace_count_ < kMaxCopiesCodeGenerated) { 2269 return CONTINUE; 2270 } 2271 2272 // If we get here code has been generated for this node too many times or 2273 // recursion is too deep. Time to switch to a generic version. The code for 2274 // generic versions above can handle deep recursion properly. 2275 bool was_limiting = compiler->limiting_recursion(); 2276 compiler->set_limiting_recursion(true); 2277 trace->Flush(compiler, this); 2278 compiler->set_limiting_recursion(was_limiting); 2279 return DONE; 2280} 2281 2282 2283bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) { 2284 return !compiler->limiting_recursion() && 2285 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion; 2286} 2287 2288 2289int ActionNode::EatsAtLeast(int still_to_find, 2290 int budget, 2291 bool not_at_start) { 2292 if (budget <= 0) return 0; 2293 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2294 return on_success()->EatsAtLeast(still_to_find, 2295 budget - 1, 2296 not_at_start); 2297} 2298 2299 2300void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2301 BoyerMooreLookahead* bm, bool not_at_start) { 2302 if (action_type_ == BEGIN_SUBMATCH) { 2303 bm->SetRest(offset); 2304 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2305 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2306 } 2307 SaveBMInfo(bm, not_at_start, offset); 2308} 2309 2310 2311int AssertionNode::EatsAtLeast(int still_to_find, 2312 int budget, 2313 bool not_at_start) { 2314 if (budget <= 0) return 0; 2315 // If we know we are not at the start and we are asked "how many characters 2316 // will you match if you succeed?" then we can answer anything since false 2317 // implies false. So lets just return the max answer (still_to_find) since 2318 // that won't prevent us from preloading a lot of characters for the other 2319 // branches in the node graph. 2320 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2321 return on_success()->EatsAtLeast(still_to_find, 2322 budget - 1, 2323 not_at_start); 2324} 2325 2326 2327void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2328 BoyerMooreLookahead* bm, bool not_at_start) { 2329 // Match the behaviour of EatsAtLeast on this node. 2330 if (assertion_type() == AT_START && not_at_start) return; 2331 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2332 SaveBMInfo(bm, not_at_start, offset); 2333} 2334 2335 2336int BackReferenceNode::EatsAtLeast(int still_to_find, 2337 int budget, 2338 bool not_at_start) { 2339 if (read_backward()) return 0; 2340 if (budget <= 0) return 0; 2341 return on_success()->EatsAtLeast(still_to_find, 2342 budget - 1, 2343 not_at_start); 2344} 2345 2346 2347int TextNode::EatsAtLeast(int still_to_find, 2348 int budget, 2349 bool not_at_start) { 2350 if (read_backward()) return 0; 2351 int answer = Length(); 2352 if (answer >= still_to_find) return answer; 2353 if (budget <= 0) return answer; 2354 // We are not at start after this node so we set the last argument to 'true'. 2355 return answer + on_success()->EatsAtLeast(still_to_find - answer, 2356 budget - 1, 2357 true); 2358} 2359 2360 2361int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget, 2362 bool not_at_start) { 2363 if (budget <= 0) return 0; 2364 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2365 // afterwards. 2366 RegExpNode* node = alternatives_->at(1).node(); 2367 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start); 2368} 2369 2370 2371void NegativeLookaroundChoiceNode::GetQuickCheckDetails( 2372 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, 2373 bool not_at_start) { 2374 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2375 // afterwards. 2376 RegExpNode* node = alternatives_->at(1).node(); 2377 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); 2378} 2379 2380 2381int ChoiceNode::EatsAtLeastHelper(int still_to_find, 2382 int budget, 2383 RegExpNode* ignore_this_node, 2384 bool not_at_start) { 2385 if (budget <= 0) return 0; 2386 int min = 100; 2387 int choice_count = alternatives_->length(); 2388 budget = (budget - 1) / choice_count; 2389 for (int i = 0; i < choice_count; i++) { 2390 RegExpNode* node = alternatives_->at(i).node(); 2391 if (node == ignore_this_node) continue; 2392 int node_eats_at_least = 2393 node->EatsAtLeast(still_to_find, budget, not_at_start); 2394 if (node_eats_at_least < min) min = node_eats_at_least; 2395 if (min == 0) return 0; 2396 } 2397 return min; 2398} 2399 2400 2401int LoopChoiceNode::EatsAtLeast(int still_to_find, 2402 int budget, 2403 bool not_at_start) { 2404 return EatsAtLeastHelper(still_to_find, 2405 budget - 1, 2406 loop_node_, 2407 not_at_start); 2408} 2409 2410 2411int ChoiceNode::EatsAtLeast(int still_to_find, 2412 int budget, 2413 bool not_at_start) { 2414 return EatsAtLeastHelper(still_to_find, 2415 budget, 2416 NULL, 2417 not_at_start); 2418} 2419 2420 2421// Takes the left-most 1-bit and smears it out, setting all bits to its right. 2422static inline uint32_t SmearBitsRight(uint32_t v) { 2423 v |= v >> 1; 2424 v |= v >> 2; 2425 v |= v >> 4; 2426 v |= v >> 8; 2427 v |= v >> 16; 2428 return v; 2429} 2430 2431 2432bool QuickCheckDetails::Rationalize(bool asc) { 2433 bool found_useful_op = false; 2434 uint32_t char_mask; 2435 if (asc) { 2436 char_mask = String::kMaxOneByteCharCode; 2437 } else { 2438 char_mask = String::kMaxUtf16CodeUnit; 2439 } 2440 mask_ = 0; 2441 value_ = 0; 2442 int char_shift = 0; 2443 for (int i = 0; i < characters_; i++) { 2444 Position* pos = &positions_[i]; 2445 if ((pos->mask & String::kMaxOneByteCharCode) != 0) { 2446 found_useful_op = true; 2447 } 2448 mask_ |= (pos->mask & char_mask) << char_shift; 2449 value_ |= (pos->value & char_mask) << char_shift; 2450 char_shift += asc ? 8 : 16; 2451 } 2452 return found_useful_op; 2453} 2454 2455 2456bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, 2457 Trace* bounds_check_trace, 2458 Trace* trace, 2459 bool preload_has_checked_bounds, 2460 Label* on_possible_success, 2461 QuickCheckDetails* details, 2462 bool fall_through_on_failure) { 2463 if (details->characters() == 0) return false; 2464 GetQuickCheckDetails( 2465 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); 2466 if (details->cannot_match()) return false; 2467 if (!details->Rationalize(compiler->one_byte())) return false; 2468 DCHECK(details->characters() == 1 || 2469 compiler->macro_assembler()->CanReadUnaligned()); 2470 uint32_t mask = details->mask(); 2471 uint32_t value = details->value(); 2472 2473 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 2474 2475 if (trace->characters_preloaded() != details->characters()) { 2476 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset()); 2477 // We are attempting to preload the minimum number of characters 2478 // any choice would eat, so if the bounds check fails, then none of the 2479 // choices can succeed, so we can just immediately backtrack, rather 2480 // than go to the next choice. 2481 assembler->LoadCurrentCharacter(trace->cp_offset(), 2482 bounds_check_trace->backtrack(), 2483 !preload_has_checked_bounds, 2484 details->characters()); 2485 } 2486 2487 2488 bool need_mask = true; 2489 2490 if (details->characters() == 1) { 2491 // If number of characters preloaded is 1 then we used a byte or 16 bit 2492 // load so the value is already masked down. 2493 uint32_t char_mask; 2494 if (compiler->one_byte()) { 2495 char_mask = String::kMaxOneByteCharCode; 2496 } else { 2497 char_mask = String::kMaxUtf16CodeUnit; 2498 } 2499 if ((mask & char_mask) == char_mask) need_mask = false; 2500 mask &= char_mask; 2501 } else { 2502 // For 2-character preloads in one-byte mode or 1-character preloads in 2503 // two-byte mode we also use a 16 bit load with zero extend. 2504 static const uint32_t kTwoByteMask = 0xffff; 2505 static const uint32_t kFourByteMask = 0xffffffff; 2506 if (details->characters() == 2 && compiler->one_byte()) { 2507 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2508 } else if (details->characters() == 1 && !compiler->one_byte()) { 2509 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false; 2510 } else { 2511 if (mask == kFourByteMask) need_mask = false; 2512 } 2513 } 2514 2515 if (fall_through_on_failure) { 2516 if (need_mask) { 2517 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); 2518 } else { 2519 assembler->CheckCharacter(value, on_possible_success); 2520 } 2521 } else { 2522 if (need_mask) { 2523 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack()); 2524 } else { 2525 assembler->CheckNotCharacter(value, trace->backtrack()); 2526 } 2527 } 2528 return true; 2529} 2530 2531 2532// Here is the meat of GetQuickCheckDetails (see also the comment on the 2533// super-class in the .h file). 2534// 2535// We iterate along the text object, building up for each character a 2536// mask and value that can be used to test for a quick failure to match. 2537// The masks and values for the positions will be combined into a single 2538// machine word for the current character width in order to be used in 2539// generating a quick check. 2540void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2541 RegExpCompiler* compiler, 2542 int characters_filled_in, 2543 bool not_at_start) { 2544 // Do not collect any quick check details if the text node reads backward, 2545 // since it reads in the opposite direction than we use for quick checks. 2546 if (read_backward()) return; 2547 Isolate* isolate = compiler->macro_assembler()->isolate(); 2548 DCHECK(characters_filled_in < details->characters()); 2549 int characters = details->characters(); 2550 int char_mask; 2551 if (compiler->one_byte()) { 2552 char_mask = String::kMaxOneByteCharCode; 2553 } else { 2554 char_mask = String::kMaxUtf16CodeUnit; 2555 } 2556 for (int k = 0; k < elements()->length(); k++) { 2557 TextElement elm = elements()->at(k); 2558 if (elm.text_type() == TextElement::ATOM) { 2559 Vector<const uc16> quarks = elm.atom()->data(); 2560 for (int i = 0; i < characters && i < quarks.length(); i++) { 2561 QuickCheckDetails::Position* pos = 2562 details->positions(characters_filled_in); 2563 uc16 c = quarks[i]; 2564 if (compiler->ignore_case()) { 2565 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2566 int length = GetCaseIndependentLetters(isolate, c, 2567 compiler->one_byte(), chars); 2568 if (length == 0) { 2569 // This can happen because all case variants are non-Latin1, but we 2570 // know the input is Latin1. 2571 details->set_cannot_match(); 2572 pos->determines_perfectly = false; 2573 return; 2574 } 2575 if (length == 1) { 2576 // This letter has no case equivalents, so it's nice and simple 2577 // and the mask-compare will determine definitely whether we have 2578 // a match at this character position. 2579 pos->mask = char_mask; 2580 pos->value = c; 2581 pos->determines_perfectly = true; 2582 } else { 2583 uint32_t common_bits = char_mask; 2584 uint32_t bits = chars[0]; 2585 for (int j = 1; j < length; j++) { 2586 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits); 2587 common_bits ^= differing_bits; 2588 bits &= common_bits; 2589 } 2590 // If length is 2 and common bits has only one zero in it then 2591 // our mask and compare instruction will determine definitely 2592 // whether we have a match at this character position. Otherwise 2593 // it can only be an approximate check. 2594 uint32_t one_zero = (common_bits | ~char_mask); 2595 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2596 pos->determines_perfectly = true; 2597 } 2598 pos->mask = common_bits; 2599 pos->value = bits; 2600 } 2601 } else { 2602 // Don't ignore case. Nice simple case where the mask-compare will 2603 // determine definitely whether we have a match at this character 2604 // position. 2605 if (c > char_mask) { 2606 details->set_cannot_match(); 2607 pos->determines_perfectly = false; 2608 return; 2609 } 2610 pos->mask = char_mask; 2611 pos->value = c; 2612 pos->determines_perfectly = true; 2613 } 2614 characters_filled_in++; 2615 DCHECK(characters_filled_in <= details->characters()); 2616 if (characters_filled_in == details->characters()) { 2617 return; 2618 } 2619 } 2620 } else { 2621 QuickCheckDetails::Position* pos = 2622 details->positions(characters_filled_in); 2623 RegExpCharacterClass* tree = elm.char_class(); 2624 ZoneList<CharacterRange>* ranges = tree->ranges(zone()); 2625 if (tree->is_negated()) { 2626 // A quick check uses multi-character mask and compare. There is no 2627 // useful way to incorporate a negative char class into this scheme 2628 // so we just conservatively create a mask and value that will always 2629 // succeed. 2630 pos->mask = 0; 2631 pos->value = 0; 2632 } else { 2633 int first_range = 0; 2634 while (ranges->at(first_range).from() > char_mask) { 2635 first_range++; 2636 if (first_range == ranges->length()) { 2637 details->set_cannot_match(); 2638 pos->determines_perfectly = false; 2639 return; 2640 } 2641 } 2642 CharacterRange range = ranges->at(first_range); 2643 uc16 from = range.from(); 2644 uc16 to = range.to(); 2645 if (to > char_mask) { 2646 to = char_mask; 2647 } 2648 uint32_t differing_bits = (from ^ to); 2649 // A mask and compare is only perfect if the differing bits form a 2650 // number like 00011111 with one single block of trailing 1s. 2651 if ((differing_bits & (differing_bits + 1)) == 0 && 2652 from + differing_bits == to) { 2653 pos->determines_perfectly = true; 2654 } 2655 uint32_t common_bits = ~SmearBitsRight(differing_bits); 2656 uint32_t bits = (from & common_bits); 2657 for (int i = first_range + 1; i < ranges->length(); i++) { 2658 CharacterRange range = ranges->at(i); 2659 uc16 from = range.from(); 2660 uc16 to = range.to(); 2661 if (from > char_mask) continue; 2662 if (to > char_mask) to = char_mask; 2663 // Here we are combining more ranges into the mask and compare 2664 // value. With each new range the mask becomes more sparse and 2665 // so the chances of a false positive rise. A character class 2666 // with multiple ranges is assumed never to be equivalent to a 2667 // mask and compare operation. 2668 pos->determines_perfectly = false; 2669 uint32_t new_common_bits = (from ^ to); 2670 new_common_bits = ~SmearBitsRight(new_common_bits); 2671 common_bits &= new_common_bits; 2672 bits &= new_common_bits; 2673 uint32_t differing_bits = (from & common_bits) ^ bits; 2674 common_bits ^= differing_bits; 2675 bits &= common_bits; 2676 } 2677 pos->mask = common_bits; 2678 pos->value = bits; 2679 } 2680 characters_filled_in++; 2681 DCHECK(characters_filled_in <= details->characters()); 2682 if (characters_filled_in == details->characters()) { 2683 return; 2684 } 2685 } 2686 } 2687 DCHECK(characters_filled_in != details->characters()); 2688 if (!details->cannot_match()) { 2689 on_success()-> GetQuickCheckDetails(details, 2690 compiler, 2691 characters_filled_in, 2692 true); 2693 } 2694} 2695 2696 2697void QuickCheckDetails::Clear() { 2698 for (int i = 0; i < characters_; i++) { 2699 positions_[i].mask = 0; 2700 positions_[i].value = 0; 2701 positions_[i].determines_perfectly = false; 2702 } 2703 characters_ = 0; 2704} 2705 2706 2707void QuickCheckDetails::Advance(int by, bool one_byte) { 2708 if (by >= characters_ || by < 0) { 2709 DCHECK_IMPLIES(by < 0, characters_ == 0); 2710 Clear(); 2711 return; 2712 } 2713 DCHECK_LE(characters_ - by, 4); 2714 DCHECK_LE(characters_, 4); 2715 for (int i = 0; i < characters_ - by; i++) { 2716 positions_[i] = positions_[by + i]; 2717 } 2718 for (int i = characters_ - by; i < characters_; i++) { 2719 positions_[i].mask = 0; 2720 positions_[i].value = 0; 2721 positions_[i].determines_perfectly = false; 2722 } 2723 characters_ -= by; 2724 // We could change mask_ and value_ here but we would never advance unless 2725 // they had already been used in a check and they won't be used again because 2726 // it would gain us nothing. So there's no point. 2727} 2728 2729 2730void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { 2731 DCHECK(characters_ == other->characters_); 2732 if (other->cannot_match_) { 2733 return; 2734 } 2735 if (cannot_match_) { 2736 *this = *other; 2737 return; 2738 } 2739 for (int i = from_index; i < characters_; i++) { 2740 QuickCheckDetails::Position* pos = positions(i); 2741 QuickCheckDetails::Position* other_pos = other->positions(i); 2742 if (pos->mask != other_pos->mask || 2743 pos->value != other_pos->value || 2744 !other_pos->determines_perfectly) { 2745 // Our mask-compare operation will be approximate unless we have the 2746 // exact same operation on both sides of the alternation. 2747 pos->determines_perfectly = false; 2748 } 2749 pos->mask &= other_pos->mask; 2750 pos->value &= pos->mask; 2751 other_pos->value &= pos->mask; 2752 uc16 differing_bits = (pos->value ^ other_pos->value); 2753 pos->mask &= ~differing_bits; 2754 pos->value &= pos->mask; 2755 } 2756} 2757 2758 2759class VisitMarker { 2760 public: 2761 explicit VisitMarker(NodeInfo* info) : info_(info) { 2762 DCHECK(!info->visited); 2763 info->visited = true; 2764 } 2765 ~VisitMarker() { 2766 info_->visited = false; 2767 } 2768 private: 2769 NodeInfo* info_; 2770}; 2771 2772 2773RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { 2774 if (info()->replacement_calculated) return replacement(); 2775 if (depth < 0) return this; 2776 DCHECK(!info()->visited); 2777 VisitMarker marker(info()); 2778 return FilterSuccessor(depth - 1, ignore_case); 2779} 2780 2781 2782RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2783 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2784 if (next == NULL) return set_replacement(NULL); 2785 on_success_ = next; 2786 return set_replacement(this); 2787} 2788 2789 2790// We need to check for the following characters: 0x39c 0x3bc 0x178. 2791static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2792 // TODO(dcarney): this could be a lot more efficient. 2793 return range.Contains(0x39c) || 2794 range.Contains(0x3bc) || range.Contains(0x178); 2795} 2796 2797 2798static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2799 for (int i = 0; i < ranges->length(); i++) { 2800 // TODO(dcarney): this could be a lot more efficient. 2801 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2802 } 2803 return false; 2804} 2805 2806 2807RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2808 if (info()->replacement_calculated) return replacement(); 2809 if (depth < 0) return this; 2810 DCHECK(!info()->visited); 2811 VisitMarker marker(info()); 2812 int element_count = elements()->length(); 2813 for (int i = 0; i < element_count; i++) { 2814 TextElement elm = elements()->at(i); 2815 if (elm.text_type() == TextElement::ATOM) { 2816 Vector<const uc16> quarks = elm.atom()->data(); 2817 for (int j = 0; j < quarks.length(); j++) { 2818 uint16_t c = quarks[j]; 2819 if (c <= String::kMaxOneByteCharCode) continue; 2820 if (!ignore_case) return set_replacement(NULL); 2821 // Here, we need to check for characters whose upper and lower cases 2822 // are outside the Latin-1 range. 2823 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2824 // Character is outside Latin-1 completely 2825 if (converted == 0) return set_replacement(NULL); 2826 // Convert quark to Latin-1 in place. 2827 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2828 copy[j] = converted; 2829 } 2830 } else { 2831 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2832 RegExpCharacterClass* cc = elm.char_class(); 2833 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2834 CharacterRange::Canonicalize(ranges); 2835 // Now they are in order so we only need to look at the first. 2836 int range_count = ranges->length(); 2837 if (cc->is_negated()) { 2838 if (range_count != 0 && 2839 ranges->at(0).from() == 0 && 2840 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2841 // This will be handled in a later filter. 2842 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2843 return set_replacement(NULL); 2844 } 2845 } else { 2846 if (range_count == 0 || 2847 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2848 // This will be handled in a later filter. 2849 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2850 return set_replacement(NULL); 2851 } 2852 } 2853 } 2854 } 2855 return FilterSuccessor(depth - 1, ignore_case); 2856} 2857 2858 2859RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2860 if (info()->replacement_calculated) return replacement(); 2861 if (depth < 0) return this; 2862 if (info()->visited) return this; 2863 { 2864 VisitMarker marker(info()); 2865 2866 RegExpNode* continue_replacement = 2867 continue_node_->FilterOneByte(depth - 1, ignore_case); 2868 // If we can't continue after the loop then there is no sense in doing the 2869 // loop. 2870 if (continue_replacement == NULL) return set_replacement(NULL); 2871 } 2872 2873 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2874} 2875 2876 2877RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { 2878 if (info()->replacement_calculated) return replacement(); 2879 if (depth < 0) return this; 2880 if (info()->visited) return this; 2881 VisitMarker marker(info()); 2882 int choice_count = alternatives_->length(); 2883 2884 for (int i = 0; i < choice_count; i++) { 2885 GuardedAlternative alternative = alternatives_->at(i); 2886 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2887 set_replacement(this); 2888 return this; 2889 } 2890 } 2891 2892 int surviving = 0; 2893 RegExpNode* survivor = NULL; 2894 for (int i = 0; i < choice_count; i++) { 2895 GuardedAlternative alternative = alternatives_->at(i); 2896 RegExpNode* replacement = 2897 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2898 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2899 if (replacement != NULL) { 2900 alternatives_->at(i).set_node(replacement); 2901 surviving++; 2902 survivor = replacement; 2903 } 2904 } 2905 if (surviving < 2) return set_replacement(survivor); 2906 2907 set_replacement(this); 2908 if (surviving == choice_count) { 2909 return this; 2910 } 2911 // Only some of the nodes survived the filtering. We need to rebuild the 2912 // alternatives list. 2913 ZoneList<GuardedAlternative>* new_alternatives = 2914 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2915 for (int i = 0; i < choice_count; i++) { 2916 RegExpNode* replacement = 2917 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2918 if (replacement != NULL) { 2919 alternatives_->at(i).set_node(replacement); 2920 new_alternatives->Add(alternatives_->at(i), zone()); 2921 } 2922 } 2923 alternatives_ = new_alternatives; 2924 return this; 2925} 2926 2927 2928RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2929 bool ignore_case) { 2930 if (info()->replacement_calculated) return replacement(); 2931 if (depth < 0) return this; 2932 if (info()->visited) return this; 2933 VisitMarker marker(info()); 2934 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2935 // afterwards. 2936 RegExpNode* node = alternatives_->at(1).node(); 2937 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2938 if (replacement == NULL) return set_replacement(NULL); 2939 alternatives_->at(1).set_node(replacement); 2940 2941 RegExpNode* neg_node = alternatives_->at(0).node(); 2942 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2943 // If the negative lookahead is always going to fail then 2944 // we don't need to check it. 2945 if (neg_replacement == NULL) return set_replacement(replacement); 2946 alternatives_->at(0).set_node(neg_replacement); 2947 return set_replacement(this); 2948} 2949 2950 2951void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2952 RegExpCompiler* compiler, 2953 int characters_filled_in, 2954 bool not_at_start) { 2955 if (body_can_be_zero_length_ || info()->visited) return; 2956 VisitMarker marker(info()); 2957 return ChoiceNode::GetQuickCheckDetails(details, 2958 compiler, 2959 characters_filled_in, 2960 not_at_start); 2961} 2962 2963 2964void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2965 BoyerMooreLookahead* bm, bool not_at_start) { 2966 if (body_can_be_zero_length_ || budget <= 0) { 2967 bm->SetRest(offset); 2968 SaveBMInfo(bm, not_at_start, offset); 2969 return; 2970 } 2971 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2972 SaveBMInfo(bm, not_at_start, offset); 2973} 2974 2975 2976void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2977 RegExpCompiler* compiler, 2978 int characters_filled_in, 2979 bool not_at_start) { 2980 not_at_start = (not_at_start || not_at_start_); 2981 int choice_count = alternatives_->length(); 2982 DCHECK(choice_count > 0); 2983 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2984 compiler, 2985 characters_filled_in, 2986 not_at_start); 2987 for (int i = 1; i < choice_count; i++) { 2988 QuickCheckDetails new_details(details->characters()); 2989 RegExpNode* node = alternatives_->at(i).node(); 2990 node->GetQuickCheckDetails(&new_details, compiler, 2991 characters_filled_in, 2992 not_at_start); 2993 // Here we merge the quick match details of the two branches. 2994 details->Merge(&new_details, characters_filled_in); 2995 } 2996} 2997 2998 2999// Check for [0-9A-Z_a-z]. 3000static void EmitWordCheck(RegExpMacroAssembler* assembler, 3001 Label* word, 3002 Label* non_word, 3003 bool fall_through_on_word) { 3004 if (assembler->CheckSpecialCharacterClass( 3005 fall_through_on_word ? 'w' : 'W', 3006 fall_through_on_word ? non_word : word)) { 3007 // Optimized implementation available. 3008 return; 3009 } 3010 assembler->CheckCharacterGT('z', non_word); 3011 assembler->CheckCharacterLT('0', non_word); 3012 assembler->CheckCharacterGT('a' - 1, word); 3013 assembler->CheckCharacterLT('9' + 1, word); 3014 assembler->CheckCharacterLT('A', non_word); 3015 assembler->CheckCharacterLT('Z' + 1, word); 3016 if (fall_through_on_word) { 3017 assembler->CheckNotCharacter('_', non_word); 3018 } else { 3019 assembler->CheckCharacter('_', word); 3020 } 3021} 3022 3023 3024// Emit the code to check for a ^ in multiline mode (1-character lookbehind 3025// that matches newline or the start of input). 3026static void EmitHat(RegExpCompiler* compiler, 3027 RegExpNode* on_success, 3028 Trace* trace) { 3029 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3030 // We will be loading the previous character into the current character 3031 // register. 3032 Trace new_trace(*trace); 3033 new_trace.InvalidateCurrentCharacter(); 3034 3035 Label ok; 3036 if (new_trace.cp_offset() == 0) { 3037 // The start of input counts as a newline in this context, so skip to 3038 // ok if we are at the start. 3039 assembler->CheckAtStart(&ok); 3040 } 3041 // We already checked that we are not at the start of input so it must be 3042 // OK to load the previous character. 3043 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, 3044 new_trace.backtrack(), 3045 false); 3046 if (!assembler->CheckSpecialCharacterClass('n', 3047 new_trace.backtrack())) { 3048 // Newline means \n, \r, 0x2028 or 0x2029. 3049 if (!compiler->one_byte()) { 3050 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); 3051 } 3052 assembler->CheckCharacter('\n', &ok); 3053 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3054 } 3055 assembler->Bind(&ok); 3056 on_success->Emit(compiler, &new_trace); 3057} 3058 3059 3060// Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3061void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3062 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3063 Isolate* isolate = assembler->isolate(); 3064 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3065 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3066 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3067 if (lookahead == NULL) { 3068 int eats_at_least = 3069 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3070 kRecursionBudget, 3071 not_at_start)); 3072 if (eats_at_least >= 1) { 3073 BoyerMooreLookahead* bm = 3074 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3075 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3076 if (bm->at(0)->is_non_word()) 3077 next_is_word_character = Trace::FALSE_VALUE; 3078 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3079 } 3080 } else { 3081 if (lookahead->at(0)->is_non_word()) 3082 next_is_word_character = Trace::FALSE_VALUE; 3083 if (lookahead->at(0)->is_word()) 3084 next_is_word_character = Trace::TRUE_VALUE; 3085 } 3086 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY); 3087 if (next_is_word_character == Trace::UNKNOWN) { 3088 Label before_non_word; 3089 Label before_word; 3090 if (trace->characters_preloaded() != 1) { 3091 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); 3092 } 3093 // Fall through on non-word. 3094 EmitWordCheck(assembler, &before_word, &before_non_word, false); 3095 // Next character is not a word character. 3096 assembler->Bind(&before_non_word); 3097 Label ok; 3098 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3099 assembler->GoTo(&ok); 3100 3101 assembler->Bind(&before_word); 3102 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3103 assembler->Bind(&ok); 3104 } else if (next_is_word_character == Trace::TRUE_VALUE) { 3105 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); 3106 } else { 3107 DCHECK(next_is_word_character == Trace::FALSE_VALUE); 3108 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); 3109 } 3110} 3111 3112 3113void AssertionNode::BacktrackIfPrevious( 3114 RegExpCompiler* compiler, 3115 Trace* trace, 3116 AssertionNode::IfPrevious backtrack_if_previous) { 3117 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3118 Trace new_trace(*trace); 3119 new_trace.InvalidateCurrentCharacter(); 3120 3121 Label fall_through, dummy; 3122 3123 Label* non_word = backtrack_if_previous == kIsNonWord ? 3124 new_trace.backtrack() : 3125 &fall_through; 3126 Label* word = backtrack_if_previous == kIsNonWord ? 3127 &fall_through : 3128 new_trace.backtrack(); 3129 3130 if (new_trace.cp_offset() == 0) { 3131 // The start of input counts as a non-word character, so the question is 3132 // decided if we are at the start. 3133 assembler->CheckAtStart(non_word); 3134 } 3135 // We already checked that we are not at the start of input so it must be 3136 // OK to load the previous character. 3137 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); 3138 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); 3139 3140 assembler->Bind(&fall_through); 3141 on_success()->Emit(compiler, &new_trace); 3142} 3143 3144 3145void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details, 3146 RegExpCompiler* compiler, 3147 int filled_in, 3148 bool not_at_start) { 3149 if (assertion_type_ == AT_START && not_at_start) { 3150 details->set_cannot_match(); 3151 return; 3152 } 3153 return on_success()->GetQuickCheckDetails(details, 3154 compiler, 3155 filled_in, 3156 not_at_start); 3157} 3158 3159 3160void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3161 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3162 switch (assertion_type_) { 3163 case AT_END: { 3164 Label ok; 3165 assembler->CheckPosition(trace->cp_offset(), &ok); 3166 assembler->GoTo(trace->backtrack()); 3167 assembler->Bind(&ok); 3168 break; 3169 } 3170 case AT_START: { 3171 if (trace->at_start() == Trace::FALSE_VALUE) { 3172 assembler->GoTo(trace->backtrack()); 3173 return; 3174 } 3175 if (trace->at_start() == Trace::UNKNOWN) { 3176 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack()); 3177 Trace at_start_trace = *trace; 3178 at_start_trace.set_at_start(Trace::TRUE_VALUE); 3179 on_success()->Emit(compiler, &at_start_trace); 3180 return; 3181 } 3182 } 3183 break; 3184 case AFTER_NEWLINE: 3185 EmitHat(compiler, on_success(), trace); 3186 return; 3187 case AT_BOUNDARY: 3188 case AT_NON_BOUNDARY: { 3189 EmitBoundaryCheck(compiler, trace); 3190 return; 3191 } 3192 } 3193 on_success()->Emit(compiler, trace); 3194} 3195 3196 3197static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { 3198 if (quick_check == NULL) return false; 3199 if (offset >= quick_check->characters()) return false; 3200 return quick_check->positions(offset)->determines_perfectly; 3201} 3202 3203 3204static void UpdateBoundsCheck(int index, int* checked_up_to) { 3205 if (index > *checked_up_to) { 3206 *checked_up_to = index; 3207 } 3208} 3209 3210 3211// We call this repeatedly to generate code for each pass over the text node. 3212// The passes are in increasing order of difficulty because we hope one 3213// of the first passes will fail in which case we are saved the work of the 3214// later passes. for example for the case independent regexp /%[asdfghjkl]a/ 3215// we will check the '%' in the first pass, the case independent 'a' in the 3216// second pass and the character class in the last pass. 3217// 3218// The passes are done from right to left, so for example to test for /bar/ 3219// we will first test for an 'r' with offset 2, then an 'a' with offset 1 3220// and then a 'b' with offset 0. This means we can avoid the end-of-input 3221// bounds check most of the time. In the example we only need to check for 3222// end-of-input when loading the putative 'r'. 3223// 3224// A slight complication involves the fact that the first character may already 3225// be fetched into a register by the previous node. In this case we want to 3226// do the test for that character first. We do this in separate passes. The 3227// 'preloaded' argument indicates that we are doing such a 'pass'. If such a 3228// pass has been performed then subsequent passes will have true in 3229// first_element_checked to indicate that that character does not need to be 3230// checked again. 3231// 3232// In addition to all this we are passed a Trace, which can 3233// contain an AlternativeGeneration object. In this AlternativeGeneration 3234// object we can see details of any quick check that was already passed in 3235// order to get to the code we are now generating. The quick check can involve 3236// loading characters, which means we do not need to recheck the bounds 3237// up to the limit the quick check already checked. In addition the quick 3238// check can have involved a mask and compare operation which may simplify 3239// or obviate the need for further checks at some character positions. 3240void TextNode::TextEmitPass(RegExpCompiler* compiler, 3241 TextEmitPassType pass, 3242 bool preloaded, 3243 Trace* trace, 3244 bool first_element_checked, 3245 int* checked_up_to) { 3246 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3247 Isolate* isolate = assembler->isolate(); 3248 bool one_byte = compiler->one_byte(); 3249 Label* backtrack = trace->backtrack(); 3250 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3251 int element_count = elements()->length(); 3252 int backward_offset = read_backward() ? -Length() : 0; 3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3254 TextElement elm = elements()->at(i); 3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3256 if (elm.text_type() == TextElement::ATOM) { 3257 Vector<const uc16> quarks = elm.atom()->data(); 3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3259 if (first_element_checked && i == 0 && j == 0) continue; 3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3261 EmitCharacterFunction* emit_function = NULL; 3262 switch (pass) { 3263 case NON_LATIN1_MATCH: 3264 DCHECK(one_byte); 3265 if (quarks[j] > String::kMaxOneByteCharCode) { 3266 assembler->GoTo(backtrack); 3267 return; 3268 } 3269 break; 3270 case NON_LETTER_CHARACTER_MATCH: 3271 emit_function = &EmitAtomNonLetter; 3272 break; 3273 case SIMPLE_CHARACTER_MATCH: 3274 emit_function = &EmitSimpleCharacter; 3275 break; 3276 case CASE_CHARACTER_MATCH: 3277 emit_function = &EmitAtomLetter; 3278 break; 3279 default: 3280 break; 3281 } 3282 if (emit_function != NULL) { 3283 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3284 bool bound_checked = 3285 emit_function(isolate, compiler, quarks[j], backtrack, 3286 cp_offset + j, bounds_check, preloaded); 3287 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3288 } 3289 } 3290 } else { 3291 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3292 if (pass == CHARACTER_CLASS_MATCH) { 3293 if (first_element_checked && i == 0) continue; 3294 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3295 RegExpCharacterClass* cc = elm.char_class(); 3296 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3297 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, 3298 bounds_check, preloaded, zone()); 3299 UpdateBoundsCheck(cp_offset, checked_up_to); 3300 } 3301 } 3302 } 3303} 3304 3305 3306int TextNode::Length() { 3307 TextElement elm = elements()->last(); 3308 DCHECK(elm.cp_offset() >= 0); 3309 return elm.cp_offset() + elm.length(); 3310} 3311 3312 3313bool TextNode::SkipPass(int int_pass, bool ignore_case) { 3314 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass); 3315 if (ignore_case) { 3316 return pass == SIMPLE_CHARACTER_MATCH; 3317 } else { 3318 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH; 3319 } 3320} 3321 3322 3323TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 3324 ZoneList<CharacterRange>* ranges, 3325 bool read_backward, 3326 RegExpNode* on_success) { 3327 DCHECK_NOT_NULL(ranges); 3328 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); 3329 elms->Add( 3330 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)), 3331 zone); 3332 return new (zone) TextNode(elms, read_backward, on_success); 3333} 3334 3335 3336TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, 3337 CharacterRange trail, 3338 bool read_backward, 3339 RegExpNode* on_success) { 3340 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 3341 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 3342 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); 3343 elms->Add(TextElement::CharClass( 3344 new (zone) RegExpCharacterClass(lead_ranges, false)), 3345 zone); 3346 elms->Add(TextElement::CharClass( 3347 new (zone) RegExpCharacterClass(trail_ranges, false)), 3348 zone); 3349 return new (zone) TextNode(elms, read_backward, on_success); 3350} 3351 3352 3353// This generates the code to match a text node. A text node can contain 3354// straight character sequences (possibly to be matched in a case-independent 3355// way) and character classes. For efficiency we do not do this in a single 3356// pass from left to right. Instead we pass over the text node several times, 3357// emitting code for some character positions every time. See the comment on 3358// TextEmitPass for details. 3359void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3360 LimitResult limit_result = LimitVersions(compiler, trace); 3361 if (limit_result == DONE) return; 3362 DCHECK(limit_result == CONTINUE); 3363 3364 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3365 compiler->SetRegExpTooBig(); 3366 return; 3367 } 3368 3369 if (compiler->one_byte()) { 3370 int dummy = 0; 3371 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3372 } 3373 3374 bool first_elt_done = false; 3375 int bound_checked_to = trace->cp_offset() - 1; 3376 bound_checked_to += trace->bound_checked_up_to(); 3377 3378 // If a character is preloaded into the current character register then 3379 // check that now. 3380 if (trace->characters_preloaded() == 1) { 3381 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3382 if (!SkipPass(pass, compiler->ignore_case())) { 3383 TextEmitPass(compiler, 3384 static_cast<TextEmitPassType>(pass), 3385 true, 3386 trace, 3387 false, 3388 &bound_checked_to); 3389 } 3390 } 3391 first_elt_done = true; 3392 } 3393 3394 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { 3395 if (!SkipPass(pass, compiler->ignore_case())) { 3396 TextEmitPass(compiler, 3397 static_cast<TextEmitPassType>(pass), 3398 false, 3399 trace, 3400 first_elt_done, 3401 &bound_checked_to); 3402 } 3403 } 3404 3405 Trace successor_trace(*trace); 3406 // If we advance backward, we may end up at the start. 3407 successor_trace.AdvanceCurrentPositionInTrace( 3408 read_backward() ? -Length() : Length(), compiler); 3409 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN 3410 : Trace::FALSE_VALUE); 3411 RecursionCheck rc(compiler); 3412 on_success()->Emit(compiler, &successor_trace); 3413} 3414 3415 3416void Trace::InvalidateCurrentCharacter() { 3417 characters_preloaded_ = 0; 3418} 3419 3420 3421void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { 3422 // We don't have an instruction for shifting the current character register 3423 // down or for using a shifted value for anything so lets just forget that 3424 // we preloaded any characters into it. 3425 characters_preloaded_ = 0; 3426 // Adjust the offsets of the quick check performed information. This 3427 // information is used to find out what we already determined about the 3428 // characters by means of mask and compare. 3429 quick_check_performed_.Advance(by, compiler->one_byte()); 3430 cp_offset_ += by; 3431 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 3432 compiler->SetRegExpTooBig(); 3433 cp_offset_ = 0; 3434 } 3435 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 3436} 3437 3438 3439void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { 3440 int element_count = elements()->length(); 3441 for (int i = 0; i < element_count; i++) { 3442 TextElement elm = elements()->at(i); 3443 if (elm.text_type() == TextElement::CHAR_CLASS) { 3444 RegExpCharacterClass* cc = elm.char_class(); 3445 // None of the standard character classes is different in the case 3446 // independent case and it slows us down if we don't know that. 3447 if (cc->is_standard(zone())) continue; 3448 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 3449 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); 3450 } 3451 } 3452} 3453 3454 3455int TextNode::GreedyLoopTextLength() { return Length(); } 3456 3457 3458RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( 3459 RegExpCompiler* compiler) { 3460 if (read_backward()) return NULL; 3461 if (elements()->length() != 1) return NULL; 3462 TextElement elm = elements()->at(0); 3463 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; 3464 RegExpCharacterClass* node = elm.char_class(); 3465 ZoneList<CharacterRange>* ranges = node->ranges(zone()); 3466 CharacterRange::Canonicalize(ranges); 3467 if (node->is_negated()) { 3468 return ranges->length() == 0 ? on_success() : NULL; 3469 } 3470 if (ranges->length() != 1) return NULL; 3471 uint32_t max_char; 3472 if (compiler->one_byte()) { 3473 max_char = String::kMaxOneByteCharCode; 3474 } else { 3475 max_char = String::kMaxUtf16CodeUnit; 3476 } 3477 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; 3478} 3479 3480 3481// Finds the fixed match length of a sequence of nodes that goes from 3482// this alternative and back to this choice node. If there are variable 3483// length nodes or other complications in the way then return a sentinel 3484// value indicating that a greedy loop cannot be constructed. 3485int ChoiceNode::GreedyLoopTextLengthForAlternative( 3486 GuardedAlternative* alternative) { 3487 int length = 0; 3488 RegExpNode* node = alternative->node(); 3489 // Later we will generate code for all these text nodes using recursion 3490 // so we have to limit the max number. 3491 int recursion_depth = 0; 3492 while (node != this) { 3493 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) { 3494 return kNodeIsTooComplexForGreedyLoops; 3495 } 3496 int node_length = node->GreedyLoopTextLength(); 3497 if (node_length == kNodeIsTooComplexForGreedyLoops) { 3498 return kNodeIsTooComplexForGreedyLoops; 3499 } 3500 length += node_length; 3501 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node); 3502 node = seq_node->on_success(); 3503 } 3504 return read_backward() ? -length : length; 3505} 3506 3507 3508void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) { 3509 DCHECK_NULL(loop_node_); 3510 AddAlternative(alt); 3511 loop_node_ = alt.node(); 3512} 3513 3514 3515void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) { 3516 DCHECK_NULL(continue_node_); 3517 AddAlternative(alt); 3518 continue_node_ = alt.node(); 3519} 3520 3521 3522void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3523 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 3524 if (trace->stop_node() == this) { 3525 // Back edge of greedy optimized loop node graph. 3526 int text_length = 3527 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0))); 3528 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops); 3529 // Update the counter-based backtracking info on the stack. This is an 3530 // optimization for greedy loops (see below). 3531 DCHECK(trace->cp_offset() == text_length); 3532 macro_assembler->AdvanceCurrentPosition(text_length); 3533 macro_assembler->GoTo(trace->loop_label()); 3534 return; 3535 } 3536 DCHECK_NULL(trace->stop_node()); 3537 if (!trace->is_trivial()) { 3538 trace->Flush(compiler, this); 3539 return; 3540 } 3541 ChoiceNode::Emit(compiler, trace); 3542} 3543 3544 3545int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3546 int eats_at_least) { 3547 int preload_characters = Min(4, eats_at_least); 3548 if (compiler->macro_assembler()->CanReadUnaligned()) { 3549 bool one_byte = compiler->one_byte(); 3550 if (one_byte) { 3551 if (preload_characters > 4) preload_characters = 4; 3552 // We can't preload 3 characters because there is no machine instruction 3553 // to do that. We can't just load 4 because we could be reading 3554 // beyond the end of the string, which could cause a memory fault. 3555 if (preload_characters == 3) preload_characters = 2; 3556 } else { 3557 if (preload_characters > 2) preload_characters = 2; 3558 } 3559 } else { 3560 if (preload_characters > 1) preload_characters = 1; 3561 } 3562 return preload_characters; 3563} 3564 3565 3566// This class is used when generating the alternatives in a choice node. It 3567// records the way the alternative is being code generated. 3568class AlternativeGeneration: public Malloced { 3569 public: 3570 AlternativeGeneration() 3571 : possible_success(), 3572 expects_preload(false), 3573 after(), 3574 quick_check_details() { } 3575 Label possible_success; 3576 bool expects_preload; 3577 Label after; 3578 QuickCheckDetails quick_check_details; 3579}; 3580 3581 3582// Creates a list of AlternativeGenerations. If the list has a reasonable 3583// size then it is on the stack, otherwise the excess is on the heap. 3584class AlternativeGenerationList { 3585 public: 3586 AlternativeGenerationList(int count, Zone* zone) 3587 : alt_gens_(count, zone) { 3588 for (int i = 0; i < count && i < kAFew; i++) { 3589 alt_gens_.Add(a_few_alt_gens_ + i, zone); 3590 } 3591 for (int i = kAFew; i < count; i++) { 3592 alt_gens_.Add(new AlternativeGeneration(), zone); 3593 } 3594 } 3595 ~AlternativeGenerationList() { 3596 for (int i = kAFew; i < alt_gens_.length(); i++) { 3597 delete alt_gens_[i]; 3598 alt_gens_[i] = NULL; 3599 } 3600 } 3601 3602 AlternativeGeneration* at(int i) { 3603 return alt_gens_[i]; 3604 } 3605 3606 private: 3607 static const int kAFew = 10; 3608 ZoneList<AlternativeGeneration*> alt_gens_; 3609 AlternativeGeneration a_few_alt_gens_[kAFew]; 3610}; 3611 3612 3613static const uc32 kRangeEndMarker = 0x110000; 3614 3615// The '2' variant is has inclusive from and exclusive to. 3616// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, 3617// which include WhiteSpace (7.2) or LineTerminator (7.3) values. 3618static const int kSpaceRanges[] = { 3619 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 3620 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 3621 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; 3622static const int kSpaceRangeCount = arraysize(kSpaceRanges); 3623 3624static const int kWordRanges[] = { 3625 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; 3626static const int kWordRangeCount = arraysize(kWordRanges); 3627static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; 3628static const int kDigitRangeCount = arraysize(kDigitRanges); 3629static const int kSurrogateRanges[] = { 3630 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker}; 3631static const int kSurrogateRangeCount = arraysize(kSurrogateRanges); 3632static const int kLineTerminatorRanges[] = { 3633 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; 3634static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); 3635 3636void BoyerMoorePositionInfo::Set(int character) { 3637 SetInterval(Interval(character, character)); 3638} 3639 3640 3641void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { 3642 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); 3643 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); 3644 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); 3645 surrogate_ = 3646 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); 3647 if (interval.to() - interval.from() >= kMapSize - 1) { 3648 if (map_count_ != kMapSize) { 3649 map_count_ = kMapSize; 3650 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3651 } 3652 return; 3653 } 3654 for (int i = interval.from(); i <= interval.to(); i++) { 3655 int mod_character = (i & kMask); 3656 if (!map_->at(mod_character)) { 3657 map_count_++; 3658 map_->at(mod_character) = true; 3659 } 3660 if (map_count_ == kMapSize) return; 3661 } 3662} 3663 3664 3665void BoyerMoorePositionInfo::SetAll() { 3666 s_ = w_ = d_ = kLatticeUnknown; 3667 if (map_count_ != kMapSize) { 3668 map_count_ = kMapSize; 3669 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; 3670 } 3671} 3672 3673 3674BoyerMooreLookahead::BoyerMooreLookahead( 3675 int length, RegExpCompiler* compiler, Zone* zone) 3676 : length_(length), 3677 compiler_(compiler) { 3678 if (compiler->one_byte()) { 3679 max_char_ = String::kMaxOneByteCharCode; 3680 } else { 3681 max_char_ = String::kMaxUtf16CodeUnit; 3682 } 3683 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); 3684 for (int i = 0; i < length; i++) { 3685 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); 3686 } 3687} 3688 3689 3690// Find the longest range of lookahead that has the fewest number of different 3691// characters that can occur at a given position. Since we are optimizing two 3692// different parameters at once this is a tradeoff. 3693bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { 3694 int biggest_points = 0; 3695 // If more than 32 characters out of 128 can occur it is unlikely that we can 3696 // be lucky enough to step forwards much of the time. 3697 const int kMaxMax = 32; 3698 for (int max_number_of_chars = 4; 3699 max_number_of_chars < kMaxMax; 3700 max_number_of_chars *= 2) { 3701 biggest_points = 3702 FindBestInterval(max_number_of_chars, biggest_points, from, to); 3703 } 3704 if (biggest_points == 0) return false; 3705 return true; 3706} 3707 3708 3709// Find the highest-points range between 0 and length_ where the character 3710// information is not too vague. 'Too vague' means that there are more than 3711// max_number_of_chars that can occur at this position. Calculates the number 3712// of points as the product of width-of-the-range and 3713// probability-of-finding-one-of-the-characters, where the probability is 3714// calculated using the frequency distribution of the sample subject string. 3715int BoyerMooreLookahead::FindBestInterval( 3716 int max_number_of_chars, int old_biggest_points, int* from, int* to) { 3717 int biggest_points = old_biggest_points; 3718 static const int kSize = RegExpMacroAssembler::kTableSize; 3719 for (int i = 0; i < length_; ) { 3720 while (i < length_ && Count(i) > max_number_of_chars) i++; 3721 if (i == length_) break; 3722 int remembered_from = i; 3723 bool union_map[kSize]; 3724 for (int j = 0; j < kSize; j++) union_map[j] = false; 3725 while (i < length_ && Count(i) <= max_number_of_chars) { 3726 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3727 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); 3728 i++; 3729 } 3730 int frequency = 0; 3731 for (int j = 0; j < kSize; j++) { 3732 if (union_map[j]) { 3733 // Add 1 to the frequency to give a small per-character boost for 3734 // the cases where our sampling is not good enough and many 3735 // characters have a frequency of zero. This means the frequency 3736 // can theoretically be up to 2*kSize though we treat it mostly as 3737 // a fraction of kSize. 3738 frequency += compiler_->frequency_collator()->Frequency(j) + 1; 3739 } 3740 } 3741 // We use the probability of skipping times the distance we are skipping to 3742 // judge the effectiveness of this. Actually we have a cut-off: By 3743 // dividing by 2 we switch off the skipping if the probability of skipping 3744 // is less than 50%. This is because the multibyte mask-and-compare 3745 // skipping in quickcheck is more likely to do well on this case. 3746 bool in_quickcheck_range = 3747 ((i - remembered_from < 4) || 3748 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); 3749 // Called 'probability' but it is only a rough estimate and can actually 3750 // be outside the 0-kSize range. 3751 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; 3752 int points = (i - remembered_from) * probability; 3753 if (points > biggest_points) { 3754 *from = remembered_from; 3755 *to = i - 1; 3756 biggest_points = points; 3757 } 3758 } 3759 return biggest_points; 3760} 3761 3762 3763// Take all the characters that will not prevent a successful match if they 3764// occur in the subject string in the range between min_lookahead and 3765// max_lookahead (inclusive) measured from the current position. If the 3766// character at max_lookahead offset is not one of these characters, then we 3767// can safely skip forwards by the number of characters in the range. 3768int BoyerMooreLookahead::GetSkipTable(int min_lookahead, 3769 int max_lookahead, 3770 Handle<ByteArray> boolean_skip_table) { 3771 const int kSize = RegExpMacroAssembler::kTableSize; 3772 3773 const int kSkipArrayEntry = 0; 3774 const int kDontSkipArrayEntry = 1; 3775 3776 for (int i = 0; i < kSize; i++) { 3777 boolean_skip_table->set(i, kSkipArrayEntry); 3778 } 3779 int skip = max_lookahead + 1 - min_lookahead; 3780 3781 for (int i = max_lookahead; i >= min_lookahead; i--) { 3782 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3783 for (int j = 0; j < kSize; j++) { 3784 if (map->at(j)) { 3785 boolean_skip_table->set(j, kDontSkipArrayEntry); 3786 } 3787 } 3788 } 3789 3790 return skip; 3791} 3792 3793 3794// See comment above on the implementation of GetSkipTable. 3795void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { 3796 const int kSize = RegExpMacroAssembler::kTableSize; 3797 3798 int min_lookahead = 0; 3799 int max_lookahead = 0; 3800 3801 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return; 3802 3803 bool found_single_character = false; 3804 int single_character = 0; 3805 for (int i = max_lookahead; i >= min_lookahead; i--) { 3806 BoyerMoorePositionInfo* map = bitmaps_->at(i); 3807 if (map->map_count() > 1 || 3808 (found_single_character && map->map_count() != 0)) { 3809 found_single_character = false; 3810 break; 3811 } 3812 for (int j = 0; j < kSize; j++) { 3813 if (map->at(j)) { 3814 found_single_character = true; 3815 single_character = j; 3816 break; 3817 } 3818 } 3819 } 3820 3821 int lookahead_width = max_lookahead + 1 - min_lookahead; 3822 3823 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { 3824 // The mask-compare can probably handle this better. 3825 return; 3826 } 3827 3828 if (found_single_character) { 3829 Label cont, again; 3830 masm->Bind(&again); 3831 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3832 if (max_char_ > kSize) { 3833 masm->CheckCharacterAfterAnd(single_character, 3834 RegExpMacroAssembler::kTableMask, 3835 &cont); 3836 } else { 3837 masm->CheckCharacter(single_character, &cont); 3838 } 3839 masm->AdvanceCurrentPosition(lookahead_width); 3840 masm->GoTo(&again); 3841 masm->Bind(&cont); 3842 return; 3843 } 3844 3845 Factory* factory = masm->isolate()->factory(); 3846 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED); 3847 int skip_distance = GetSkipTable( 3848 min_lookahead, max_lookahead, boolean_skip_table); 3849 DCHECK(skip_distance != 0); 3850 3851 Label cont, again; 3852 masm->Bind(&again); 3853 masm->LoadCurrentCharacter(max_lookahead, &cont, true); 3854 masm->CheckBitInTable(boolean_skip_table, &cont); 3855 masm->AdvanceCurrentPosition(skip_distance); 3856 masm->GoTo(&again); 3857 masm->Bind(&cont); 3858} 3859 3860 3861/* Code generation for choice nodes. 3862 * 3863 * We generate quick checks that do a mask and compare to eliminate a 3864 * choice. If the quick check succeeds then it jumps to the continuation to 3865 * do slow checks and check subsequent nodes. If it fails (the common case) 3866 * it falls through to the next choice. 3867 * 3868 * Here is the desired flow graph. Nodes directly below each other imply 3869 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3870 * 3 doesn't have a quick check so we have to call the slow check. 3871 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire 3872 * regexp continuation is generated directly after the Sn node, up to the 3873 * next GoTo if we decide to reuse some already generated code. Some 3874 * nodes expect preload_characters to be preloaded into the current 3875 * character register. R nodes do this preloading. Vertices are marked 3876 * F for failures and S for success (possible success in the case of quick 3877 * nodes). L, V, < and > are used as arrow heads. 3878 * 3879 * ----------> R 3880 * | 3881 * V 3882 * Q1 -----> S1 3883 * | S / 3884 * F| / 3885 * | F/ 3886 * | / 3887 * | R 3888 * | / 3889 * V L 3890 * Q2 -----> S2 3891 * | S / 3892 * F| / 3893 * | F/ 3894 * | / 3895 * | R 3896 * | / 3897 * V L 3898 * S3 3899 * | 3900 * F| 3901 * | 3902 * R 3903 * | 3904 * backtrack V 3905 * <----------Q4 3906 * \ F | 3907 * \ |S 3908 * \ F V 3909 * \-----S4 3910 * 3911 * For greedy loops we push the current position, then generate the code that 3912 * eats the input specially in EmitGreedyLoop. The other choice (the 3913 * continuation) is generated by the normal code in EmitChoices, and steps back 3914 * in the input to the starting position when it fails to match. The loop code 3915 * looks like this (U is the unwind code that steps back in the greedy loop). 3916 * 3917 * _____ 3918 * / \ 3919 * V | 3920 * ----------> S1 | 3921 * /| | 3922 * / |S | 3923 * F/ \_____/ 3924 * / 3925 * |<----- 3926 * | \ 3927 * V |S 3928 * Q2 ---> U----->backtrack 3929 * | F / 3930 * S| / 3931 * V F / 3932 * S2--/ 3933 */ 3934 3935GreedyLoopState::GreedyLoopState(bool not_at_start) { 3936 counter_backtrack_trace_.set_backtrack(&label_); 3937 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE); 3938} 3939 3940 3941void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) { 3942#ifdef DEBUG 3943 int choice_count = alternatives_->length(); 3944 for (int i = 0; i < choice_count - 1; i++) { 3945 GuardedAlternative alternative = alternatives_->at(i); 3946 ZoneList<Guard*>* guards = alternative.guards(); 3947 int guard_count = (guards == NULL) ? 0 : guards->length(); 3948 for (int j = 0; j < guard_count; j++) { 3949 DCHECK(!trace->mentions_reg(guards->at(j)->reg())); 3950 } 3951 } 3952#endif 3953} 3954 3955 3956void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, 3957 Trace* current_trace, 3958 PreloadState* state) { 3959 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { 3960 // Save some time by looking at most one machine word ahead. 3961 state->eats_at_least_ = 3962 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, 3963 current_trace->at_start() == Trace::FALSE_VALUE); 3964 } 3965 state->preload_characters_ = 3966 CalculatePreloadCharacters(compiler, state->eats_at_least_); 3967 3968 state->preload_is_current_ = 3969 (current_trace->characters_preloaded() == state->preload_characters_); 3970 state->preload_has_checked_bounds_ = state->preload_is_current_; 3971} 3972 3973 3974void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3975 int choice_count = alternatives_->length(); 3976 3977 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) { 3978 alternatives_->at(0).node()->Emit(compiler, trace); 3979 return; 3980 } 3981 3982 AssertGuardsMentionRegisters(trace); 3983 3984 LimitResult limit_result = LimitVersions(compiler, trace); 3985 if (limit_result == DONE) return; 3986 DCHECK(limit_result == CONTINUE); 3987 3988 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3989 // other choice nodes we only flush if we are out of code size budget. 3990 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3991 trace->Flush(compiler, this); 3992 return; 3993 } 3994 3995 RecursionCheck rc(compiler); 3996 3997 PreloadState preload; 3998 preload.init(); 3999 GreedyLoopState greedy_loop_state(not_at_start()); 4000 4001 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0)); 4002 AlternativeGenerationList alt_gens(choice_count, zone()); 4003 4004 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { 4005 trace = EmitGreedyLoop(compiler, 4006 trace, 4007 &alt_gens, 4008 &preload, 4009 &greedy_loop_state, 4010 text_length); 4011 } else { 4012 // TODO(erikcorry): Delete this. We don't need this label, but it makes us 4013 // match the traces produced pre-cleanup. 4014 Label second_choice; 4015 compiler->macro_assembler()->Bind(&second_choice); 4016 4017 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace); 4018 4019 EmitChoices(compiler, 4020 &alt_gens, 4021 0, 4022 trace, 4023 &preload); 4024 } 4025 4026 // At this point we need to generate slow checks for the alternatives where 4027 // the quick check was inlined. We can recognize these because the associated 4028 // label was bound. 4029 int new_flush_budget = trace->flush_budget() / choice_count; 4030 for (int i = 0; i < choice_count; i++) { 4031 AlternativeGeneration* alt_gen = alt_gens.at(i); 4032 Trace new_trace(*trace); 4033 // If there are actions to be flushed we have to limit how many times 4034 // they are flushed. Take the budget of the parent trace and distribute 4035 // it fairly amongst the children. 4036 if (new_trace.actions() != NULL) { 4037 new_trace.set_flush_budget(new_flush_budget); 4038 } 4039 bool next_expects_preload = 4040 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload; 4041 EmitOutOfLineContinuation(compiler, 4042 &new_trace, 4043 alternatives_->at(i), 4044 alt_gen, 4045 preload.preload_characters_, 4046 next_expects_preload); 4047 } 4048} 4049 4050 4051Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, 4052 Trace* trace, 4053 AlternativeGenerationList* alt_gens, 4054 PreloadState* preload, 4055 GreedyLoopState* greedy_loop_state, 4056 int text_length) { 4057 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4058 // Here we have special handling for greedy loops containing only text nodes 4059 // and other simple nodes. These are handled by pushing the current 4060 // position on the stack and then incrementing the current position each 4061 // time around the switch. On backtrack we decrement the current position 4062 // and check it against the pushed value. This avoids pushing backtrack 4063 // information for each iteration of the loop, which could take up a lot of 4064 // space. 4065 DCHECK(trace->stop_node() == NULL); 4066 macro_assembler->PushCurrentPosition(); 4067 Label greedy_match_failed; 4068 Trace greedy_match_trace; 4069 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE); 4070 greedy_match_trace.set_backtrack(&greedy_match_failed); 4071 Label loop_label; 4072 macro_assembler->Bind(&loop_label); 4073 greedy_match_trace.set_stop_node(this); 4074 greedy_match_trace.set_loop_label(&loop_label); 4075 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 4076 macro_assembler->Bind(&greedy_match_failed); 4077 4078 Label second_choice; // For use in greedy matches. 4079 macro_assembler->Bind(&second_choice); 4080 4081 Trace* new_trace = greedy_loop_state->counter_backtrack_trace(); 4082 4083 EmitChoices(compiler, 4084 alt_gens, 4085 1, 4086 new_trace, 4087 preload); 4088 4089 macro_assembler->Bind(greedy_loop_state->label()); 4090 // If we have unwound to the bottom then backtrack. 4091 macro_assembler->CheckGreedyLoop(trace->backtrack()); 4092 // Otherwise try the second priority at an earlier position. 4093 macro_assembler->AdvanceCurrentPosition(-text_length); 4094 macro_assembler->GoTo(&second_choice); 4095 return new_trace; 4096} 4097 4098int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, 4099 Trace* trace) { 4100 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized; 4101 if (alternatives_->length() != 2) return eats_at_least; 4102 4103 GuardedAlternative alt1 = alternatives_->at(1); 4104 if (alt1.guards() != NULL && alt1.guards()->length() != 0) { 4105 return eats_at_least; 4106 } 4107 RegExpNode* eats_anything_node = alt1.node(); 4108 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) { 4109 return eats_at_least; 4110 } 4111 4112 // Really we should be creating a new trace when we execute this function, 4113 // but there is no need, because the code it generates cannot backtrack, and 4114 // we always arrive here with a trivial trace (since it's the entry to a 4115 // loop. That also implies that there are no preloaded characters, which is 4116 // good, because it means we won't be violating any assumptions by 4117 // overwriting those characters with new load instructions. 4118 DCHECK(trace->is_trivial()); 4119 4120 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4121 Isolate* isolate = macro_assembler->isolate(); 4122 // At this point we know that we are at a non-greedy loop that will eat 4123 // any character one at a time. Any non-anchored regexp has such a 4124 // loop prepended to it in order to find where it starts. We look for 4125 // a pattern of the form ...abc... where we can look 6 characters ahead 4126 // and step forwards 3 if the character is not one of abc. Abc need 4127 // not be atoms, they can be any reasonably limited character class or 4128 // small alternation. 4129 BoyerMooreLookahead* bm = bm_info(false); 4130 if (bm == NULL) { 4131 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4132 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4133 kRecursionBudget, 4134 false)); 4135 if (eats_at_least >= 1) { 4136 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4137 compiler, 4138 zone()); 4139 GuardedAlternative alt0 = alternatives_->at(0); 4140 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4141 } 4142 } 4143 if (bm != NULL) { 4144 bm->EmitSkipInstructions(macro_assembler); 4145 } 4146 return eats_at_least; 4147} 4148 4149 4150void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4151 AlternativeGenerationList* alt_gens, 4152 int first_choice, 4153 Trace* trace, 4154 PreloadState* preload) { 4155 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4156 SetUpPreLoad(compiler, trace, preload); 4157 4158 // For now we just call all choices one after the other. The idea ultimately 4159 // is to use the Dispatch table to try only the relevant ones. 4160 int choice_count = alternatives_->length(); 4161 4162 int new_flush_budget = trace->flush_budget() / choice_count; 4163 4164 for (int i = first_choice; i < choice_count; i++) { 4165 bool is_last = i == choice_count - 1; 4166 bool fall_through_on_failure = !is_last; 4167 GuardedAlternative alternative = alternatives_->at(i); 4168 AlternativeGeneration* alt_gen = alt_gens->at(i); 4169 alt_gen->quick_check_details.set_characters(preload->preload_characters_); 4170 ZoneList<Guard*>* guards = alternative.guards(); 4171 int guard_count = (guards == NULL) ? 0 : guards->length(); 4172 Trace new_trace(*trace); 4173 new_trace.set_characters_preloaded(preload->preload_is_current_ ? 4174 preload->preload_characters_ : 4175 0); 4176 if (preload->preload_has_checked_bounds_) { 4177 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4178 } 4179 new_trace.quick_check_performed()->Clear(); 4180 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE); 4181 if (!is_last) { 4182 new_trace.set_backtrack(&alt_gen->after); 4183 } 4184 alt_gen->expects_preload = preload->preload_is_current_; 4185 bool generate_full_check_inline = false; 4186 if (compiler->optimize() && 4187 try_to_emit_quick_check_for_alternative(i == 0) && 4188 alternative.node()->EmitQuickCheck( 4189 compiler, trace, &new_trace, preload->preload_has_checked_bounds_, 4190 &alt_gen->possible_success, &alt_gen->quick_check_details, 4191 fall_through_on_failure)) { 4192 // Quick check was generated for this choice. 4193 preload->preload_is_current_ = true; 4194 preload->preload_has_checked_bounds_ = true; 4195 // If we generated the quick check to fall through on possible success, 4196 // we now need to generate the full check inline. 4197 if (!fall_through_on_failure) { 4198 macro_assembler->Bind(&alt_gen->possible_success); 4199 new_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4200 new_trace.set_characters_preloaded(preload->preload_characters_); 4201 new_trace.set_bound_checked_up_to(preload->preload_characters_); 4202 generate_full_check_inline = true; 4203 } 4204 } else if (alt_gen->quick_check_details.cannot_match()) { 4205 if (!fall_through_on_failure) { 4206 macro_assembler->GoTo(trace->backtrack()); 4207 } 4208 continue; 4209 } else { 4210 // No quick check was generated. Put the full code here. 4211 // If this is not the first choice then there could be slow checks from 4212 // previous cases that go here when they fail. There's no reason to 4213 // insist that they preload characters since the slow check we are about 4214 // to generate probably can't use it. 4215 if (i != first_choice) { 4216 alt_gen->expects_preload = false; 4217 new_trace.InvalidateCurrentCharacter(); 4218 } 4219 generate_full_check_inline = true; 4220 } 4221 if (generate_full_check_inline) { 4222 if (new_trace.actions() != NULL) { 4223 new_trace.set_flush_budget(new_flush_budget); 4224 } 4225 for (int j = 0; j < guard_count; j++) { 4226 GenerateGuard(macro_assembler, guards->at(j), &new_trace); 4227 } 4228 alternative.node()->Emit(compiler, &new_trace); 4229 preload->preload_is_current_ = false; 4230 } 4231 macro_assembler->Bind(&alt_gen->after); 4232 } 4233} 4234 4235 4236void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler, 4237 Trace* trace, 4238 GuardedAlternative alternative, 4239 AlternativeGeneration* alt_gen, 4240 int preload_characters, 4241 bool next_expects_preload) { 4242 if (!alt_gen->possible_success.is_linked()) return; 4243 4244 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4245 macro_assembler->Bind(&alt_gen->possible_success); 4246 Trace out_of_line_trace(*trace); 4247 out_of_line_trace.set_characters_preloaded(preload_characters); 4248 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details); 4249 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE); 4250 ZoneList<Guard*>* guards = alternative.guards(); 4251 int guard_count = (guards == NULL) ? 0 : guards->length(); 4252 if (next_expects_preload) { 4253 Label reload_current_char; 4254 out_of_line_trace.set_backtrack(&reload_current_char); 4255 for (int j = 0; j < guard_count; j++) { 4256 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4257 } 4258 alternative.node()->Emit(compiler, &out_of_line_trace); 4259 macro_assembler->Bind(&reload_current_char); 4260 // Reload the current character, since the next quick check expects that. 4261 // We don't need to check bounds here because we only get into this 4262 // code through a quick check which already did the checked load. 4263 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), 4264 NULL, 4265 false, 4266 preload_characters); 4267 macro_assembler->GoTo(&(alt_gen->after)); 4268 } else { 4269 out_of_line_trace.set_backtrack(&(alt_gen->after)); 4270 for (int j = 0; j < guard_count; j++) { 4271 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace); 4272 } 4273 alternative.node()->Emit(compiler, &out_of_line_trace); 4274 } 4275} 4276 4277 4278void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4279 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4280 LimitResult limit_result = LimitVersions(compiler, trace); 4281 if (limit_result == DONE) return; 4282 DCHECK(limit_result == CONTINUE); 4283 4284 RecursionCheck rc(compiler); 4285 4286 switch (action_type_) { 4287 case STORE_POSITION: { 4288 Trace::DeferredCapture 4289 new_capture(data_.u_position_register.reg, 4290 data_.u_position_register.is_capture, 4291 trace); 4292 Trace new_trace = *trace; 4293 new_trace.add_action(&new_capture); 4294 on_success()->Emit(compiler, &new_trace); 4295 break; 4296 } 4297 case INCREMENT_REGISTER: { 4298 Trace::DeferredIncrementRegister 4299 new_increment(data_.u_increment_register.reg); 4300 Trace new_trace = *trace; 4301 new_trace.add_action(&new_increment); 4302 on_success()->Emit(compiler, &new_trace); 4303 break; 4304 } 4305 case SET_REGISTER: { 4306 Trace::DeferredSetRegister 4307 new_set(data_.u_store_register.reg, data_.u_store_register.value); 4308 Trace new_trace = *trace; 4309 new_trace.add_action(&new_set); 4310 on_success()->Emit(compiler, &new_trace); 4311 break; 4312 } 4313 case CLEAR_CAPTURES: { 4314 Trace::DeferredClearCaptures 4315 new_capture(Interval(data_.u_clear_captures.range_from, 4316 data_.u_clear_captures.range_to)); 4317 Trace new_trace = *trace; 4318 new_trace.add_action(&new_capture); 4319 on_success()->Emit(compiler, &new_trace); 4320 break; 4321 } 4322 case BEGIN_SUBMATCH: 4323 if (!trace->is_trivial()) { 4324 trace->Flush(compiler, this); 4325 } else { 4326 assembler->WriteCurrentPositionToRegister( 4327 data_.u_submatch.current_position_register, 0); 4328 assembler->WriteStackPointerToRegister( 4329 data_.u_submatch.stack_pointer_register); 4330 on_success()->Emit(compiler, trace); 4331 } 4332 break; 4333 case EMPTY_MATCH_CHECK: { 4334 int start_pos_reg = data_.u_empty_match_check.start_register; 4335 int stored_pos = 0; 4336 int rep_reg = data_.u_empty_match_check.repetition_register; 4337 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister); 4338 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos); 4339 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) { 4340 // If we know we haven't advanced and there is no minimum we 4341 // can just backtrack immediately. 4342 assembler->GoTo(trace->backtrack()); 4343 } else if (know_dist && stored_pos < trace->cp_offset()) { 4344 // If we know we've advanced we can generate the continuation 4345 // immediately. 4346 on_success()->Emit(compiler, trace); 4347 } else if (!trace->is_trivial()) { 4348 trace->Flush(compiler, this); 4349 } else { 4350 Label skip_empty_check; 4351 // If we have a minimum number of repetitions we check the current 4352 // number first and skip the empty check if it's not enough. 4353 if (has_minimum) { 4354 int limit = data_.u_empty_match_check.repetition_limit; 4355 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check); 4356 } 4357 // If the match is empty we bail out, otherwise we fall through 4358 // to the on-success continuation. 4359 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register, 4360 trace->backtrack()); 4361 assembler->Bind(&skip_empty_check); 4362 on_success()->Emit(compiler, trace); 4363 } 4364 break; 4365 } 4366 case POSITIVE_SUBMATCH_SUCCESS: { 4367 if (!trace->is_trivial()) { 4368 trace->Flush(compiler, this); 4369 return; 4370 } 4371 assembler->ReadCurrentPositionFromRegister( 4372 data_.u_submatch.current_position_register); 4373 assembler->ReadStackPointerFromRegister( 4374 data_.u_submatch.stack_pointer_register); 4375 int clear_register_count = data_.u_submatch.clear_register_count; 4376 if (clear_register_count == 0) { 4377 on_success()->Emit(compiler, trace); 4378 return; 4379 } 4380 int clear_registers_from = data_.u_submatch.clear_register_from; 4381 Label clear_registers_backtrack; 4382 Trace new_trace = *trace; 4383 new_trace.set_backtrack(&clear_registers_backtrack); 4384 on_success()->Emit(compiler, &new_trace); 4385 4386 assembler->Bind(&clear_registers_backtrack); 4387 int clear_registers_to = clear_registers_from + clear_register_count - 1; 4388 assembler->ClearRegisters(clear_registers_from, clear_registers_to); 4389 4390 DCHECK(trace->backtrack() == NULL); 4391 assembler->Backtrack(); 4392 return; 4393 } 4394 default: 4395 UNREACHABLE(); 4396 } 4397} 4398 4399 4400void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 4401 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 4402 if (!trace->is_trivial()) { 4403 trace->Flush(compiler, this); 4404 return; 4405 } 4406 4407 LimitResult limit_result = LimitVersions(compiler, trace); 4408 if (limit_result == DONE) return; 4409 DCHECK(limit_result == CONTINUE); 4410 4411 RecursionCheck rc(compiler); 4412 4413 DCHECK_EQ(start_reg_ + 1, end_reg_); 4414 if (compiler->ignore_case()) { 4415 assembler->CheckNotBackReferenceIgnoreCase( 4416 start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); 4417 } else { 4418 assembler->CheckNotBackReference(start_reg_, read_backward(), 4419 trace->backtrack()); 4420 } 4421 // We are going to advance backward, so we may end up at the start. 4422 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4423 4424 // Check that the back reference does not end inside a surrogate pair. 4425 if (compiler->unicode() && !compiler->one_byte()) { 4426 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); 4427 } 4428 on_success()->Emit(compiler, trace); 4429} 4430 4431 4432// ------------------------------------------------------------------- 4433// Dot/dotty output 4434 4435 4436#ifdef DEBUG 4437 4438 4439class DotPrinter: public NodeVisitor { 4440 public: 4441 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT 4442 : os_(os), 4443 ignore_case_(ignore_case) {} 4444 void PrintNode(const char* label, RegExpNode* node); 4445 void Visit(RegExpNode* node); 4446 void PrintAttributes(RegExpNode* from); 4447 void PrintOnFailure(RegExpNode* from, RegExpNode* to); 4448#define DECLARE_VISIT(Type) \ 4449 virtual void Visit##Type(Type##Node* that); 4450FOR_EACH_NODE_TYPE(DECLARE_VISIT) 4451#undef DECLARE_VISIT 4452 private: 4453 std::ostream& os_; 4454 bool ignore_case_; 4455}; 4456 4457 4458void DotPrinter::PrintNode(const char* label, RegExpNode* node) { 4459 os_ << "digraph G {\n graph [label=\""; 4460 for (int i = 0; label[i]; i++) { 4461 switch (label[i]) { 4462 case '\\': 4463 os_ << "\\\\"; 4464 break; 4465 case '"': 4466 os_ << "\""; 4467 break; 4468 default: 4469 os_ << label[i]; 4470 break; 4471 } 4472 } 4473 os_ << "\"];\n"; 4474 Visit(node); 4475 os_ << "}" << std::endl; 4476} 4477 4478 4479void DotPrinter::Visit(RegExpNode* node) { 4480 if (node->info()->visited) return; 4481 node->info()->visited = true; 4482 node->Accept(this); 4483} 4484 4485 4486void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { 4487 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n"; 4488 Visit(on_failure); 4489} 4490 4491 4492class TableEntryBodyPrinter { 4493 public: 4494 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT 4495 : os_(os), 4496 choice_(choice) {} 4497 void Call(uc16 from, DispatchTable::Entry entry) { 4498 OutSet* out_set = entry.out_set(); 4499 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4500 if (out_set->Get(i)) { 4501 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n" 4502 << choice()->alternatives()->at(i).node() << ";\n"; 4503 } 4504 } 4505 } 4506 private: 4507 ChoiceNode* choice() { return choice_; } 4508 std::ostream& os_; 4509 ChoiceNode* choice_; 4510}; 4511 4512 4513class TableEntryHeaderPrinter { 4514 public: 4515 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT 4516 : first_(true), 4517 os_(os) {} 4518 void Call(uc16 from, DispatchTable::Entry entry) { 4519 if (first_) { 4520 first_ = false; 4521 } else { 4522 os_ << "|"; 4523 } 4524 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{"; 4525 OutSet* out_set = entry.out_set(); 4526 int priority = 0; 4527 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4528 if (out_set->Get(i)) { 4529 if (priority > 0) os_ << "|"; 4530 os_ << "<s" << from << "o" << i << "> " << priority; 4531 priority++; 4532 } 4533 } 4534 os_ << "}}"; 4535 } 4536 4537 private: 4538 bool first_; 4539 std::ostream& os_; 4540}; 4541 4542 4543class AttributePrinter { 4544 public: 4545 explicit AttributePrinter(std::ostream& os) // NOLINT 4546 : os_(os), 4547 first_(true) {} 4548 void PrintSeparator() { 4549 if (first_) { 4550 first_ = false; 4551 } else { 4552 os_ << "|"; 4553 } 4554 } 4555 void PrintBit(const char* name, bool value) { 4556 if (!value) return; 4557 PrintSeparator(); 4558 os_ << "{" << name << "}"; 4559 } 4560 void PrintPositive(const char* name, int value) { 4561 if (value < 0) return; 4562 PrintSeparator(); 4563 os_ << "{" << name << "|" << value << "}"; 4564 } 4565 4566 private: 4567 std::ostream& os_; 4568 bool first_; 4569}; 4570 4571 4572void DotPrinter::PrintAttributes(RegExpNode* that) { 4573 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, " 4574 << "margin=0.1, fontsize=10, label=\"{"; 4575 AttributePrinter printer(os_); 4576 NodeInfo* info = that->info(); 4577 printer.PrintBit("NI", info->follows_newline_interest); 4578 printer.PrintBit("WI", info->follows_word_interest); 4579 printer.PrintBit("SI", info->follows_start_interest); 4580 Label* label = that->label(); 4581 if (label->is_bound()) 4582 printer.PrintPositive("@", label->pos()); 4583 os_ << "}\"];\n" 4584 << " a" << that << " -> n" << that 4585 << " [style=dashed, color=grey, arrowhead=none];\n"; 4586} 4587 4588 4589static const bool kPrintDispatchTable = false; 4590void DotPrinter::VisitChoice(ChoiceNode* that) { 4591 if (kPrintDispatchTable) { 4592 os_ << " n" << that << " [shape=Mrecord, label=\""; 4593 TableEntryHeaderPrinter header_printer(os_); 4594 that->GetTable(ignore_case_)->ForEach(&header_printer); 4595 os_ << "\"]\n"; 4596 PrintAttributes(that); 4597 TableEntryBodyPrinter body_printer(os_, that); 4598 that->GetTable(ignore_case_)->ForEach(&body_printer); 4599 } else { 4600 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n"; 4601 for (int i = 0; i < that->alternatives()->length(); i++) { 4602 GuardedAlternative alt = that->alternatives()->at(i); 4603 os_ << " n" << that << " -> n" << alt.node(); 4604 } 4605 } 4606 for (int i = 0; i < that->alternatives()->length(); i++) { 4607 GuardedAlternative alt = that->alternatives()->at(i); 4608 alt.node()->Accept(this); 4609 } 4610} 4611 4612 4613void DotPrinter::VisitText(TextNode* that) { 4614 Zone* zone = that->zone(); 4615 os_ << " n" << that << " [label=\""; 4616 for (int i = 0; i < that->elements()->length(); i++) { 4617 if (i > 0) os_ << " "; 4618 TextElement elm = that->elements()->at(i); 4619 switch (elm.text_type()) { 4620 case TextElement::ATOM: { 4621 Vector<const uc16> data = elm.atom()->data(); 4622 for (int i = 0; i < data.length(); i++) { 4623 os_ << static_cast<char>(data[i]); 4624 } 4625 break; 4626 } 4627 case TextElement::CHAR_CLASS: { 4628 RegExpCharacterClass* node = elm.char_class(); 4629 os_ << "["; 4630 if (node->is_negated()) os_ << "^"; 4631 for (int j = 0; j < node->ranges(zone)->length(); j++) { 4632 CharacterRange range = node->ranges(zone)->at(j); 4633 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); 4634 } 4635 os_ << "]"; 4636 break; 4637 } 4638 default: 4639 UNREACHABLE(); 4640 } 4641 } 4642 os_ << "\", shape=box, peripheries=2];\n"; 4643 PrintAttributes(that); 4644 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4645 Visit(that->on_success()); 4646} 4647 4648 4649void DotPrinter::VisitBackReference(BackReferenceNode* that) { 4650 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$" 4651 << that->end_register() << "\", shape=doubleoctagon];\n"; 4652 PrintAttributes(that); 4653 os_ << " n" << that << " -> n" << that->on_success() << ";\n"; 4654 Visit(that->on_success()); 4655} 4656 4657 4658void DotPrinter::VisitEnd(EndNode* that) { 4659 os_ << " n" << that << " [style=bold, shape=point];\n"; 4660 PrintAttributes(that); 4661} 4662 4663 4664void DotPrinter::VisitAssertion(AssertionNode* that) { 4665 os_ << " n" << that << " ["; 4666 switch (that->assertion_type()) { 4667 case AssertionNode::AT_END: 4668 os_ << "label=\"$\", shape=septagon"; 4669 break; 4670 case AssertionNode::AT_START: 4671 os_ << "label=\"^\", shape=septagon"; 4672 break; 4673 case AssertionNode::AT_BOUNDARY: 4674 os_ << "label=\"\\b\", shape=septagon"; 4675 break; 4676 case AssertionNode::AT_NON_BOUNDARY: 4677 os_ << "label=\"\\B\", shape=septagon"; 4678 break; 4679 case AssertionNode::AFTER_NEWLINE: 4680 os_ << "label=\"(?<=\\n)\", shape=septagon"; 4681 break; 4682 } 4683 os_ << "];\n"; 4684 PrintAttributes(that); 4685 RegExpNode* successor = that->on_success(); 4686 os_ << " n" << that << " -> n" << successor << ";\n"; 4687 Visit(successor); 4688} 4689 4690 4691void DotPrinter::VisitAction(ActionNode* that) { 4692 os_ << " n" << that << " ["; 4693 switch (that->action_type_) { 4694 case ActionNode::SET_REGISTER: 4695 os_ << "label=\"$" << that->data_.u_store_register.reg 4696 << ":=" << that->data_.u_store_register.value << "\", shape=octagon"; 4697 break; 4698 case ActionNode::INCREMENT_REGISTER: 4699 os_ << "label=\"$" << that->data_.u_increment_register.reg 4700 << "++\", shape=octagon"; 4701 break; 4702 case ActionNode::STORE_POSITION: 4703 os_ << "label=\"$" << that->data_.u_position_register.reg 4704 << ":=$pos\", shape=octagon"; 4705 break; 4706 case ActionNode::BEGIN_SUBMATCH: 4707 os_ << "label=\"$" << that->data_.u_submatch.current_position_register 4708 << ":=$pos,begin\", shape=septagon"; 4709 break; 4710 case ActionNode::POSITIVE_SUBMATCH_SUCCESS: 4711 os_ << "label=\"escape\", shape=septagon"; 4712 break; 4713 case ActionNode::EMPTY_MATCH_CHECK: 4714 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register 4715 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register 4716 << "<" << that->data_.u_empty_match_check.repetition_limit 4717 << "?\", shape=septagon"; 4718 break; 4719 case ActionNode::CLEAR_CAPTURES: { 4720 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from 4721 << " to $" << that->data_.u_clear_captures.range_to 4722 << "\", shape=septagon"; 4723 break; 4724 } 4725 } 4726 os_ << "];\n"; 4727 PrintAttributes(that); 4728 RegExpNode* successor = that->on_success(); 4729 os_ << " n" << that << " -> n" << successor << ";\n"; 4730 Visit(successor); 4731} 4732 4733 4734class DispatchTableDumper { 4735 public: 4736 explicit DispatchTableDumper(std::ostream& os) : os_(os) {} 4737 void Call(uc16 key, DispatchTable::Entry entry); 4738 private: 4739 std::ostream& os_; 4740}; 4741 4742 4743void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { 4744 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {"; 4745 OutSet* set = entry.out_set(); 4746 bool first = true; 4747 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { 4748 if (set->Get(i)) { 4749 if (first) { 4750 first = false; 4751 } else { 4752 os_ << ", "; 4753 } 4754 os_ << i; 4755 } 4756 } 4757 os_ << "}\n"; 4758} 4759 4760 4761void DispatchTable::Dump() { 4762 OFStream os(stderr); 4763 DispatchTableDumper dumper(os); 4764 tree()->ForEach(&dumper); 4765} 4766 4767 4768void RegExpEngine::DotPrint(const char* label, 4769 RegExpNode* node, 4770 bool ignore_case) { 4771 OFStream os(stdout); 4772 DotPrinter printer(os, ignore_case); 4773 printer.PrintNode(label, node); 4774} 4775 4776 4777#endif // DEBUG 4778 4779 4780// ------------------------------------------------------------------- 4781// Tree to graph conversion 4782 4783RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 4784 RegExpNode* on_success) { 4785 ZoneList<TextElement>* elms = 4786 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone()); 4787 elms->Add(TextElement::Atom(this), compiler->zone()); 4788 return new (compiler->zone()) 4789 TextNode(elms, compiler->read_backward(), on_success); 4790} 4791 4792 4793RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 4794 RegExpNode* on_success) { 4795 return new (compiler->zone()) 4796 TextNode(elements(), compiler->read_backward(), on_success); 4797} 4798 4799 4800static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, 4801 const int* special_class, 4802 int length) { 4803 length--; // Remove final marker. 4804 DCHECK(special_class[length] == kRangeEndMarker); 4805 DCHECK(ranges->length() != 0); 4806 DCHECK(length != 0); 4807 DCHECK(special_class[0] != 0); 4808 if (ranges->length() != (length >> 1) + 1) { 4809 return false; 4810 } 4811 CharacterRange range = ranges->at(0); 4812 if (range.from() != 0) { 4813 return false; 4814 } 4815 for (int i = 0; i < length; i += 2) { 4816 if (special_class[i] != (range.to() + 1)) { 4817 return false; 4818 } 4819 range = ranges->at((i >> 1) + 1); 4820 if (special_class[i+1] != range.from()) { 4821 return false; 4822 } 4823 } 4824 if (range.to() != String::kMaxCodePoint) { 4825 return false; 4826 } 4827 return true; 4828} 4829 4830 4831static bool CompareRanges(ZoneList<CharacterRange>* ranges, 4832 const int* special_class, 4833 int length) { 4834 length--; // Remove final marker. 4835 DCHECK(special_class[length] == kRangeEndMarker); 4836 if (ranges->length() * 2 != length) { 4837 return false; 4838 } 4839 for (int i = 0; i < length; i += 2) { 4840 CharacterRange range = ranges->at(i >> 1); 4841 if (range.from() != special_class[i] || 4842 range.to() != special_class[i + 1] - 1) { 4843 return false; 4844 } 4845 } 4846 return true; 4847} 4848 4849 4850bool RegExpCharacterClass::is_standard(Zone* zone) { 4851 // TODO(lrn): Remove need for this function, by not throwing away information 4852 // along the way. 4853 if (is_negated_) { 4854 return false; 4855 } 4856 if (set_.is_standard()) { 4857 return true; 4858 } 4859 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4860 set_.set_standard_set_type('s'); 4861 return true; 4862 } 4863 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4864 set_.set_standard_set_type('S'); 4865 return true; 4866 } 4867 if (CompareInverseRanges(set_.ranges(zone), 4868 kLineTerminatorRanges, 4869 kLineTerminatorRangeCount)) { 4870 set_.set_standard_set_type('.'); 4871 return true; 4872 } 4873 if (CompareRanges(set_.ranges(zone), 4874 kLineTerminatorRanges, 4875 kLineTerminatorRangeCount)) { 4876 set_.set_standard_set_type('n'); 4877 return true; 4878 } 4879 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4880 set_.set_standard_set_type('w'); 4881 return true; 4882 } 4883 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4884 set_.set_standard_set_type('W'); 4885 return true; 4886 } 4887 return false; 4888} 4889 4890 4891UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, 4892 ZoneList<CharacterRange>* base) 4893 : zone_(zone), 4894 table_(zone), 4895 bmp_(nullptr), 4896 lead_surrogates_(nullptr), 4897 trail_surrogates_(nullptr), 4898 non_bmp_(nullptr) { 4899 // The unicode range splitter categorizes given character ranges into: 4900 // - Code points from the BMP representable by one code unit. 4901 // - Code points outside the BMP that need to be split into surrogate pairs. 4902 // - Lone lead surrogates. 4903 // - Lone trail surrogates. 4904 // Lone surrogates are valid code points, even though no actual characters. 4905 // They require special matching to make sure we do not split surrogate pairs. 4906 // We use the dispatch table to accomplish this. The base range is split up 4907 // by the table by the overlay ranges, and the Call callback is used to 4908 // filter and collect ranges for each category. 4909 for (int i = 0; i < base->length(); i++) { 4910 table_.AddRange(base->at(i), kBase, zone_); 4911 } 4912 // Add overlay ranges. 4913 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1), 4914 kBmpCodePoints, zone_); 4915 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), 4916 kLeadSurrogates, zone_); 4917 table_.AddRange( 4918 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 4919 kTrailSurrogates, zone_); 4920 table_.AddRange( 4921 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1), 4922 kBmpCodePoints, zone_); 4923 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd), 4924 kNonBmpCodePoints, zone_); 4925 table_.ForEach(this); 4926} 4927 4928 4929void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) { 4930 OutSet* outset = entry.out_set(); 4931 if (!outset->Get(kBase)) return; 4932 ZoneList<CharacterRange>** target = NULL; 4933 if (outset->Get(kBmpCodePoints)) { 4934 target = &bmp_; 4935 } else if (outset->Get(kLeadSurrogates)) { 4936 target = &lead_surrogates_; 4937 } else if (outset->Get(kTrailSurrogates)) { 4938 target = &trail_surrogates_; 4939 } else { 4940 DCHECK(outset->Get(kNonBmpCodePoints)); 4941 target = &non_bmp_; 4942 } 4943 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_); 4944 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_); 4945} 4946 4947 4948void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, 4949 RegExpNode* on_success, UnicodeRangeSplitter* splitter) { 4950 ZoneList<CharacterRange>* bmp = splitter->bmp(); 4951 if (bmp == nullptr) return; 4952 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( 4953 compiler->zone(), bmp, compiler->read_backward(), on_success))); 4954} 4955 4956 4957void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, 4958 RegExpNode* on_success, 4959 UnicodeRangeSplitter* splitter) { 4960 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp(); 4961 if (non_bmp == nullptr) return; 4962 DCHECK(compiler->unicode()); 4963 DCHECK(!compiler->one_byte()); 4964 Zone* zone = compiler->zone(); 4965 CharacterRange::Canonicalize(non_bmp); 4966 for (int i = 0; i < non_bmp->length(); i++) { 4967 // Match surrogate pair. 4968 // E.g. [\u10005-\u11005] becomes 4969 // \ud800[\udc05-\udfff]| 4970 // [\ud801-\ud803][\udc00-\udfff]| 4971 // \ud804[\udc00-\udc05] 4972 uc32 from = non_bmp->at(i).from(); 4973 uc32 to = non_bmp->at(i).to(); 4974 uc16 from_l = unibrow::Utf16::LeadSurrogate(from); 4975 uc16 from_t = unibrow::Utf16::TrailSurrogate(from); 4976 uc16 to_l = unibrow::Utf16::LeadSurrogate(to); 4977 uc16 to_t = unibrow::Utf16::TrailSurrogate(to); 4978 if (from_l == to_l) { 4979 // The lead surrogate is the same. 4980 result->AddAlternative( 4981 GuardedAlternative(TextNode::CreateForSurrogatePair( 4982 zone, CharacterRange::Singleton(from_l), 4983 CharacterRange::Range(from_t, to_t), compiler->read_backward(), 4984 on_success))); 4985 } else { 4986 if (from_t != kTrailSurrogateStart) { 4987 // Add [from_l][from_t-\udfff] 4988 result->AddAlternative( 4989 GuardedAlternative(TextNode::CreateForSurrogatePair( 4990 zone, CharacterRange::Singleton(from_l), 4991 CharacterRange::Range(from_t, kTrailSurrogateEnd), 4992 compiler->read_backward(), on_success))); 4993 from_l++; 4994 } 4995 if (to_t != kTrailSurrogateEnd) { 4996 // Add [to_l][\udc00-to_t] 4997 result->AddAlternative( 4998 GuardedAlternative(TextNode::CreateForSurrogatePair( 4999 zone, CharacterRange::Singleton(to_l), 5000 CharacterRange::Range(kTrailSurrogateStart, to_t), 5001 compiler->read_backward(), on_success))); 5002 to_l--; 5003 } 5004 if (from_l <= to_l) { 5005 // Add [from_l-to_l][\udc00-\udfff] 5006 result->AddAlternative( 5007 GuardedAlternative(TextNode::CreateForSurrogatePair( 5008 zone, CharacterRange::Range(from_l, to_l), 5009 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), 5010 compiler->read_backward(), on_success))); 5011 } 5012 } 5013 } 5014} 5015 5016 5017RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( 5018 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind, 5019 ZoneList<CharacterRange>* match, RegExpNode* on_success, 5020 bool read_backward) { 5021 Zone* zone = compiler->zone(); 5022 RegExpNode* match_node = TextNode::CreateForCharacterRanges( 5023 zone, match, read_backward, on_success); 5024 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5025 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5026 RegExpLookaround::Builder lookaround(false, match_node, stack_register, 5027 position_register); 5028 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5029 zone, lookbehind, !read_backward, lookaround.on_match_success()); 5030 return lookaround.ForMatch(negative_match); 5031} 5032 5033 5034RegExpNode* MatchAndNegativeLookaroundInReadDirection( 5035 RegExpCompiler* compiler, ZoneList<CharacterRange>* match, 5036 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success, 5037 bool read_backward) { 5038 Zone* zone = compiler->zone(); 5039 int stack_register = compiler->UnicodeLookaroundStackRegister(); 5040 int position_register = compiler->UnicodeLookaroundPositionRegister(); 5041 RegExpLookaround::Builder lookaround(false, on_success, stack_register, 5042 position_register); 5043 RegExpNode* negative_match = TextNode::CreateForCharacterRanges( 5044 zone, lookahead, read_backward, lookaround.on_match_success()); 5045 return TextNode::CreateForCharacterRanges( 5046 zone, match, read_backward, lookaround.ForMatch(negative_match)); 5047} 5048 5049 5050void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5051 RegExpNode* on_success, 5052 UnicodeRangeSplitter* splitter) { 5053 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 5054 if (lead_surrogates == nullptr) return; 5055 Zone* zone = compiler->zone(); 5056 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 5057 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 5058 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 5059 5060 RegExpNode* match; 5061 if (compiler->read_backward()) { 5062 // Reading backward. Assert that reading forward, there is no trail 5063 // surrogate, and then backward match the lead surrogate. 5064 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5065 compiler, trail_surrogates, lead_surrogates, on_success, true); 5066 } else { 5067 // Reading forward. Forward match the lead surrogate and assert that 5068 // no trail surrogate follows. 5069 match = MatchAndNegativeLookaroundInReadDirection( 5070 compiler, lead_surrogates, trail_surrogates, on_success, false); 5071 } 5072 result->AddAlternative(GuardedAlternative(match)); 5073} 5074 5075 5076void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5077 RegExpNode* on_success, 5078 UnicodeRangeSplitter* splitter) { 5079 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5080 if (trail_surrogates == nullptr) return; 5081 Zone* zone = compiler->zone(); 5082 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5083 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 5084 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 5085 5086 RegExpNode* match; 5087 if (compiler->read_backward()) { 5088 // Reading backward. Backward match the trail surrogate and assert that no 5089 // lead surrogate precedes it. 5090 match = MatchAndNegativeLookaroundInReadDirection( 5091 compiler, trail_surrogates, lead_surrogates, on_success, true); 5092 } else { 5093 // Reading forward. Assert that reading backward, there is no lead 5094 // surrogate, and then forward match the trail surrogate. 5095 match = NegativeLookaroundAgainstReadDirectionAndMatch( 5096 compiler, lead_surrogates, trail_surrogates, on_success, false); 5097 } 5098 result->AddAlternative(GuardedAlternative(match)); 5099} 5100 5101RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, 5102 RegExpNode* on_success) { 5103 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. 5104 DCHECK(!compiler->read_backward()); 5105 Zone* zone = compiler->zone(); 5106 // Advance any character. If the character happens to be a lead surrogate and 5107 // we advanced into the middle of a surrogate pair, it will work out, as 5108 // nothing will match from there. We will have to advance again, consuming 5109 // the associated trail surrogate. 5110 ZoneList<CharacterRange>* range = CharacterRange::List( 5111 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); 5112 return TextNode::CreateForCharacterRanges(zone, range, false, on_success); 5113} 5114 5115 5116void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, 5117 ZoneList<CharacterRange>* ranges) { 5118#ifdef V8_I18N_SUPPORT 5119 // Use ICU to compute the case fold closure over the ranges. 5120 DCHECK(compiler->unicode()); 5121 DCHECK(compiler->ignore_case()); 5122 USet* set = uset_openEmpty(); 5123 for (int i = 0; i < ranges->length(); i++) { 5124 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to()); 5125 } 5126 ranges->Clear(); 5127 uset_closeOver(set, USET_CASE_INSENSITIVE); 5128 // Full case mapping map single characters to multiple characters. 5129 // Those are represented as strings in the set. Remove them so that 5130 // we end up with only simple and common case mappings. 5131 uset_removeAllStrings(set); 5132 int item_count = uset_getItemCount(set); 5133 int item_result = 0; 5134 UErrorCode ec = U_ZERO_ERROR; 5135 Zone* zone = compiler->zone(); 5136 for (int i = 0; i < item_count; i++) { 5137 uc32 start = 0; 5138 uc32 end = 0; 5139 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); 5140 ranges->Add(CharacterRange::Range(start, end), zone); 5141 } 5142 // No errors and everything we collected have been ranges. 5143 DCHECK_EQ(U_ZERO_ERROR, ec); 5144 DCHECK_EQ(0, item_result); 5145 uset_close(set); 5146#else 5147 // Fallback if ICU is not included. 5148 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), 5149 ranges, compiler->one_byte()); 5150#endif // V8_I18N_SUPPORT 5151 CharacterRange::Canonicalize(ranges); 5152} 5153 5154 5155RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5156 RegExpNode* on_success) { 5157 set_.Canonicalize(); 5158 Zone* zone = compiler->zone(); 5159 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5160 if (compiler->unicode() && compiler->ignore_case()) { 5161 AddUnicodeCaseEquivalents(compiler, ranges); 5162 } 5163 if (compiler->unicode() && !compiler->one_byte()) { 5164 if (is_negated()) { 5165 ZoneList<CharacterRange>* negated = 5166 new (zone) ZoneList<CharacterRange>(2, zone); 5167 CharacterRange::Negate(ranges, negated, zone); 5168 ranges = negated; 5169 } 5170 if (ranges->length() == 0) { 5171 ranges->Add(CharacterRange::Everything(), zone); 5172 RegExpCharacterClass* fail = 5173 new (zone) RegExpCharacterClass(ranges, true); 5174 return new (zone) TextNode(fail, compiler->read_backward(), on_success); 5175 } 5176 if (standard_type() == '*') { 5177 return UnanchoredAdvance(compiler, on_success); 5178 } else { 5179 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5180 UnicodeRangeSplitter splitter(zone, ranges); 5181 AddBmpCharacters(compiler, result, on_success, &splitter); 5182 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5183 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5184 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5185 return result; 5186 } 5187 } else { 5188 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5189 } 5190} 5191 5192 5193int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5194 RegExpAtom* atom1 = (*a)->AsAtom(); 5195 RegExpAtom* atom2 = (*b)->AsAtom(); 5196 uc16 character1 = atom1->data().at(0); 5197 uc16 character2 = atom2->data().at(0); 5198 if (character1 < character2) return -1; 5199 if (character1 > character2) return 1; 5200 return 0; 5201} 5202 5203 5204static unibrow::uchar Canonical( 5205 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5206 unibrow::uchar c) { 5207 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; 5208 int length = canonicalize->get(c, '\0', chars); 5209 DCHECK_LE(length, 1); 5210 unibrow::uchar canonical = c; 5211 if (length == 1) canonical = chars[0]; 5212 return canonical; 5213} 5214 5215 5216int CompareFirstCharCaseIndependent( 5217 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, 5218 RegExpTree* const* a, RegExpTree* const* b) { 5219 RegExpAtom* atom1 = (*a)->AsAtom(); 5220 RegExpAtom* atom2 = (*b)->AsAtom(); 5221 unibrow::uchar character1 = atom1->data().at(0); 5222 unibrow::uchar character2 = atom2->data().at(0); 5223 if (character1 == character2) return 0; 5224 if (character1 >= 'a' || character2 >= 'a') { 5225 character1 = Canonical(canonicalize, character1); 5226 character2 = Canonical(canonicalize, character2); 5227 } 5228 return static_cast<int>(character1) - static_cast<int>(character2); 5229} 5230 5231 5232// We can stable sort runs of atoms, since the order does not matter if they 5233// start with different characters. 5234// Returns true if any consecutive atoms were found. 5235bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 5236 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5237 int length = alternatives->length(); 5238 bool found_consecutive_atoms = false; 5239 for (int i = 0; i < length; i++) { 5240 while (i < length) { 5241 RegExpTree* alternative = alternatives->at(i); 5242 if (alternative->IsAtom()) break; 5243 i++; 5244 } 5245 // i is length or it is the index of an atom. 5246 if (i == length) break; 5247 int first_atom = i; 5248 i++; 5249 while (i < length) { 5250 RegExpTree* alternative = alternatives->at(i); 5251 if (!alternative->IsAtom()) break; 5252 i++; 5253 } 5254 // Sort atoms to get ones with common prefixes together. 5255 // This step is more tricky if we are in a case-independent regexp, 5256 // because it would change /is|I/ to /I|is/, and order matters when 5257 // the regexp parts don't match only disjoint starting points. To fix 5258 // this we have a version of CompareFirstChar that uses case- 5259 // independent character classes for comparison. 5260 DCHECK_LT(first_atom, alternatives->length()); 5261 DCHECK_LE(i, alternatives->length()); 5262 DCHECK_LE(first_atom, i); 5263 if (compiler->ignore_case()) { 5264 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5265 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5266 auto compare_closure = 5267 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { 5268 return CompareFirstCharCaseIndependent(canonicalize, a, b); 5269 }; 5270 alternatives->StableSort(compare_closure, first_atom, i - first_atom); 5271 } else { 5272 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 5273 } 5274 if (i - first_atom > 1) found_consecutive_atoms = true; 5275 } 5276 return found_consecutive_atoms; 5277} 5278 5279 5280// Optimizes ab|ac|az to a(?:b|c|d). 5281void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 5282 Zone* zone = compiler->zone(); 5283 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5284 int length = alternatives->length(); 5285 5286 int write_posn = 0; 5287 int i = 0; 5288 while (i < length) { 5289 RegExpTree* alternative = alternatives->at(i); 5290 if (!alternative->IsAtom()) { 5291 alternatives->at(write_posn++) = alternatives->at(i); 5292 i++; 5293 continue; 5294 } 5295 RegExpAtom* atom = alternative->AsAtom(); 5296 unibrow::uchar common_prefix = atom->data().at(0); 5297 int first_with_prefix = i; 5298 int prefix_length = atom->length(); 5299 i++; 5300 while (i < length) { 5301 alternative = alternatives->at(i); 5302 if (!alternative->IsAtom()) break; 5303 atom = alternative->AsAtom(); 5304 unibrow::uchar new_prefix = atom->data().at(0); 5305 if (new_prefix != common_prefix) { 5306 if (!compiler->ignore_case()) break; 5307 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = 5308 compiler->isolate()->regexp_macro_assembler_canonicalize(); 5309 new_prefix = Canonical(canonicalize, new_prefix); 5310 common_prefix = Canonical(canonicalize, common_prefix); 5311 if (new_prefix != common_prefix) break; 5312 } 5313 prefix_length = Min(prefix_length, atom->length()); 5314 i++; 5315 } 5316 if (i > first_with_prefix + 2) { 5317 // Found worthwhile run of alternatives with common prefix of at least one 5318 // character. The sorting function above did not sort on more than one 5319 // character for reasons of correctness, but there may still be a longer 5320 // common prefix if the terms were similar or presorted in the input. 5321 // Find out how long the common prefix is. 5322 int run_length = i - first_with_prefix; 5323 atom = alternatives->at(first_with_prefix)->AsAtom(); 5324 for (int j = 1; j < run_length && prefix_length > 1; j++) { 5325 RegExpAtom* old_atom = 5326 alternatives->at(j + first_with_prefix)->AsAtom(); 5327 for (int k = 1; k < prefix_length; k++) { 5328 if (atom->data().at(k) != old_atom->data().at(k)) { 5329 prefix_length = k; 5330 break; 5331 } 5332 } 5333 } 5334 RegExpAtom* prefix = 5335 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 5336 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 5337 pair->Add(prefix, zone); 5338 ZoneList<RegExpTree*>* suffixes = 5339 new (zone) ZoneList<RegExpTree*>(run_length, zone); 5340 for (int j = 0; j < run_length; j++) { 5341 RegExpAtom* old_atom = 5342 alternatives->at(j + first_with_prefix)->AsAtom(); 5343 int len = old_atom->length(); 5344 if (len == prefix_length) { 5345 suffixes->Add(new (zone) RegExpEmpty(), zone); 5346 } else { 5347 RegExpTree* suffix = new (zone) RegExpAtom( 5348 old_atom->data().SubVector(prefix_length, old_atom->length())); 5349 suffixes->Add(suffix, zone); 5350 } 5351 } 5352 pair->Add(new (zone) RegExpDisjunction(suffixes), zone); 5353 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); 5354 } else { 5355 // Just copy any non-worthwhile alternatives. 5356 for (int j = first_with_prefix; j < i; j++) { 5357 alternatives->at(write_posn++) = alternatives->at(j); 5358 } 5359 } 5360 } 5361 alternatives->Rewind(write_posn); // Trim end of array. 5362} 5363 5364 5365// Optimizes b|c|z to [bcz]. 5366void RegExpDisjunction::FixSingleCharacterDisjunctions( 5367 RegExpCompiler* compiler) { 5368 Zone* zone = compiler->zone(); 5369 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5370 int length = alternatives->length(); 5371 5372 int write_posn = 0; 5373 int i = 0; 5374 while (i < length) { 5375 RegExpTree* alternative = alternatives->at(i); 5376 if (!alternative->IsAtom()) { 5377 alternatives->at(write_posn++) = alternatives->at(i); 5378 i++; 5379 continue; 5380 } 5381 RegExpAtom* atom = alternative->AsAtom(); 5382 if (atom->length() != 1) { 5383 alternatives->at(write_posn++) = alternatives->at(i); 5384 i++; 5385 continue; 5386 } 5387 int first_in_run = i; 5388 i++; 5389 while (i < length) { 5390 alternative = alternatives->at(i); 5391 if (!alternative->IsAtom()) break; 5392 atom = alternative->AsAtom(); 5393 if (atom->length() != 1) break; 5394 i++; 5395 } 5396 if (i > first_in_run + 1) { 5397 // Found non-trivial run of single-character alternatives. 5398 int run_length = i - first_in_run; 5399 ZoneList<CharacterRange>* ranges = 5400 new (zone) ZoneList<CharacterRange>(2, zone); 5401 for (int j = 0; j < run_length; j++) { 5402 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5403 DCHECK_EQ(old_atom->length(), 1); 5404 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5405 } 5406 alternatives->at(write_posn++) = 5407 new (zone) RegExpCharacterClass(ranges, false); 5408 } else { 5409 // Just copy any trivial alternatives. 5410 for (int j = first_in_run; j < i; j++) { 5411 alternatives->at(write_posn++) = alternatives->at(j); 5412 } 5413 } 5414 } 5415 alternatives->Rewind(write_posn); // Trim end of array. 5416} 5417 5418 5419RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 5420 RegExpNode* on_success) { 5421 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5422 5423 if (alternatives->length() > 2) { 5424 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); 5425 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); 5426 FixSingleCharacterDisjunctions(compiler); 5427 if (alternatives->length() == 1) { 5428 return alternatives->at(0)->ToNode(compiler, on_success); 5429 } 5430 } 5431 5432 int length = alternatives->length(); 5433 5434 ChoiceNode* result = 5435 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5436 for (int i = 0; i < length; i++) { 5437 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5438 on_success)); 5439 result->AddAlternative(alternative); 5440 } 5441 return result; 5442} 5443 5444 5445RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, 5446 RegExpNode* on_success) { 5447 return ToNode(min(), 5448 max(), 5449 is_greedy(), 5450 body(), 5451 compiler, 5452 on_success); 5453} 5454 5455 5456// Scoped object to keep track of how much we unroll quantifier loops in the 5457// regexp graph generator. 5458class RegExpExpansionLimiter { 5459 public: 5460 static const int kMaxExpansionFactor = 6; 5461 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor) 5462 : compiler_(compiler), 5463 saved_expansion_factor_(compiler->current_expansion_factor()), 5464 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) { 5465 DCHECK(factor > 0); 5466 if (ok_to_expand_) { 5467 if (factor > kMaxExpansionFactor) { 5468 // Avoid integer overflow of the current expansion factor. 5469 ok_to_expand_ = false; 5470 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1); 5471 } else { 5472 int new_factor = saved_expansion_factor_ * factor; 5473 ok_to_expand_ = (new_factor <= kMaxExpansionFactor); 5474 compiler->set_current_expansion_factor(new_factor); 5475 } 5476 } 5477 } 5478 5479 ~RegExpExpansionLimiter() { 5480 compiler_->set_current_expansion_factor(saved_expansion_factor_); 5481 } 5482 5483 bool ok_to_expand() { return ok_to_expand_; } 5484 5485 private: 5486 RegExpCompiler* compiler_; 5487 int saved_expansion_factor_; 5488 bool ok_to_expand_; 5489 5490 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); 5491}; 5492 5493 5494RegExpNode* RegExpQuantifier::ToNode(int min, 5495 int max, 5496 bool is_greedy, 5497 RegExpTree* body, 5498 RegExpCompiler* compiler, 5499 RegExpNode* on_success, 5500 bool not_at_start) { 5501 // x{f, t} becomes this: 5502 // 5503 // (r++)<-. 5504 // | ` 5505 // | (x) 5506 // v ^ 5507 // (r=0)-->(?)---/ [if r < t] 5508 // | 5509 // [if r >= f] \----> ... 5510 // 5511 5512 // 15.10.2.5 RepeatMatcher algorithm. 5513 // The parser has already eliminated the case where max is 0. In the case 5514 // where max_match is zero the parser has removed the quantifier if min was 5515 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom. 5516 5517 // If we know that we cannot match zero length then things are a little 5518 // simpler since we don't need to make the special zero length match check 5519 // from step 2.1. If the min and max are small we can unroll a little in 5520 // this case. 5521 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,} 5522 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3} 5523 if (max == 0) return on_success; // This can happen due to recursion. 5524 bool body_can_be_empty = (body->min_match() == 0); 5525 int body_start_reg = RegExpCompiler::kNoRegister; 5526 Interval capture_registers = body->CaptureRegisters(); 5527 bool needs_capture_clearing = !capture_registers.is_empty(); 5528 Zone* zone = compiler->zone(); 5529 5530 if (body_can_be_empty) { 5531 body_start_reg = compiler->AllocateRegister(); 5532 } else if (compiler->optimize() && !needs_capture_clearing) { 5533 // Only unroll if there are no captures and the body can't be 5534 // empty. 5535 { 5536 RegExpExpansionLimiter limiter( 5537 compiler, min + ((max != min) ? 1 : 0)); 5538 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) { 5539 int new_max = (max == kInfinity) ? max : max - min; 5540 // Recurse once to get the loop or optional matches after the fixed 5541 // ones. 5542 RegExpNode* answer = ToNode( 5543 0, new_max, is_greedy, body, compiler, on_success, true); 5544 // Unroll the forced matches from 0 to min. This can cause chains of 5545 // TextNodes (which the parser does not generate). These should be 5546 // combined if it turns out they hinder good code generation. 5547 for (int i = 0; i < min; i++) { 5548 answer = body->ToNode(compiler, answer); 5549 } 5550 return answer; 5551 } 5552 } 5553 if (max <= kMaxUnrolledMaxMatches && min == 0) { 5554 DCHECK(max > 0); // Due to the 'if' above. 5555 RegExpExpansionLimiter limiter(compiler, max); 5556 if (limiter.ok_to_expand()) { 5557 // Unroll the optional matches up to max. 5558 RegExpNode* answer = on_success; 5559 for (int i = 0; i < max; i++) { 5560 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone); 5561 if (is_greedy) { 5562 alternation->AddAlternative( 5563 GuardedAlternative(body->ToNode(compiler, answer))); 5564 alternation->AddAlternative(GuardedAlternative(on_success)); 5565 } else { 5566 alternation->AddAlternative(GuardedAlternative(on_success)); 5567 alternation->AddAlternative( 5568 GuardedAlternative(body->ToNode(compiler, answer))); 5569 } 5570 answer = alternation; 5571 if (not_at_start && !compiler->read_backward()) { 5572 alternation->set_not_at_start(); 5573 } 5574 } 5575 return answer; 5576 } 5577 } 5578 } 5579 bool has_min = min > 0; 5580 bool has_max = max < RegExpTree::kInfinity; 5581 bool needs_counter = has_min || has_max; 5582 int reg_ctr = needs_counter 5583 ? compiler->AllocateRegister() 5584 : RegExpCompiler::kNoRegister; 5585 LoopChoiceNode* center = new (zone) 5586 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone); 5587 if (not_at_start && !compiler->read_backward()) center->set_not_at_start(); 5588 RegExpNode* loop_return = needs_counter 5589 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) 5590 : static_cast<RegExpNode*>(center); 5591 if (body_can_be_empty) { 5592 // If the body can be empty we need to check if it was and then 5593 // backtrack. 5594 loop_return = ActionNode::EmptyMatchCheck(body_start_reg, 5595 reg_ctr, 5596 min, 5597 loop_return); 5598 } 5599 RegExpNode* body_node = body->ToNode(compiler, loop_return); 5600 if (body_can_be_empty) { 5601 // If the body can be empty we need to store the start position 5602 // so we can bail out if it was empty. 5603 body_node = ActionNode::StorePosition(body_start_reg, false, body_node); 5604 } 5605 if (needs_capture_clearing) { 5606 // Before entering the body of this loop we need to clear captures. 5607 body_node = ActionNode::ClearCaptures(capture_registers, body_node); 5608 } 5609 GuardedAlternative body_alt(body_node); 5610 if (has_max) { 5611 Guard* body_guard = 5612 new(zone) Guard(reg_ctr, Guard::LT, max); 5613 body_alt.AddGuard(body_guard, zone); 5614 } 5615 GuardedAlternative rest_alt(on_success); 5616 if (has_min) { 5617 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min); 5618 rest_alt.AddGuard(rest_guard, zone); 5619 } 5620 if (is_greedy) { 5621 center->AddLoopAlternative(body_alt); 5622 center->AddContinueAlternative(rest_alt); 5623 } else { 5624 center->AddContinueAlternative(rest_alt); 5625 center->AddLoopAlternative(body_alt); 5626 } 5627 if (needs_counter) { 5628 return ActionNode::SetRegister(reg_ctr, 0, center); 5629 } else { 5630 return center; 5631 } 5632} 5633 5634 5635RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, 5636 RegExpNode* on_success) { 5637 NodeInfo info; 5638 Zone* zone = compiler->zone(); 5639 5640 switch (assertion_type()) { 5641 case START_OF_LINE: 5642 return AssertionNode::AfterNewline(on_success); 5643 case START_OF_INPUT: 5644 return AssertionNode::AtStart(on_success); 5645 case BOUNDARY: 5646 return AssertionNode::AtBoundary(on_success); 5647 case NON_BOUNDARY: 5648 return AssertionNode::AtNonBoundary(on_success); 5649 case END_OF_INPUT: 5650 return AssertionNode::AtEnd(on_success); 5651 case END_OF_LINE: { 5652 // Compile $ in multiline regexps as an alternation with a positive 5653 // lookahead in one side and an end-of-input on the other side. 5654 // We need two registers for the lookahead. 5655 int stack_pointer_register = compiler->AllocateRegister(); 5656 int position_register = compiler->AllocateRegister(); 5657 // The ChoiceNode to distinguish between a newline and end-of-input. 5658 ChoiceNode* result = new(zone) ChoiceNode(2, zone); 5659 // Create a newline atom. 5660 ZoneList<CharacterRange>* newline_ranges = 5661 new(zone) ZoneList<CharacterRange>(3, zone); 5662 CharacterRange::AddClassEscape('n', newline_ranges, zone); 5663 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); 5664 TextNode* newline_matcher = new (zone) TextNode( 5665 newline_atom, false, ActionNode::PositiveSubmatchSuccess( 5666 stack_pointer_register, position_register, 5667 0, // No captures inside. 5668 -1, // Ignored if no captures. 5669 on_success)); 5670 // Create an end-of-input matcher. 5671 RegExpNode* end_of_line = ActionNode::BeginSubmatch( 5672 stack_pointer_register, 5673 position_register, 5674 newline_matcher); 5675 // Add the two alternatives to the ChoiceNode. 5676 GuardedAlternative eol_alternative(end_of_line); 5677 result->AddAlternative(eol_alternative); 5678 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success)); 5679 result->AddAlternative(end_alternative); 5680 return result; 5681 } 5682 default: 5683 UNREACHABLE(); 5684 } 5685 return on_success; 5686} 5687 5688 5689RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, 5690 RegExpNode* on_success) { 5691 return new (compiler->zone()) 5692 BackReferenceNode(RegExpCapture::StartRegister(index()), 5693 RegExpCapture::EndRegister(index()), 5694 compiler->read_backward(), on_success); 5695} 5696 5697 5698RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, 5699 RegExpNode* on_success) { 5700 return on_success; 5701} 5702 5703 5704RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, 5705 int stack_pointer_register, 5706 int position_register, 5707 int capture_register_count, 5708 int capture_register_start) 5709 : is_positive_(is_positive), 5710 on_success_(on_success), 5711 stack_pointer_register_(stack_pointer_register), 5712 position_register_(position_register) { 5713 if (is_positive_) { 5714 on_match_success_ = ActionNode::PositiveSubmatchSuccess( 5715 stack_pointer_register, position_register, capture_register_count, 5716 capture_register_start, on_success_); 5717 } else { 5718 Zone* zone = on_success_->zone(); 5719 on_match_success_ = new (zone) NegativeSubmatchSuccess( 5720 stack_pointer_register, position_register, capture_register_count, 5721 capture_register_start, zone); 5722 } 5723} 5724 5725 5726RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) { 5727 if (is_positive_) { 5728 return ActionNode::BeginSubmatch(stack_pointer_register_, 5729 position_register_, match); 5730 } else { 5731 Zone* zone = on_success_->zone(); 5732 // We use a ChoiceNode to represent the negative lookaround. The first 5733 // alternative is the negative match. On success, the end node backtracks. 5734 // On failure, the second alternative is tried and leads to success. 5735 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the 5736 // first exit when calculating quick checks. 5737 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode( 5738 GuardedAlternative(match), GuardedAlternative(on_success_), zone); 5739 return ActionNode::BeginSubmatch(stack_pointer_register_, 5740 position_register_, choice_node); 5741 } 5742} 5743 5744 5745RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler, 5746 RegExpNode* on_success) { 5747 int stack_pointer_register = compiler->AllocateRegister(); 5748 int position_register = compiler->AllocateRegister(); 5749 5750 const int registers_per_capture = 2; 5751 const int register_of_first_capture = 2; 5752 int register_count = capture_count_ * registers_per_capture; 5753 int register_start = 5754 register_of_first_capture + capture_from_ * registers_per_capture; 5755 5756 RegExpNode* result; 5757 bool was_reading_backward = compiler->read_backward(); 5758 compiler->set_read_backward(type() == LOOKBEHIND); 5759 Builder builder(is_positive(), on_success, stack_pointer_register, 5760 position_register, register_count, register_start); 5761 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success()); 5762 result = builder.ForMatch(match); 5763 compiler->set_read_backward(was_reading_backward); 5764 return result; 5765} 5766 5767 5768RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, 5769 RegExpNode* on_success) { 5770 return ToNode(body(), index(), compiler, on_success); 5771} 5772 5773 5774RegExpNode* RegExpCapture::ToNode(RegExpTree* body, 5775 int index, 5776 RegExpCompiler* compiler, 5777 RegExpNode* on_success) { 5778 DCHECK_NOT_NULL(body); 5779 int start_reg = RegExpCapture::StartRegister(index); 5780 int end_reg = RegExpCapture::EndRegister(index); 5781 if (compiler->read_backward()) std::swap(start_reg, end_reg); 5782 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success); 5783 RegExpNode* body_node = body->ToNode(compiler, store_end); 5784 return ActionNode::StorePosition(start_reg, true, body_node); 5785} 5786 5787 5788RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, 5789 RegExpNode* on_success) { 5790 ZoneList<RegExpTree*>* children = nodes(); 5791 RegExpNode* current = on_success; 5792 if (compiler->read_backward()) { 5793 for (int i = 0; i < children->length(); i++) { 5794 current = children->at(i)->ToNode(compiler, current); 5795 } 5796 } else { 5797 for (int i = children->length() - 1; i >= 0; i--) { 5798 current = children->at(i)->ToNode(compiler, current); 5799 } 5800 } 5801 return current; 5802} 5803 5804 5805static void AddClass(const int* elmv, 5806 int elmc, 5807 ZoneList<CharacterRange>* ranges, 5808 Zone* zone) { 5809 elmc--; 5810 DCHECK(elmv[elmc] == kRangeEndMarker); 5811 for (int i = 0; i < elmc; i += 2) { 5812 DCHECK(elmv[i] < elmv[i + 1]); 5813 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone); 5814 } 5815} 5816 5817 5818static void AddClassNegated(const int *elmv, 5819 int elmc, 5820 ZoneList<CharacterRange>* ranges, 5821 Zone* zone) { 5822 elmc--; 5823 DCHECK(elmv[elmc] == kRangeEndMarker); 5824 DCHECK(elmv[0] != 0x0000); 5825 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint); 5826 uc16 last = 0x0000; 5827 for (int i = 0; i < elmc; i += 2) { 5828 DCHECK(last <= elmv[i] - 1); 5829 DCHECK(elmv[i] < elmv[i + 1]); 5830 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); 5831 last = elmv[i + 1]; 5832 } 5833 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); 5834} 5835 5836 5837void CharacterRange::AddClassEscape(uc16 type, 5838 ZoneList<CharacterRange>* ranges, 5839 Zone* zone) { 5840 switch (type) { 5841 case 's': 5842 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5843 break; 5844 case 'S': 5845 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); 5846 break; 5847 case 'w': 5848 AddClass(kWordRanges, kWordRangeCount, ranges, zone); 5849 break; 5850 case 'W': 5851 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); 5852 break; 5853 case 'd': 5854 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); 5855 break; 5856 case 'D': 5857 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); 5858 break; 5859 case '.': 5860 AddClassNegated(kLineTerminatorRanges, 5861 kLineTerminatorRangeCount, 5862 ranges, 5863 zone); 5864 break; 5865 // This is not a character range as defined by the spec but a 5866 // convenient shorthand for a character class that matches any 5867 // character. 5868 case '*': 5869 ranges->Add(CharacterRange::Everything(), zone); 5870 break; 5871 // This is the set of characters matched by the $ and ^ symbols 5872 // in multiline mode. 5873 case 'n': 5874 AddClass(kLineTerminatorRanges, 5875 kLineTerminatorRangeCount, 5876 ranges, 5877 zone); 5878 break; 5879 default: 5880 UNREACHABLE(); 5881 } 5882} 5883 5884 5885Vector<const int> CharacterRange::GetWordBounds() { 5886 return Vector<const int>(kWordRanges, kWordRangeCount - 1); 5887} 5888 5889 5890void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, 5891 ZoneList<CharacterRange>* ranges, 5892 bool is_one_byte) { 5893 CharacterRange::Canonicalize(ranges); 5894 int range_count = ranges->length(); 5895 for (int i = 0; i < range_count; i++) { 5896 CharacterRange range = ranges->at(i); 5897 uc32 bottom = range.from(); 5898 if (bottom > String::kMaxUtf16CodeUnit) return; 5899 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); 5900 // Nothing to be done for surrogates. 5901 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; 5902 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { 5903 if (bottom > String::kMaxOneByteCharCode) return; 5904 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; 5905 } 5906 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5907 if (top == bottom) { 5908 // If this is a singleton we just expand the one character. 5909 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); 5910 for (int i = 0; i < length; i++) { 5911 uc32 chr = chars[i]; 5912 if (chr != bottom) { 5913 ranges->Add(CharacterRange::Singleton(chars[i]), zone); 5914 } 5915 } 5916 } else { 5917 // If this is a range we expand the characters block by block, expanding 5918 // contiguous subranges (blocks) one at a time. The approach is as 5919 // follows. For a given start character we look up the remainder of the 5920 // block that contains it (represented by the end point), for instance we 5921 // find 'z' if the character is 'c'. A block is characterized by the 5922 // property that all characters uncanonicalize in the same way, except 5923 // that each entry in the result is incremented by the distance from the 5924 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', 5925 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once 5926 // we've found the end point we look up its uncanonicalization and 5927 // produce a range for each element. For instance for [c-f] we look up 5928 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if 5929 // it is not already contained in the input, so [c-f] will be skipped but 5930 // [C-F] will be added. If this range is not completely contained in a 5931 // block we do this for all the blocks covered by the range (handling 5932 // characters that is not in a block as a "singleton block"). 5933 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 5934 int pos = bottom; 5935 while (pos <= top) { 5936 int length = 5937 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); 5938 uc32 block_end; 5939 if (length == 0) { 5940 block_end = pos; 5941 } else { 5942 DCHECK_EQ(1, length); 5943 block_end = equivalents[0]; 5944 } 5945 int end = (block_end > top) ? top : block_end; 5946 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', 5947 equivalents); 5948 for (int i = 0; i < length; i++) { 5949 uc32 c = equivalents[i]; 5950 uc32 range_from = c - (block_end - pos); 5951 uc32 range_to = c - (block_end - end); 5952 if (!(bottom <= range_from && range_to <= top)) { 5953 ranges->Add(CharacterRange::Range(range_from, range_to), zone); 5954 } 5955 } 5956 pos = end + 1; 5957 } 5958 } 5959 } 5960} 5961 5962 5963bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 5964 DCHECK_NOT_NULL(ranges); 5965 int n = ranges->length(); 5966 if (n <= 1) return true; 5967 int max = ranges->at(0).to(); 5968 for (int i = 1; i < n; i++) { 5969 CharacterRange next_range = ranges->at(i); 5970 if (next_range.from() <= max + 1) return false; 5971 max = next_range.to(); 5972 } 5973 return true; 5974} 5975 5976 5977ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { 5978 if (ranges_ == NULL) { 5979 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); 5980 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); 5981 } 5982 return ranges_; 5983} 5984 5985 5986// Move a number of elements in a zonelist to another position 5987// in the same list. Handles overlapping source and target areas. 5988static void MoveRanges(ZoneList<CharacterRange>* list, 5989 int from, 5990 int to, 5991 int count) { 5992 // Ranges are potentially overlapping. 5993 if (from < to) { 5994 for (int i = count - 1; i >= 0; i--) { 5995 list->at(to + i) = list->at(from + i); 5996 } 5997 } else { 5998 for (int i = 0; i < count; i++) { 5999 list->at(to + i) = list->at(from + i); 6000 } 6001 } 6002} 6003 6004 6005static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, 6006 int count, 6007 CharacterRange insert) { 6008 // Inserts a range into list[0..count[, which must be sorted 6009 // by from value and non-overlapping and non-adjacent, using at most 6010 // list[0..count] for the result. Returns the number of resulting 6011 // canonicalized ranges. Inserting a range may collapse existing ranges into 6012 // fewer ranges, so the return value can be anything in the range 1..count+1. 6013 uc32 from = insert.from(); 6014 uc32 to = insert.to(); 6015 int start_pos = 0; 6016 int end_pos = count; 6017 for (int i = count - 1; i >= 0; i--) { 6018 CharacterRange current = list->at(i); 6019 if (current.from() > to + 1) { 6020 end_pos = i; 6021 } else if (current.to() + 1 < from) { 6022 start_pos = i + 1; 6023 break; 6024 } 6025 } 6026 6027 // Inserted range overlaps, or is adjacent to, ranges at positions 6028 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are 6029 // not affected by the insertion. 6030 // If start_pos == end_pos, the range must be inserted before start_pos. 6031 // if start_pos < end_pos, the entire range from start_pos to end_pos 6032 // must be merged with the insert range. 6033 6034 if (start_pos == end_pos) { 6035 // Insert between existing ranges at position start_pos. 6036 if (start_pos < count) { 6037 MoveRanges(list, start_pos, start_pos + 1, count - start_pos); 6038 } 6039 list->at(start_pos) = insert; 6040 return count + 1; 6041 } 6042 if (start_pos + 1 == end_pos) { 6043 // Replace single existing range at position start_pos. 6044 CharacterRange to_replace = list->at(start_pos); 6045 int new_from = Min(to_replace.from(), from); 6046 int new_to = Max(to_replace.to(), to); 6047 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6048 return count; 6049 } 6050 // Replace a number of existing ranges from start_pos to end_pos - 1. 6051 // Move the remaining ranges down. 6052 6053 int new_from = Min(list->at(start_pos).from(), from); 6054 int new_to = Max(list->at(end_pos - 1).to(), to); 6055 if (end_pos < count) { 6056 MoveRanges(list, end_pos, start_pos + 1, count - end_pos); 6057 } 6058 list->at(start_pos) = CharacterRange::Range(new_from, new_to); 6059 return count - (end_pos - start_pos) + 1; 6060} 6061 6062 6063void CharacterSet::Canonicalize() { 6064 // Special/default classes are always considered canonical. The result 6065 // of calling ranges() will be sorted. 6066 if (ranges_ == NULL) return; 6067 CharacterRange::Canonicalize(ranges_); 6068} 6069 6070 6071void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { 6072 if (character_ranges->length() <= 1) return; 6073 // Check whether ranges are already canonical (increasing, non-overlapping, 6074 // non-adjacent). 6075 int n = character_ranges->length(); 6076 int max = character_ranges->at(0).to(); 6077 int i = 1; 6078 while (i < n) { 6079 CharacterRange current = character_ranges->at(i); 6080 if (current.from() <= max + 1) { 6081 break; 6082 } 6083 max = current.to(); 6084 i++; 6085 } 6086 // Canonical until the i'th range. If that's all of them, we are done. 6087 if (i == n) return; 6088 6089 // The ranges at index i and forward are not canonicalized. Make them so by 6090 // doing the equivalent of insertion sort (inserting each into the previous 6091 // list, in order). 6092 // Notice that inserting a range can reduce the number of ranges in the 6093 // result due to combining of adjacent and overlapping ranges. 6094 int read = i; // Range to insert. 6095 int num_canonical = i; // Length of canonicalized part of list. 6096 do { 6097 num_canonical = InsertRangeInCanonicalList(character_ranges, 6098 num_canonical, 6099 character_ranges->at(read)); 6100 read++; 6101 } while (read < n); 6102 character_ranges->Rewind(num_canonical); 6103 6104 DCHECK(CharacterRange::IsCanonical(character_ranges)); 6105} 6106 6107 6108void CharacterRange::Negate(ZoneList<CharacterRange>* ranges, 6109 ZoneList<CharacterRange>* negated_ranges, 6110 Zone* zone) { 6111 DCHECK(CharacterRange::IsCanonical(ranges)); 6112 DCHECK_EQ(0, negated_ranges->length()); 6113 int range_count = ranges->length(); 6114 uc32 from = 0; 6115 int i = 0; 6116 if (range_count > 0 && ranges->at(0).from() == 0) { 6117 from = ranges->at(0).to() + 1; 6118 i = 1; 6119 } 6120 while (i < range_count) { 6121 CharacterRange range = ranges->at(i); 6122 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone); 6123 from = range.to() + 1; 6124 i++; 6125 } 6126 if (from < String::kMaxCodePoint) { 6127 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), 6128 zone); 6129 } 6130} 6131 6132 6133// ------------------------------------------------------------------- 6134// Splay tree 6135 6136 6137OutSet* OutSet::Extend(unsigned value, Zone* zone) { 6138 if (Get(value)) 6139 return this; 6140 if (successors(zone) != NULL) { 6141 for (int i = 0; i < successors(zone)->length(); i++) { 6142 OutSet* successor = successors(zone)->at(i); 6143 if (successor->Get(value)) 6144 return successor; 6145 } 6146 } else { 6147 successors_ = new(zone) ZoneList<OutSet*>(2, zone); 6148 } 6149 OutSet* result = new(zone) OutSet(first_, remaining_); 6150 result->Set(value, zone); 6151 successors(zone)->Add(result, zone); 6152 return result; 6153} 6154 6155 6156void OutSet::Set(unsigned value, Zone *zone) { 6157 if (value < kFirstLimit) { 6158 first_ |= (1 << value); 6159 } else { 6160 if (remaining_ == NULL) 6161 remaining_ = new(zone) ZoneList<unsigned>(1, zone); 6162 if (remaining_->is_empty() || !remaining_->Contains(value)) 6163 remaining_->Add(value, zone); 6164 } 6165} 6166 6167 6168bool OutSet::Get(unsigned value) const { 6169 if (value < kFirstLimit) { 6170 return (first_ & (1 << value)) != 0; 6171 } else if (remaining_ == NULL) { 6172 return false; 6173 } else { 6174 return remaining_->Contains(value); 6175 } 6176} 6177 6178 6179const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; 6180 6181 6182void DispatchTable::AddRange(CharacterRange full_range, int value, 6183 Zone* zone) { 6184 CharacterRange current = full_range; 6185 if (tree()->is_empty()) { 6186 // If this is the first range we just insert into the table. 6187 ZoneSplayTree<Config>::Locator loc; 6188 bool inserted = tree()->Insert(current.from(), &loc); 6189 DCHECK(inserted); 6190 USE(inserted); 6191 loc.set_value(Entry(current.from(), current.to(), 6192 empty()->Extend(value, zone))); 6193 return; 6194 } 6195 // First see if there is a range to the left of this one that 6196 // overlaps. 6197 ZoneSplayTree<Config>::Locator loc; 6198 if (tree()->FindGreatestLessThan(current.from(), &loc)) { 6199 Entry* entry = &loc.value(); 6200 // If we've found a range that overlaps with this one, and it 6201 // starts strictly to the left of this one, we have to fix it 6202 // because the following code only handles ranges that start on 6203 // or after the start point of the range we're adding. 6204 if (entry->from() < current.from() && entry->to() >= current.from()) { 6205 // Snap the overlapping range in half around the start point of 6206 // the range we're adding. 6207 CharacterRange left = 6208 CharacterRange::Range(entry->from(), current.from() - 1); 6209 CharacterRange right = CharacterRange::Range(current.from(), entry->to()); 6210 // The left part of the overlapping range doesn't overlap. 6211 // Truncate the whole entry to be just the left part. 6212 entry->set_to(left.to()); 6213 // The right part is the one that overlaps. We add this part 6214 // to the map and let the next step deal with merging it with 6215 // the range we're adding. 6216 ZoneSplayTree<Config>::Locator loc; 6217 bool inserted = tree()->Insert(right.from(), &loc); 6218 DCHECK(inserted); 6219 USE(inserted); 6220 loc.set_value(Entry(right.from(), 6221 right.to(), 6222 entry->out_set())); 6223 } 6224 } 6225 while (current.is_valid()) { 6226 if (tree()->FindLeastGreaterThan(current.from(), &loc) && 6227 (loc.value().from() <= current.to()) && 6228 (loc.value().to() >= current.from())) { 6229 Entry* entry = &loc.value(); 6230 // We have overlap. If there is space between the start point of 6231 // the range we're adding and where the overlapping range starts 6232 // then we have to add a range covering just that space. 6233 if (current.from() < entry->from()) { 6234 ZoneSplayTree<Config>::Locator ins; 6235 bool inserted = tree()->Insert(current.from(), &ins); 6236 DCHECK(inserted); 6237 USE(inserted); 6238 ins.set_value(Entry(current.from(), 6239 entry->from() - 1, 6240 empty()->Extend(value, zone))); 6241 current.set_from(entry->from()); 6242 } 6243 DCHECK_EQ(current.from(), entry->from()); 6244 // If the overlapping range extends beyond the one we want to add 6245 // we have to snap the right part off and add it separately. 6246 if (entry->to() > current.to()) { 6247 ZoneSplayTree<Config>::Locator ins; 6248 bool inserted = tree()->Insert(current.to() + 1, &ins); 6249 DCHECK(inserted); 6250 USE(inserted); 6251 ins.set_value(Entry(current.to() + 1, 6252 entry->to(), 6253 entry->out_set())); 6254 entry->set_to(current.to()); 6255 } 6256 DCHECK(entry->to() <= current.to()); 6257 // The overlapping range is now completely contained by the range 6258 // we're adding so we can just update it and move the start point 6259 // of the range we're adding just past it. 6260 entry->AddValue(value, zone); 6261 DCHECK(entry->to() + 1 > current.from()); 6262 current.set_from(entry->to() + 1); 6263 } else { 6264 // There is no overlap so we can just add the range 6265 ZoneSplayTree<Config>::Locator ins; 6266 bool inserted = tree()->Insert(current.from(), &ins); 6267 DCHECK(inserted); 6268 USE(inserted); 6269 ins.set_value(Entry(current.from(), 6270 current.to(), 6271 empty()->Extend(value, zone))); 6272 break; 6273 } 6274 } 6275} 6276 6277 6278OutSet* DispatchTable::Get(uc32 value) { 6279 ZoneSplayTree<Config>::Locator loc; 6280 if (!tree()->FindGreatestLessThan(value, &loc)) 6281 return empty(); 6282 Entry* entry = &loc.value(); 6283 if (value <= entry->to()) 6284 return entry->out_set(); 6285 else 6286 return empty(); 6287} 6288 6289 6290// ------------------------------------------------------------------- 6291// Analysis 6292 6293 6294void Analysis::EnsureAnalyzed(RegExpNode* that) { 6295 StackLimitCheck check(isolate()); 6296 if (check.HasOverflowed()) { 6297 fail("Stack overflow"); 6298 return; 6299 } 6300 if (that->info()->been_analyzed || that->info()->being_analyzed) 6301 return; 6302 that->info()->being_analyzed = true; 6303 that->Accept(this); 6304 that->info()->being_analyzed = false; 6305 that->info()->been_analyzed = true; 6306} 6307 6308 6309void Analysis::VisitEnd(EndNode* that) { 6310 // nothing to do 6311} 6312 6313 6314void TextNode::CalculateOffsets() { 6315 int element_count = elements()->length(); 6316 // Set up the offsets of the elements relative to the start. This is a fixed 6317 // quantity since a TextNode can only contain fixed-width things. 6318 int cp_offset = 0; 6319 for (int i = 0; i < element_count; i++) { 6320 TextElement& elm = elements()->at(i); 6321 elm.set_cp_offset(cp_offset); 6322 cp_offset += elm.length(); 6323 } 6324} 6325 6326 6327void Analysis::VisitText(TextNode* that) { 6328 if (ignore_case()) { 6329 that->MakeCaseIndependent(isolate(), is_one_byte_); 6330 } 6331 EnsureAnalyzed(that->on_success()); 6332 if (!has_failed()) { 6333 that->CalculateOffsets(); 6334 } 6335} 6336 6337 6338void Analysis::VisitAction(ActionNode* that) { 6339 RegExpNode* target = that->on_success(); 6340 EnsureAnalyzed(target); 6341 if (!has_failed()) { 6342 // If the next node is interested in what it follows then this node 6343 // has to be interested too so it can pass the information on. 6344 that->info()->AddFromFollowing(target->info()); 6345 } 6346} 6347 6348 6349void Analysis::VisitChoice(ChoiceNode* that) { 6350 NodeInfo* info = that->info(); 6351 for (int i = 0; i < that->alternatives()->length(); i++) { 6352 RegExpNode* node = that->alternatives()->at(i).node(); 6353 EnsureAnalyzed(node); 6354 if (has_failed()) return; 6355 // Anything the following nodes need to know has to be known by 6356 // this node also, so it can pass it on. 6357 info->AddFromFollowing(node->info()); 6358 } 6359} 6360 6361 6362void Analysis::VisitLoopChoice(LoopChoiceNode* that) { 6363 NodeInfo* info = that->info(); 6364 for (int i = 0; i < that->alternatives()->length(); i++) { 6365 RegExpNode* node = that->alternatives()->at(i).node(); 6366 if (node != that->loop_node()) { 6367 EnsureAnalyzed(node); 6368 if (has_failed()) return; 6369 info->AddFromFollowing(node->info()); 6370 } 6371 } 6372 // Check the loop last since it may need the value of this node 6373 // to get a correct result. 6374 EnsureAnalyzed(that->loop_node()); 6375 if (!has_failed()) { 6376 info->AddFromFollowing(that->loop_node()->info()); 6377 } 6378} 6379 6380 6381void Analysis::VisitBackReference(BackReferenceNode* that) { 6382 EnsureAnalyzed(that->on_success()); 6383} 6384 6385 6386void Analysis::VisitAssertion(AssertionNode* that) { 6387 EnsureAnalyzed(that->on_success()); 6388} 6389 6390 6391void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6392 BoyerMooreLookahead* bm, 6393 bool not_at_start) { 6394 // Working out the set of characters that a backreference can match is too 6395 // hard, so we just say that any character can match. 6396 bm->SetRest(offset); 6397 SaveBMInfo(bm, not_at_start, offset); 6398} 6399 6400 6401STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6402 RegExpMacroAssembler::kTableSize); 6403 6404 6405void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6406 BoyerMooreLookahead* bm, bool not_at_start) { 6407 ZoneList<GuardedAlternative>* alts = alternatives(); 6408 budget = (budget - 1) / alts->length(); 6409 for (int i = 0; i < alts->length(); i++) { 6410 GuardedAlternative& alt = alts->at(i); 6411 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6412 bm->SetRest(offset); // Give up trying to fill in info. 6413 SaveBMInfo(bm, not_at_start, offset); 6414 return; 6415 } 6416 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6417 } 6418 SaveBMInfo(bm, not_at_start, offset); 6419} 6420 6421 6422void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6423 BoyerMooreLookahead* bm, bool not_at_start) { 6424 if (initial_offset >= bm->length()) return; 6425 int offset = initial_offset; 6426 int max_char = bm->max_char(); 6427 for (int i = 0; i < elements()->length(); i++) { 6428 if (offset >= bm->length()) { 6429 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6430 return; 6431 } 6432 TextElement text = elements()->at(i); 6433 if (text.text_type() == TextElement::ATOM) { 6434 RegExpAtom* atom = text.atom(); 6435 for (int j = 0; j < atom->length(); j++, offset++) { 6436 if (offset >= bm->length()) { 6437 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6438 return; 6439 } 6440 uc16 character = atom->data()[j]; 6441 if (bm->compiler()->ignore_case()) { 6442 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6443 int length = GetCaseIndependentLetters( 6444 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6445 chars); 6446 for (int j = 0; j < length; j++) { 6447 bm->Set(offset, chars[j]); 6448 } 6449 } else { 6450 if (character <= max_char) bm->Set(offset, character); 6451 } 6452 } 6453 } else { 6454 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6455 RegExpCharacterClass* char_class = text.char_class(); 6456 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6457 if (char_class->is_negated()) { 6458 bm->SetAll(offset); 6459 } else { 6460 for (int k = 0; k < ranges->length(); k++) { 6461 CharacterRange& range = ranges->at(k); 6462 if (range.from() > max_char) continue; 6463 int to = Min(max_char, static_cast<int>(range.to())); 6464 bm->SetInterval(offset, Interval(range.from(), to)); 6465 } 6466 } 6467 offset++; 6468 } 6469 } 6470 if (offset >= bm->length()) { 6471 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6472 return; 6473 } 6474 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6475 true); // Not at start after a text node. 6476 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6477} 6478 6479 6480// ------------------------------------------------------------------- 6481// Dispatch table construction 6482 6483 6484void DispatchTableConstructor::VisitEnd(EndNode* that) { 6485 AddRange(CharacterRange::Everything()); 6486} 6487 6488 6489void DispatchTableConstructor::BuildTable(ChoiceNode* node) { 6490 node->set_being_calculated(true); 6491 ZoneList<GuardedAlternative>* alternatives = node->alternatives(); 6492 for (int i = 0; i < alternatives->length(); i++) { 6493 set_choice_index(i); 6494 alternatives->at(i).node()->Accept(this); 6495 } 6496 node->set_being_calculated(false); 6497} 6498 6499 6500class AddDispatchRange { 6501 public: 6502 explicit AddDispatchRange(DispatchTableConstructor* constructor) 6503 : constructor_(constructor) { } 6504 void Call(uc32 from, DispatchTable::Entry entry); 6505 private: 6506 DispatchTableConstructor* constructor_; 6507}; 6508 6509 6510void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { 6511 constructor_->AddRange(CharacterRange::Range(from, entry.to())); 6512} 6513 6514 6515void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { 6516 if (node->being_calculated()) 6517 return; 6518 DispatchTable* table = node->GetTable(ignore_case_); 6519 AddDispatchRange adder(this); 6520 table->ForEach(&adder); 6521} 6522 6523 6524void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { 6525 // TODO(160): Find the node that we refer back to and propagate its start 6526 // set back to here. For now we just accept anything. 6527 AddRange(CharacterRange::Everything()); 6528} 6529 6530 6531void DispatchTableConstructor::VisitAssertion(AssertionNode* that) { 6532 RegExpNode* target = that->on_success(); 6533 target->Accept(this); 6534} 6535 6536 6537static int CompareRangeByFrom(const CharacterRange* a, 6538 const CharacterRange* b) { 6539 return Compare<uc16>(a->from(), b->from()); 6540} 6541 6542 6543void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { 6544 ranges->Sort(CompareRangeByFrom); 6545 uc16 last = 0; 6546 for (int i = 0; i < ranges->length(); i++) { 6547 CharacterRange range = ranges->at(i); 6548 if (last < range.from()) 6549 AddRange(CharacterRange::Range(last, range.from() - 1)); 6550 if (range.to() >= last) { 6551 if (range.to() == String::kMaxCodePoint) { 6552 return; 6553 } else { 6554 last = range.to() + 1; 6555 } 6556 } 6557 } 6558 AddRange(CharacterRange::Range(last, String::kMaxCodePoint)); 6559} 6560 6561 6562void DispatchTableConstructor::VisitText(TextNode* that) { 6563 TextElement elm = that->elements()->at(0); 6564 switch (elm.text_type()) { 6565 case TextElement::ATOM: { 6566 uc16 c = elm.atom()->data()[0]; 6567 AddRange(CharacterRange::Range(c, c)); 6568 break; 6569 } 6570 case TextElement::CHAR_CLASS: { 6571 RegExpCharacterClass* tree = elm.char_class(); 6572 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone()); 6573 if (tree->is_negated()) { 6574 AddInverse(ranges); 6575 } else { 6576 for (int i = 0; i < ranges->length(); i++) 6577 AddRange(ranges->at(i)); 6578 } 6579 break; 6580 } 6581 default: { 6582 UNIMPLEMENTED(); 6583 } 6584 } 6585} 6586 6587 6588void DispatchTableConstructor::VisitAction(ActionNode* that) { 6589 RegExpNode* target = that->on_success(); 6590 target->Accept(this); 6591} 6592 6593 6594RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, 6595 RegExpNode* on_success) { 6596 // If the regexp matching starts within a surrogate pair, step back 6597 // to the lead surrogate and start matching from there. 6598 DCHECK(!compiler->read_backward()); 6599 Zone* zone = compiler->zone(); 6600 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List( 6601 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); 6602 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List( 6603 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd)); 6604 6605 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); 6606 6607 int stack_register = compiler->UnicodeLookaroundStackRegister(); 6608 int position_register = compiler->UnicodeLookaroundPositionRegister(); 6609 RegExpNode* step_back = TextNode::CreateForCharacterRanges( 6610 zone, lead_surrogates, true, on_success); 6611 RegExpLookaround::Builder builder(true, step_back, stack_register, 6612 position_register); 6613 RegExpNode* match_trail = TextNode::CreateForCharacterRanges( 6614 zone, trail_surrogates, false, builder.on_match_success()); 6615 6616 optional_step_back->AddAlternative( 6617 GuardedAlternative(builder.ForMatch(match_trail))); 6618 optional_step_back->AddAlternative(GuardedAlternative(on_success)); 6619 6620 return optional_step_back; 6621} 6622 6623 6624RegExpEngine::CompilationResult RegExpEngine::Compile( 6625 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6626 JSRegExp::Flags flags, Handle<String> pattern, 6627 Handle<String> sample_subject, bool is_one_byte) { 6628 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6629 return IrregexpRegExpTooBig(isolate); 6630 } 6631 bool ignore_case = flags & JSRegExp::kIgnoreCase; 6632 bool is_sticky = flags & JSRegExp::kSticky; 6633 bool is_global = flags & JSRegExp::kGlobal; 6634 bool is_unicode = flags & JSRegExp::kUnicode; 6635 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6636 is_one_byte); 6637 6638 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6639 6640 // Sample some characters from the middle of the string. 6641 static const int kSampleSize = 128; 6642 6643 sample_subject = String::Flatten(sample_subject); 6644 int chars_sampled = 0; 6645 int half_way = (sample_subject->length() - kSampleSize) / 2; 6646 for (int i = Max(0, half_way); 6647 i < sample_subject->length() && chars_sampled < kSampleSize; 6648 i++, chars_sampled++) { 6649 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); 6650 } 6651 6652 // Wrap the body of the regexp in capture #0. 6653 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6654 0, 6655 &compiler, 6656 compiler.accept()); 6657 RegExpNode* node = captured_body; 6658 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6659 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6660 int max_length = data->tree->max_match(); 6661 if (!is_start_anchored && !is_sticky) { 6662 // Add a .*? at the beginning, outside the body capture, unless 6663 // this expression is anchored at the beginning or sticky. 6664 RegExpNode* loop_node = RegExpQuantifier::ToNode( 6665 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), 6666 &compiler, captured_body, data->contains_anchor); 6667 6668 if (data->contains_anchor) { 6669 // Unroll loop once, to take care of the case that might start 6670 // at the start of input. 6671 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6672 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6673 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6674 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6675 node = first_step_node; 6676 } else { 6677 node = loop_node; 6678 } 6679 } 6680 if (is_one_byte) { 6681 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6682 // Do it again to propagate the new nodes to places where they were not 6683 // put because they had not been calculated yet. 6684 if (node != NULL) { 6685 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6686 } 6687 } else if (compiler.unicode() && (is_global || is_sticky)) { 6688 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6689 } 6690 6691 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6692 data->node = node; 6693 Analysis analysis(isolate, flags, is_one_byte); 6694 analysis.EnsureAnalyzed(node); 6695 if (analysis.has_failed()) { 6696 const char* error_message = analysis.error_message(); 6697 return CompilationResult(isolate, error_message); 6698 } 6699 6700 // Create the correct assembler for the architecture. 6701#ifndef V8_INTERPRETED_REGEXP 6702 // Native regexp implementation. 6703 6704 NativeRegExpMacroAssembler::Mode mode = 6705 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 6706 : NativeRegExpMacroAssembler::UC16; 6707 6708#if V8_TARGET_ARCH_IA32 6709 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode, 6710 (data->capture_count + 1) * 2); 6711#elif V8_TARGET_ARCH_X64 6712 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode, 6713 (data->capture_count + 1) * 2); 6714#elif V8_TARGET_ARCH_ARM 6715 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode, 6716 (data->capture_count + 1) * 2); 6717#elif V8_TARGET_ARCH_ARM64 6718 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode, 6719 (data->capture_count + 1) * 2); 6720#elif V8_TARGET_ARCH_S390 6721 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode, 6722 (data->capture_count + 1) * 2); 6723#elif V8_TARGET_ARCH_PPC 6724 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode, 6725 (data->capture_count + 1) * 2); 6726#elif V8_TARGET_ARCH_MIPS 6727 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6728 (data->capture_count + 1) * 2); 6729#elif V8_TARGET_ARCH_MIPS64 6730 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode, 6731 (data->capture_count + 1) * 2); 6732#elif V8_TARGET_ARCH_X87 6733 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode, 6734 (data->capture_count + 1) * 2); 6735#else 6736#error "Unsupported architecture" 6737#endif 6738 6739#else // V8_INTERPRETED_REGEXP 6740 // Interpreted regexp implementation. 6741 EmbeddedVector<byte, 1024> codes; 6742 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone); 6743#endif // V8_INTERPRETED_REGEXP 6744 6745 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern)); 6746 6747 // Inserted here, instead of in Assembler, because it depends on information 6748 // in the AST that isn't replicated in the Node structure. 6749 static const int kMaxBacksearchLimit = 1024; 6750 if (is_end_anchored && 6751 !is_start_anchored && 6752 max_length < kMaxBacksearchLimit) { 6753 macro_assembler.SetCurrentPositionFromEnd(max_length); 6754 } 6755 6756 if (is_global) { 6757 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; 6758 if (data->tree->min_match() > 0) { 6759 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; 6760 } else if (is_unicode) { 6761 mode = RegExpMacroAssembler::GLOBAL_UNICODE; 6762 } 6763 macro_assembler.set_global_mode(mode); 6764 } 6765 6766 return compiler.Assemble(¯o_assembler, 6767 node, 6768 data->capture_count, 6769 pattern); 6770} 6771 6772 6773bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) { 6774 Heap* heap = pattern->GetHeap(); 6775 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6776 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6777 heap->memory_allocator()->SizeExecutable() > 6778 RegExpImpl::kRegExpExecutableMemoryLimit) { 6779 too_much = true; 6780 } 6781 return too_much; 6782} 6783 6784 6785Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string, 6786 Object* key_pattern, 6787 FixedArray** last_match_cache, 6788 ResultsCacheType type) { 6789 FixedArray* cache; 6790 if (!key_string->IsInternalizedString()) return Smi::FromInt(0); 6791 if (type == STRING_SPLIT_SUBSTRINGS) { 6792 DCHECK(key_pattern->IsString()); 6793 if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0); 6794 cache = heap->string_split_cache(); 6795 } else { 6796 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6797 DCHECK(key_pattern->IsFixedArray()); 6798 cache = heap->regexp_multiple_cache(); 6799 } 6800 6801 uint32_t hash = key_string->Hash(); 6802 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6803 ~(kArrayEntriesPerCacheEntry - 1)); 6804 if (cache->get(index + kStringOffset) != key_string || 6805 cache->get(index + kPatternOffset) != key_pattern) { 6806 index = 6807 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6808 if (cache->get(index + kStringOffset) != key_string || 6809 cache->get(index + kPatternOffset) != key_pattern) { 6810 return Smi::FromInt(0); 6811 } 6812 } 6813 6814 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset)); 6815 return cache->get(index + kArrayOffset); 6816} 6817 6818 6819void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string, 6820 Handle<Object> key_pattern, 6821 Handle<FixedArray> value_array, 6822 Handle<FixedArray> last_match_cache, 6823 ResultsCacheType type) { 6824 Factory* factory = isolate->factory(); 6825 Handle<FixedArray> cache; 6826 if (!key_string->IsInternalizedString()) return; 6827 if (type == STRING_SPLIT_SUBSTRINGS) { 6828 DCHECK(key_pattern->IsString()); 6829 if (!key_pattern->IsInternalizedString()) return; 6830 cache = factory->string_split_cache(); 6831 } else { 6832 DCHECK(type == REGEXP_MULTIPLE_INDICES); 6833 DCHECK(key_pattern->IsFixedArray()); 6834 cache = factory->regexp_multiple_cache(); 6835 } 6836 6837 uint32_t hash = key_string->Hash(); 6838 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) & 6839 ~(kArrayEntriesPerCacheEntry - 1)); 6840 if (cache->get(index + kStringOffset) == Smi::FromInt(0)) { 6841 cache->set(index + kStringOffset, *key_string); 6842 cache->set(index + kPatternOffset, *key_pattern); 6843 cache->set(index + kArrayOffset, *value_array); 6844 cache->set(index + kLastMatchOffset, *last_match_cache); 6845 } else { 6846 uint32_t index2 = 6847 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1)); 6848 if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) { 6849 cache->set(index2 + kStringOffset, *key_string); 6850 cache->set(index2 + kPatternOffset, *key_pattern); 6851 cache->set(index2 + kArrayOffset, *value_array); 6852 cache->set(index2 + kLastMatchOffset, *last_match_cache); 6853 } else { 6854 cache->set(index2 + kStringOffset, Smi::FromInt(0)); 6855 cache->set(index2 + kPatternOffset, Smi::FromInt(0)); 6856 cache->set(index2 + kArrayOffset, Smi::FromInt(0)); 6857 cache->set(index2 + kLastMatchOffset, Smi::FromInt(0)); 6858 cache->set(index + kStringOffset, *key_string); 6859 cache->set(index + kPatternOffset, *key_pattern); 6860 cache->set(index + kArrayOffset, *value_array); 6861 cache->set(index + kLastMatchOffset, *last_match_cache); 6862 } 6863 } 6864 // If the array is a reasonably short list of substrings, convert it into a 6865 // list of internalized strings. 6866 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) { 6867 for (int i = 0; i < value_array->length(); i++) { 6868 Handle<String> str(String::cast(value_array->get(i)), isolate); 6869 Handle<String> internalized_str = factory->InternalizeString(str); 6870 value_array->set(i, *internalized_str); 6871 } 6872 } 6873 // Convert backing store to a copy-on-write array. 6874 value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map()); 6875} 6876 6877 6878void RegExpResultsCache::Clear(FixedArray* cache) { 6879 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6880 cache->set(i, Smi::FromInt(0)); 6881 } 6882} 6883 6884} // namespace internal 6885} // namespace v8 6886