1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/v8.h"
6
7#include "src/ast.h"
8#include "src/base/platform/platform.h"
9#include "src/compilation-cache.h"
10#include "src/compiler.h"
11#include "src/execution.h"
12#include "src/factory.h"
13#include "src/jsregexp-inl.h"
14#include "src/jsregexp.h"
15#include "src/ostreams.h"
16#include "src/parser.h"
17#include "src/regexp-macro-assembler.h"
18#include "src/regexp-macro-assembler-irregexp.h"
19#include "src/regexp-macro-assembler-tracer.h"
20#include "src/regexp-stack.h"
21#include "src/runtime.h"
22#include "src/string-search.h"
23
24#ifndef V8_INTERPRETED_REGEXP
25#if V8_TARGET_ARCH_IA32
26#include "src/ia32/regexp-macro-assembler-ia32.h"  // NOLINT
27#elif V8_TARGET_ARCH_X64
28#include "src/x64/regexp-macro-assembler-x64.h"  // NOLINT
29#elif V8_TARGET_ARCH_ARM64
30#include "src/arm64/regexp-macro-assembler-arm64.h"  // NOLINT
31#elif V8_TARGET_ARCH_ARM
32#include "src/arm/regexp-macro-assembler-arm.h"  // NOLINT
33#elif V8_TARGET_ARCH_MIPS
34#include "src/mips/regexp-macro-assembler-mips.h"  // NOLINT
35#elif V8_TARGET_ARCH_MIPS64
36#include "src/mips64/regexp-macro-assembler-mips64.h"  // NOLINT
37#elif V8_TARGET_ARCH_X87
38#include "src/x87/regexp-macro-assembler-x87.h"  // NOLINT
39#else
40#error Unsupported target architecture.
41#endif
42#endif
43
44#include "src/interpreter-irregexp.h"
45
46
47namespace v8 {
48namespace internal {
49
50MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
51    Handle<JSFunction> constructor,
52    Handle<String> pattern,
53    Handle<String> flags) {
54  // Call the construct code with 2 arguments.
55  Handle<Object> argv[] = { pattern, flags };
56  return Execution::New(constructor, arraysize(argv), argv);
57}
58
59
60static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
61  int flags = JSRegExp::NONE;
62  for (int i = 0; i < str->length(); i++) {
63    switch (str->Get(i)) {
64      case 'i':
65        flags |= JSRegExp::IGNORE_CASE;
66        break;
67      case 'g':
68        flags |= JSRegExp::GLOBAL;
69        break;
70      case 'm':
71        flags |= JSRegExp::MULTILINE;
72        break;
73      case 'y':
74        if (FLAG_harmony_regexps) flags |= JSRegExp::STICKY;
75        break;
76    }
77  }
78  return JSRegExp::Flags(flags);
79}
80
81
82MUST_USE_RESULT
83static inline MaybeHandle<Object> ThrowRegExpException(
84    Handle<JSRegExp> re,
85    Handle<String> pattern,
86    Handle<String> error_text,
87    const char* message) {
88  Isolate* isolate = re->GetIsolate();
89  Factory* factory = isolate->factory();
90  Handle<FixedArray> elements = factory->NewFixedArray(2);
91  elements->set(0, *pattern);
92  elements->set(1, *error_text);
93  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
94  Handle<Object> regexp_err;
95  THROW_NEW_ERROR(isolate, NewSyntaxError(message, array), Object);
96}
97
98
99ContainedInLattice AddRange(ContainedInLattice containment,
100                            const int* ranges,
101                            int ranges_length,
102                            Interval new_range) {
103  DCHECK((ranges_length & 1) == 1);
104  DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
105  if (containment == kLatticeUnknown) return containment;
106  bool inside = false;
107  int last = 0;
108  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
109    // Consider the range from last to ranges[i].
110    // We haven't got to the new range yet.
111    if (ranges[i] <= new_range.from()) continue;
112    // New range is wholly inside last-ranges[i].  Note that new_range.to() is
113    // inclusive, but the values in ranges are not.
114    if (last <= new_range.from() && new_range.to() < ranges[i]) {
115      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
116    }
117    return kLatticeUnknown;
118  }
119  return containment;
120}
121
122
123// More makes code generation slower, less makes V8 benchmark score lower.
124const int kMaxLookaheadForBoyerMoore = 8;
125// In a 3-character pattern you can maximally step forwards 3 characters
126// at a time, which is not always enough to pay for the extra logic.
127const int kPatternTooShortForBoyerMoore = 2;
128
129
130// Identifies the sort of regexps where the regexp engine is faster
131// than the code used for atom matches.
132static bool HasFewDifferentCharacters(Handle<String> pattern) {
133  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
134  if (length <= kPatternTooShortForBoyerMoore) return false;
135  const int kMod = 128;
136  bool character_found[kMod];
137  int different = 0;
138  memset(&character_found[0], 0, sizeof(character_found));
139  for (int i = 0; i < length; i++) {
140    int ch = (pattern->Get(i) & (kMod - 1));
141    if (!character_found[ch]) {
142      character_found[ch] = true;
143      different++;
144      // We declare a regexp low-alphabet if it has at least 3 times as many
145      // characters as it has different characters.
146      if (different * 3 > length) return false;
147    }
148  }
149  return true;
150}
151
152
153// Generic RegExp methods. Dispatches to implementation specific methods.
154
155
156MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
157                                        Handle<String> pattern,
158                                        Handle<String> flag_str) {
159  Isolate* isolate = re->GetIsolate();
160  Zone zone(isolate);
161  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
162  CompilationCache* compilation_cache = isolate->compilation_cache();
163  MaybeHandle<FixedArray> maybe_cached =
164      compilation_cache->LookupRegExp(pattern, flags);
165  Handle<FixedArray> cached;
166  bool in_cache = maybe_cached.ToHandle(&cached);
167  LOG(isolate, RegExpCompileEvent(re, in_cache));
168
169  Handle<Object> result;
170  if (in_cache) {
171    re->set_data(*cached);
172    return re;
173  }
174  pattern = String::Flatten(pattern);
175  PostponeInterruptsScope postpone(isolate);
176  RegExpCompileData parse_result;
177  FlatStringReader reader(isolate, pattern);
178  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
179                                 &parse_result, &zone)) {
180    // Throw an exception if we fail to parse the pattern.
181    return ThrowRegExpException(re,
182                                pattern,
183                                parse_result.error,
184                                "malformed_regexp");
185  }
186
187  bool has_been_compiled = false;
188
189  if (parse_result.simple &&
190      !flags.is_ignore_case() &&
191      !flags.is_sticky() &&
192      !HasFewDifferentCharacters(pattern)) {
193    // Parse-tree is a single atom that is equal to the pattern.
194    AtomCompile(re, pattern, flags, pattern);
195    has_been_compiled = true;
196  } else if (parse_result.tree->IsAtom() &&
197      !flags.is_ignore_case() &&
198      !flags.is_sticky() &&
199      parse_result.capture_count == 0) {
200    RegExpAtom* atom = parse_result.tree->AsAtom();
201    Vector<const uc16> atom_pattern = atom->data();
202    Handle<String> atom_string;
203    ASSIGN_RETURN_ON_EXCEPTION(
204        isolate, atom_string,
205        isolate->factory()->NewStringFromTwoByte(atom_pattern),
206        Object);
207    if (!HasFewDifferentCharacters(atom_string)) {
208      AtomCompile(re, pattern, flags, atom_string);
209      has_been_compiled = true;
210    }
211  }
212  if (!has_been_compiled) {
213    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
214  }
215  DCHECK(re->data()->IsFixedArray());
216  // Compilation succeeded so the data is set on the regexp
217  // and we can store it in the cache.
218  Handle<FixedArray> data(FixedArray::cast(re->data()));
219  compilation_cache->PutRegExp(pattern, flags, data);
220
221  return re;
222}
223
224
225MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
226                                     Handle<String> subject,
227                                     int index,
228                                     Handle<JSArray> last_match_info) {
229  switch (regexp->TypeTag()) {
230    case JSRegExp::ATOM:
231      return AtomExec(regexp, subject, index, last_match_info);
232    case JSRegExp::IRREGEXP: {
233      return IrregexpExec(regexp, subject, index, last_match_info);
234    }
235    default:
236      UNREACHABLE();
237      return MaybeHandle<Object>();
238  }
239}
240
241
242// RegExp Atom implementation: Simple string search using indexOf.
243
244
245void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
246                             Handle<String> pattern,
247                             JSRegExp::Flags flags,
248                             Handle<String> match_pattern) {
249  re->GetIsolate()->factory()->SetRegExpAtomData(re,
250                                                 JSRegExp::ATOM,
251                                                 pattern,
252                                                 flags,
253                                                 match_pattern);
254}
255
256
257static void SetAtomLastCapture(FixedArray* array,
258                               String* subject,
259                               int from,
260                               int to) {
261  SealHandleScope shs(array->GetIsolate());
262  RegExpImpl::SetLastCaptureCount(array, 2);
263  RegExpImpl::SetLastSubject(array, subject);
264  RegExpImpl::SetLastInput(array, subject);
265  RegExpImpl::SetCapture(array, 0, from);
266  RegExpImpl::SetCapture(array, 1, to);
267}
268
269
270int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
271                            Handle<String> subject,
272                            int index,
273                            int32_t* output,
274                            int output_size) {
275  Isolate* isolate = regexp->GetIsolate();
276
277  DCHECK(0 <= index);
278  DCHECK(index <= subject->length());
279
280  subject = String::Flatten(subject);
281  DisallowHeapAllocation no_gc;  // ensure vectors stay valid
282
283  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
284  int needle_len = needle->length();
285  DCHECK(needle->IsFlat());
286  DCHECK_LT(0, needle_len);
287
288  if (index + needle_len > subject->length()) {
289    return RegExpImpl::RE_FAILURE;
290  }
291
292  for (int i = 0; i < output_size; i += 2) {
293    String::FlatContent needle_content = needle->GetFlatContent();
294    String::FlatContent subject_content = subject->GetFlatContent();
295    DCHECK(needle_content.IsFlat());
296    DCHECK(subject_content.IsFlat());
297    // dispatch on type of strings
298    index =
299        (needle_content.IsOneByte()
300             ? (subject_content.IsOneByte()
301                    ? SearchString(isolate, subject_content.ToOneByteVector(),
302                                   needle_content.ToOneByteVector(), index)
303                    : SearchString(isolate, subject_content.ToUC16Vector(),
304                                   needle_content.ToOneByteVector(), index))
305             : (subject_content.IsOneByte()
306                    ? SearchString(isolate, subject_content.ToOneByteVector(),
307                                   needle_content.ToUC16Vector(), index)
308                    : SearchString(isolate, subject_content.ToUC16Vector(),
309                                   needle_content.ToUC16Vector(), index)));
310    if (index == -1) {
311      return i / 2;  // Return number of matches.
312    } else {
313      output[i] = index;
314      output[i+1] = index + needle_len;
315      index += needle_len;
316    }
317  }
318  return output_size / 2;
319}
320
321
322Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
323                                    Handle<String> subject,
324                                    int index,
325                                    Handle<JSArray> last_match_info) {
326  Isolate* isolate = re->GetIsolate();
327
328  static const int kNumRegisters = 2;
329  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
330  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
331
332  int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
333
334  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
335
336  DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
337  SealHandleScope shs(isolate);
338  FixedArray* array = FixedArray::cast(last_match_info->elements());
339  SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
340  return last_match_info;
341}
342
343
344// Irregexp implementation.
345
346// Ensures that the regexp object contains a compiled version of the
347// source for either one-byte or two-byte subject strings.
348// If the compiled version doesn't already exist, it is compiled
349// from the source pattern.
350// If compilation fails, an exception is thrown and this function
351// returns false.
352bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
353                                        Handle<String> sample_subject,
354                                        bool is_one_byte) {
355  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
356#ifdef V8_INTERPRETED_REGEXP
357  if (compiled_code->IsByteArray()) return true;
358#else  // V8_INTERPRETED_REGEXP (RegExp native code)
359  if (compiled_code->IsCode()) return true;
360#endif
361  // We could potentially have marked this as flushable, but have kept
362  // a saved version if we did not flush it yet.
363  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
364  if (saved_code->IsCode()) {
365    // Reinstate the code in the original place.
366    re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
367    DCHECK(compiled_code->IsSmi());
368    return true;
369  }
370  return CompileIrregexp(re, sample_subject, is_one_byte);
371}
372
373
374static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
375                                            Handle<String> error_message,
376                                            Isolate* isolate) {
377  Factory* factory = isolate->factory();
378  Handle<FixedArray> elements = factory->NewFixedArray(2);
379  elements->set(0, re->Pattern());
380  elements->set(1, *error_message);
381  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
382  Handle<Object> error;
383  MaybeHandle<Object> maybe_error =
384      factory->NewSyntaxError("malformed_regexp", array);
385  if (maybe_error.ToHandle(&error)) isolate->Throw(*error);
386}
387
388
389bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
390                                 Handle<String> sample_subject,
391                                 bool is_one_byte) {
392  // Compile the RegExp.
393  Isolate* isolate = re->GetIsolate();
394  Zone zone(isolate);
395  PostponeInterruptsScope postpone(isolate);
396  // If we had a compilation error the last time this is saved at the
397  // saved code index.
398  Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
399  // When arriving here entry can only be a smi, either representing an
400  // uncompiled regexp, a previous compilation error, or code that has
401  // been flushed.
402  DCHECK(entry->IsSmi());
403  int entry_value = Smi::cast(entry)->value();
404  DCHECK(entry_value == JSRegExp::kUninitializedValue ||
405         entry_value == JSRegExp::kCompilationErrorValue ||
406         (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
407
408  if (entry_value == JSRegExp::kCompilationErrorValue) {
409    // A previous compilation failed and threw an error which we store in
410    // the saved code index (we store the error message, not the actual
411    // error). Recreate the error object and throw it.
412    Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
413    DCHECK(error_string->IsString());
414    Handle<String> error_message(String::cast(error_string));
415    CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
416    return false;
417  }
418
419  JSRegExp::Flags flags = re->GetFlags();
420
421  Handle<String> pattern(re->Pattern());
422  pattern = String::Flatten(pattern);
423  RegExpCompileData compile_data;
424  FlatStringReader reader(isolate, pattern);
425  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
426                                 &compile_data,
427                                 &zone)) {
428    // Throw an exception if we fail to parse the pattern.
429    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
430    USE(ThrowRegExpException(re,
431                             pattern,
432                             compile_data.error,
433                             "malformed_regexp"));
434    return false;
435  }
436  RegExpEngine::CompilationResult result = RegExpEngine::Compile(
437      &compile_data, flags.is_ignore_case(), flags.is_global(),
438      flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,
439      is_one_byte, &zone);
440  if (result.error_message != NULL) {
441    // Unable to compile regexp.
442    Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
443        CStrVector(result.error_message)).ToHandleChecked();
444    CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
445    return false;
446  }
447
448  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
449  data->set(JSRegExp::code_index(is_one_byte), result.code);
450  int register_max = IrregexpMaxRegisterCount(*data);
451  if (result.num_registers > register_max) {
452    SetIrregexpMaxRegisterCount(*data, result.num_registers);
453  }
454
455  return true;
456}
457
458
459int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
460  return Smi::cast(
461      re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
462}
463
464
465void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
466  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
467}
468
469
470int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
471  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
472}
473
474
475int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
476  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
477}
478
479
480ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
481  return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
482}
483
484
485Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
486  return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
487}
488
489
490void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
491                                    Handle<String> pattern,
492                                    JSRegExp::Flags flags,
493                                    int capture_count) {
494  // Initialize compiled code entries to null.
495  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
496                                                     JSRegExp::IRREGEXP,
497                                                     pattern,
498                                                     flags,
499                                                     capture_count);
500}
501
502
503int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
504                                Handle<String> subject) {
505  subject = String::Flatten(subject);
506
507  // Check representation of the underlying storage.
508  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
509  if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
510
511#ifdef V8_INTERPRETED_REGEXP
512  // Byte-code regexp needs space allocated for all its registers.
513  // The result captures are copied to the start of the registers array
514  // if the match succeeds.  This way those registers are not clobbered
515  // when we set the last match info from last successful match.
516  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
517         (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
518#else  // V8_INTERPRETED_REGEXP
519  // Native regexp only needs room to output captures. Registers are handled
520  // internally.
521  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
522#endif  // V8_INTERPRETED_REGEXP
523}
524
525
526int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
527                                Handle<String> subject,
528                                int index,
529                                int32_t* output,
530                                int output_size) {
531  Isolate* isolate = regexp->GetIsolate();
532
533  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
534
535  DCHECK(index >= 0);
536  DCHECK(index <= subject->length());
537  DCHECK(subject->IsFlat());
538
539  bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
540
541#ifndef V8_INTERPRETED_REGEXP
542  DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
543  do {
544    EnsureCompiledIrregexp(regexp, subject, is_one_byte);
545    Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
546    // The stack is used to allocate registers for the compiled regexp code.
547    // This means that in case of failure, the output registers array is left
548    // untouched and contains the capture results from the previous successful
549    // match.  We can use that to set the last match info lazily.
550    NativeRegExpMacroAssembler::Result res =
551        NativeRegExpMacroAssembler::Match(code,
552                                          subject,
553                                          output,
554                                          output_size,
555                                          index,
556                                          isolate);
557    if (res != NativeRegExpMacroAssembler::RETRY) {
558      DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
559             isolate->has_pending_exception());
560      STATIC_ASSERT(
561          static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
562      STATIC_ASSERT(
563          static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
564      STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
565                    == RE_EXCEPTION);
566      return static_cast<IrregexpResult>(res);
567    }
568    // If result is RETRY, the string has changed representation, and we
569    // must restart from scratch.
570    // In this case, it means we must make sure we are prepared to handle
571    // the, potentially, different subject (the string can switch between
572    // being internal and external, and even between being Latin1 and UC16,
573    // but the characters are always the same).
574    IrregexpPrepare(regexp, subject);
575    is_one_byte = subject->IsOneByteRepresentationUnderneath();
576  } while (true);
577  UNREACHABLE();
578  return RE_EXCEPTION;
579#else  // V8_INTERPRETED_REGEXP
580
581  DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
582  // We must have done EnsureCompiledIrregexp, so we can get the number of
583  // registers.
584  int number_of_capture_registers =
585      (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
586  int32_t* raw_output = &output[number_of_capture_registers];
587  // We do not touch the actual capture result registers until we know there
588  // has been a match so that we can use those capture results to set the
589  // last match info.
590  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
591    raw_output[i] = -1;
592  }
593  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
594                               isolate);
595
596  IrregexpResult result = IrregexpInterpreter::Match(isolate,
597                                                     byte_codes,
598                                                     subject,
599                                                     raw_output,
600                                                     index);
601  if (result == RE_SUCCESS) {
602    // Copy capture results to the start of the registers array.
603    MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
604  }
605  if (result == RE_EXCEPTION) {
606    DCHECK(!isolate->has_pending_exception());
607    isolate->StackOverflow();
608  }
609  return result;
610#endif  // V8_INTERPRETED_REGEXP
611}
612
613
614MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
615                                             Handle<String> subject,
616                                             int previous_index,
617                                             Handle<JSArray> last_match_info) {
618  Isolate* isolate = regexp->GetIsolate();
619  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
620
621  // Prepare space for the return values.
622#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
623  if (FLAG_trace_regexp_bytecodes) {
624    String* pattern = regexp->Pattern();
625    PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
626    PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
627  }
628#endif
629  int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
630  if (required_registers < 0) {
631    // Compiling failed with an exception.
632    DCHECK(isolate->has_pending_exception());
633    return MaybeHandle<Object>();
634  }
635
636  int32_t* output_registers = NULL;
637  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
638    output_registers = NewArray<int32_t>(required_registers);
639  }
640  SmartArrayPointer<int32_t> auto_release(output_registers);
641  if (output_registers == NULL) {
642    output_registers = isolate->jsregexp_static_offsets_vector();
643  }
644
645  int res = RegExpImpl::IrregexpExecRaw(
646      regexp, subject, previous_index, output_registers, required_registers);
647  if (res == RE_SUCCESS) {
648    int capture_count =
649        IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
650    return SetLastMatchInfo(
651        last_match_info, subject, capture_count, output_registers);
652  }
653  if (res == RE_EXCEPTION) {
654    DCHECK(isolate->has_pending_exception());
655    return MaybeHandle<Object>();
656  }
657  DCHECK(res == RE_FAILURE);
658  return isolate->factory()->null_value();
659}
660
661
662Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
663                                             Handle<String> subject,
664                                             int capture_count,
665                                             int32_t* match) {
666  DCHECK(last_match_info->HasFastObjectElements());
667  int capture_register_count = (capture_count + 1) * 2;
668  JSArray::EnsureSize(last_match_info,
669                      capture_register_count + kLastMatchOverhead);
670  DisallowHeapAllocation no_allocation;
671  FixedArray* array = FixedArray::cast(last_match_info->elements());
672  if (match != NULL) {
673    for (int i = 0; i < capture_register_count; i += 2) {
674      SetCapture(array, i, match[i]);
675      SetCapture(array, i + 1, match[i + 1]);
676    }
677  }
678  SetLastCaptureCount(array, capture_register_count);
679  SetLastSubject(array, *subject);
680  SetLastInput(array, *subject);
681  return last_match_info;
682}
683
684
685RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
686                                     Handle<String> subject,
687                                     bool is_global,
688                                     Isolate* isolate)
689  : register_array_(NULL),
690    register_array_size_(0),
691    regexp_(regexp),
692    subject_(subject) {
693#ifdef V8_INTERPRETED_REGEXP
694  bool interpreted = true;
695#else
696  bool interpreted = false;
697#endif  // V8_INTERPRETED_REGEXP
698
699  if (regexp_->TypeTag() == JSRegExp::ATOM) {
700    static const int kAtomRegistersPerMatch = 2;
701    registers_per_match_ = kAtomRegistersPerMatch;
702    // There is no distinction between interpreted and native for atom regexps.
703    interpreted = false;
704  } else {
705    registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
706    if (registers_per_match_ < 0) {
707      num_matches_ = -1;  // Signal exception.
708      return;
709    }
710  }
711
712  if (is_global && !interpreted) {
713    register_array_size_ =
714        Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
715    max_matches_ = register_array_size_ / registers_per_match_;
716  } else {
717    // Global loop in interpreted regexp is not implemented.  We choose
718    // the size of the offsets vector so that it can only store one match.
719    register_array_size_ = registers_per_match_;
720    max_matches_ = 1;
721  }
722
723  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
724    register_array_ = NewArray<int32_t>(register_array_size_);
725  } else {
726    register_array_ = isolate->jsregexp_static_offsets_vector();
727  }
728
729  // Set state so that fetching the results the first time triggers a call
730  // to the compiled regexp.
731  current_match_index_ = max_matches_ - 1;
732  num_matches_ = max_matches_;
733  DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
734  DCHECK_GE(register_array_size_, registers_per_match_);
735  int32_t* last_match =
736      &register_array_[current_match_index_ * registers_per_match_];
737  last_match[0] = -1;
738  last_match[1] = 0;
739}
740
741
742// -------------------------------------------------------------------
743// Implementation of the Irregexp regular expression engine.
744//
745// The Irregexp regular expression engine is intended to be a complete
746// implementation of ECMAScript regular expressions.  It generates either
747// bytecodes or native code.
748
749//   The Irregexp regexp engine is structured in three steps.
750//   1) The parser generates an abstract syntax tree.  See ast.cc.
751//   2) From the AST a node network is created.  The nodes are all
752//      subclasses of RegExpNode.  The nodes represent states when
753//      executing a regular expression.  Several optimizations are
754//      performed on the node network.
755//   3) From the nodes we generate either byte codes or native code
756//      that can actually execute the regular expression (perform
757//      the search).  The code generation step is described in more
758//      detail below.
759
760// Code generation.
761//
762//   The nodes are divided into four main categories.
763//   * Choice nodes
764//        These represent places where the regular expression can
765//        match in more than one way.  For example on entry to an
766//        alternation (foo|bar) or a repetition (*, +, ? or {}).
767//   * Action nodes
768//        These represent places where some action should be
769//        performed.  Examples include recording the current position
770//        in the input string to a register (in order to implement
771//        captures) or other actions on register for example in order
772//        to implement the counters needed for {} repetitions.
773//   * Matching nodes
774//        These attempt to match some element part of the input string.
775//        Examples of elements include character classes, plain strings
776//        or back references.
777//   * End nodes
778//        These are used to implement the actions required on finding
779//        a successful match or failing to find a match.
780//
781//   The code generated (whether as byte codes or native code) maintains
782//   some state as it runs.  This consists of the following elements:
783//
784//   * The capture registers.  Used for string captures.
785//   * Other registers.  Used for counters etc.
786//   * The current position.
787//   * The stack of backtracking information.  Used when a matching node
788//     fails to find a match and needs to try an alternative.
789//
790// Conceptual regular expression execution model:
791//
792//   There is a simple conceptual model of regular expression execution
793//   which will be presented first.  The actual code generated is a more
794//   efficient simulation of the simple conceptual model:
795//
796//   * Choice nodes are implemented as follows:
797//     For each choice except the last {
798//       push current position
799//       push backtrack code location
800//       <generate code to test for choice>
801//       backtrack code location:
802//       pop current position
803//     }
804//     <generate code to test for last choice>
805//
806//   * Actions nodes are generated as follows
807//     <push affected registers on backtrack stack>
808//     <generate code to perform action>
809//     push backtrack code location
810//     <generate code to test for following nodes>
811//     backtrack code location:
812//     <pop affected registers to restore their state>
813//     <pop backtrack location from stack and go to it>
814//
815//   * Matching nodes are generated as follows:
816//     if input string matches at current position
817//       update current position
818//       <generate code to test for following nodes>
819//     else
820//       <pop backtrack location from stack and go to it>
821//
822//   Thus it can be seen that the current position is saved and restored
823//   by the choice nodes, whereas the registers are saved and restored by
824//   by the action nodes that manipulate them.
825//
826//   The other interesting aspect of this model is that nodes are generated
827//   at the point where they are needed by a recursive call to Emit().  If
828//   the node has already been code generated then the Emit() call will
829//   generate a jump to the previously generated code instead.  In order to
830//   limit recursion it is possible for the Emit() function to put the node
831//   on a work list for later generation and instead generate a jump.  The
832//   destination of the jump is resolved later when the code is generated.
833//
834// Actual regular expression code generation.
835//
836//   Code generation is actually more complicated than the above.  In order
837//   to improve the efficiency of the generated code some optimizations are
838//   performed
839//
840//   * Choice nodes have 1-character lookahead.
841//     A choice node looks at the following character and eliminates some of
842//     the choices immediately based on that character.  This is not yet
843//     implemented.
844//   * Simple greedy loops store reduced backtracking information.
845//     A quantifier like /.*foo/m will greedily match the whole input.  It will
846//     then need to backtrack to a point where it can match "foo".  The naive
847//     implementation of this would push each character position onto the
848//     backtracking stack, then pop them off one by one.  This would use space
849//     proportional to the length of the input string.  However since the "."
850//     can only match in one way and always has a constant length (in this case
851//     of 1) it suffices to store the current position on the top of the stack
852//     once.  Matching now becomes merely incrementing the current position and
853//     backtracking becomes decrementing the current position and checking the
854//     result against the stored current position.  This is faster and saves
855//     space.
856//   * The current state is virtualized.
857//     This is used to defer expensive operations until it is clear that they
858//     are needed and to generate code for a node more than once, allowing
859//     specialized an efficient versions of the code to be created. This is
860//     explained in the section below.
861//
862// Execution state virtualization.
863//
864//   Instead of emitting code, nodes that manipulate the state can record their
865//   manipulation in an object called the Trace.  The Trace object can record a
866//   current position offset, an optional backtrack code location on the top of
867//   the virtualized backtrack stack and some register changes.  When a node is
868//   to be emitted it can flush the Trace or update it.  Flushing the Trace
869//   will emit code to bring the actual state into line with the virtual state.
870//   Avoiding flushing the state can postpone some work (e.g. updates of capture
871//   registers).  Postponing work can save time when executing the regular
872//   expression since it may be found that the work never has to be done as a
873//   failure to match can occur.  In addition it is much faster to jump to a
874//   known backtrack code location than it is to pop an unknown backtrack
875//   location from the stack and jump there.
876//
877//   The virtual state found in the Trace affects code generation.  For example
878//   the virtual state contains the difference between the actual current
879//   position and the virtual current position, and matching code needs to use
880//   this offset to attempt a match in the correct location of the input
881//   string.  Therefore code generated for a non-trivial trace is specialized
882//   to that trace.  The code generator therefore has the ability to generate
883//   code for each node several times.  In order to limit the size of the
884//   generated code there is an arbitrary limit on how many specialized sets of
885//   code may be generated for a given node.  If the limit is reached, the
886//   trace is flushed and a generic version of the code for a node is emitted.
887//   This is subsequently used for that node.  The code emitted for non-generic
888//   trace is not recorded in the node and so it cannot currently be reused in
889//   the event that code generation is requested for an identical trace.
890
891
892void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
893  UNREACHABLE();
894}
895
896
897void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
898  text->AddElement(TextElement::Atom(this), zone);
899}
900
901
902void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
903  text->AddElement(TextElement::CharClass(this), zone);
904}
905
906
907void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
908  for (int i = 0; i < elements()->length(); i++)
909    text->AddElement(elements()->at(i), zone);
910}
911
912
913TextElement TextElement::Atom(RegExpAtom* atom) {
914  return TextElement(ATOM, atom);
915}
916
917
918TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
919  return TextElement(CHAR_CLASS, char_class);
920}
921
922
923int TextElement::length() const {
924  switch (text_type()) {
925    case ATOM:
926      return atom()->length();
927
928    case CHAR_CLASS:
929      return 1;
930  }
931  UNREACHABLE();
932  return 0;
933}
934
935
936DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
937  if (table_ == NULL) {
938    table_ = new(zone()) DispatchTable(zone());
939    DispatchTableConstructor cons(table_, ignore_case, zone());
940    cons.BuildTable(this);
941  }
942  return table_;
943}
944
945
946class FrequencyCollator {
947 public:
948  FrequencyCollator() : total_samples_(0) {
949    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
950      frequencies_[i] = CharacterFrequency(i);
951    }
952  }
953
954  void CountCharacter(int character) {
955    int index = (character & RegExpMacroAssembler::kTableMask);
956    frequencies_[index].Increment();
957    total_samples_++;
958  }
959
960  // Does not measure in percent, but rather per-128 (the table size from the
961  // regexp macro assembler).
962  int Frequency(int in_character) {
963    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
964    if (total_samples_ < 1) return 1;  // Division by zero.
965    int freq_in_per128 =
966        (frequencies_[in_character].counter() * 128) / total_samples_;
967    return freq_in_per128;
968  }
969
970 private:
971  class CharacterFrequency {
972   public:
973    CharacterFrequency() : counter_(0), character_(-1) { }
974    explicit CharacterFrequency(int character)
975        : counter_(0), character_(character) { }
976
977    void Increment() { counter_++; }
978    int counter() { return counter_; }
979    int character() { return character_; }
980
981   private:
982    int counter_;
983    int character_;
984  };
985
986
987 private:
988  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
989  int total_samples_;
990};
991
992
993class RegExpCompiler {
994 public:
995  RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte,
996                 Zone* zone);
997
998  int AllocateRegister() {
999    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1000      reg_exp_too_big_ = true;
1001      return next_register_;
1002    }
1003    return next_register_++;
1004  }
1005
1006  RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1007                                           RegExpNode* start,
1008                                           int capture_count,
1009                                           Handle<String> pattern);
1010
1011  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1012
1013  static const int kImplementationOffset = 0;
1014  static const int kNumberOfRegistersOffset = 0;
1015  static const int kCodeOffset = 1;
1016
1017  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1018  EndNode* accept() { return accept_; }
1019
1020  static const int kMaxRecursion = 100;
1021  inline int recursion_depth() { return recursion_depth_; }
1022  inline void IncrementRecursionDepth() { recursion_depth_++; }
1023  inline void DecrementRecursionDepth() { recursion_depth_--; }
1024
1025  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1026
1027  inline bool ignore_case() { return ignore_case_; }
1028  inline bool one_byte() { return one_byte_; }
1029  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1030
1031  int current_expansion_factor() { return current_expansion_factor_; }
1032  void set_current_expansion_factor(int value) {
1033    current_expansion_factor_ = value;
1034  }
1035
1036  Zone* zone() const { return zone_; }
1037
1038  static const int kNoRegister = -1;
1039
1040 private:
1041  EndNode* accept_;
1042  int next_register_;
1043  List<RegExpNode*>* work_list_;
1044  int recursion_depth_;
1045  RegExpMacroAssembler* macro_assembler_;
1046  bool ignore_case_;
1047  bool one_byte_;
1048  bool reg_exp_too_big_;
1049  int current_expansion_factor_;
1050  FrequencyCollator frequency_collator_;
1051  Zone* zone_;
1052};
1053
1054
1055class RecursionCheck {
1056 public:
1057  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1058    compiler->IncrementRecursionDepth();
1059  }
1060  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1061 private:
1062  RegExpCompiler* compiler_;
1063};
1064
1065
1066static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1067  return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1068}
1069
1070
1071// Attempts to compile the regexp using an Irregexp code generator.  Returns
1072// a fixed array or a null handle depending on whether it succeeded.
1073RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case,
1074                               bool one_byte, Zone* zone)
1075    : next_register_(2 * (capture_count + 1)),
1076      work_list_(NULL),
1077      recursion_depth_(0),
1078      ignore_case_(ignore_case),
1079      one_byte_(one_byte),
1080      reg_exp_too_big_(false),
1081      current_expansion_factor_(1),
1082      frequency_collator_(),
1083      zone_(zone) {
1084  accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1085  DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1086}
1087
1088
1089RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1090    RegExpMacroAssembler* macro_assembler,
1091    RegExpNode* start,
1092    int capture_count,
1093    Handle<String> pattern) {
1094  Heap* heap = pattern->GetHeap();
1095
1096  bool use_slow_safe_regexp_compiler = false;
1097  if (heap->total_regexp_code_generated() >
1098          RegExpImpl::kRegWxpCompiledLimit &&
1099      heap->isolate()->memory_allocator()->SizeExecutable() >
1100          RegExpImpl::kRegExpExecutableMemoryLimit) {
1101    use_slow_safe_regexp_compiler = true;
1102  }
1103
1104  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1105
1106#ifdef DEBUG
1107  if (FLAG_trace_regexp_assembler)
1108    macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1109  else
1110#endif
1111    macro_assembler_ = macro_assembler;
1112
1113  List <RegExpNode*> work_list(0);
1114  work_list_ = &work_list;
1115  Label fail;
1116  macro_assembler_->PushBacktrack(&fail);
1117  Trace new_trace;
1118  start->Emit(this, &new_trace);
1119  macro_assembler_->Bind(&fail);
1120  macro_assembler_->Fail();
1121  while (!work_list.is_empty()) {
1122    work_list.RemoveLast()->Emit(this, &new_trace);
1123  }
1124  if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
1125
1126  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1127  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1128  work_list_ = NULL;
1129#ifdef DEBUG
1130  if (FLAG_print_code) {
1131    CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1132    OFStream os(trace_scope.file());
1133    Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1134  }
1135  if (FLAG_trace_regexp_assembler) {
1136    delete macro_assembler_;
1137  }
1138#endif
1139  return RegExpEngine::CompilationResult(*code, next_register_);
1140}
1141
1142
1143bool Trace::DeferredAction::Mentions(int that) {
1144  if (action_type() == ActionNode::CLEAR_CAPTURES) {
1145    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1146    return range.Contains(that);
1147  } else {
1148    return reg() == that;
1149  }
1150}
1151
1152
1153bool Trace::mentions_reg(int reg) {
1154  for (DeferredAction* action = actions_;
1155       action != NULL;
1156       action = action->next()) {
1157    if (action->Mentions(reg))
1158      return true;
1159  }
1160  return false;
1161}
1162
1163
1164bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1165  DCHECK_EQ(0, *cp_offset);
1166  for (DeferredAction* action = actions_;
1167       action != NULL;
1168       action = action->next()) {
1169    if (action->Mentions(reg)) {
1170      if (action->action_type() == ActionNode::STORE_POSITION) {
1171        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1172        return true;
1173      } else {
1174        return false;
1175      }
1176    }
1177  }
1178  return false;
1179}
1180
1181
1182int Trace::FindAffectedRegisters(OutSet* affected_registers,
1183                                 Zone* zone) {
1184  int max_register = RegExpCompiler::kNoRegister;
1185  for (DeferredAction* action = actions_;
1186       action != NULL;
1187       action = action->next()) {
1188    if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1189      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1190      for (int i = range.from(); i <= range.to(); i++)
1191        affected_registers->Set(i, zone);
1192      if (range.to() > max_register) max_register = range.to();
1193    } else {
1194      affected_registers->Set(action->reg(), zone);
1195      if (action->reg() > max_register) max_register = action->reg();
1196    }
1197  }
1198  return max_register;
1199}
1200
1201
1202void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1203                                     int max_register,
1204                                     const OutSet& registers_to_pop,
1205                                     const OutSet& registers_to_clear) {
1206  for (int reg = max_register; reg >= 0; reg--) {
1207    if (registers_to_pop.Get(reg)) {
1208      assembler->PopRegister(reg);
1209    } else if (registers_to_clear.Get(reg)) {
1210      int clear_to = reg;
1211      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1212        reg--;
1213      }
1214      assembler->ClearRegisters(reg, clear_to);
1215    }
1216  }
1217}
1218
1219
1220void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1221                                   int max_register,
1222                                   const OutSet& affected_registers,
1223                                   OutSet* registers_to_pop,
1224                                   OutSet* registers_to_clear,
1225                                   Zone* zone) {
1226  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1227  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1228
1229  // Count pushes performed to force a stack limit check occasionally.
1230  int pushes = 0;
1231
1232  for (int reg = 0; reg <= max_register; reg++) {
1233    if (!affected_registers.Get(reg)) {
1234      continue;
1235    }
1236
1237    // The chronologically first deferred action in the trace
1238    // is used to infer the action needed to restore a register
1239    // to its previous state (or not, if it's safe to ignore it).
1240    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1241    DeferredActionUndoType undo_action = IGNORE;
1242
1243    int value = 0;
1244    bool absolute = false;
1245    bool clear = false;
1246    int store_position = -1;
1247    // This is a little tricky because we are scanning the actions in reverse
1248    // historical order (newest first).
1249    for (DeferredAction* action = actions_;
1250         action != NULL;
1251         action = action->next()) {
1252      if (action->Mentions(reg)) {
1253        switch (action->action_type()) {
1254          case ActionNode::SET_REGISTER: {
1255            Trace::DeferredSetRegister* psr =
1256                static_cast<Trace::DeferredSetRegister*>(action);
1257            if (!absolute) {
1258              value += psr->value();
1259              absolute = true;
1260            }
1261            // SET_REGISTER is currently only used for newly introduced loop
1262            // counters. They can have a significant previous value if they
1263            // occour in a loop. TODO(lrn): Propagate this information, so
1264            // we can set undo_action to IGNORE if we know there is no value to
1265            // restore.
1266            undo_action = RESTORE;
1267            DCHECK_EQ(store_position, -1);
1268            DCHECK(!clear);
1269            break;
1270          }
1271          case ActionNode::INCREMENT_REGISTER:
1272            if (!absolute) {
1273              value++;
1274            }
1275            DCHECK_EQ(store_position, -1);
1276            DCHECK(!clear);
1277            undo_action = RESTORE;
1278            break;
1279          case ActionNode::STORE_POSITION: {
1280            Trace::DeferredCapture* pc =
1281                static_cast<Trace::DeferredCapture*>(action);
1282            if (!clear && store_position == -1) {
1283              store_position = pc->cp_offset();
1284            }
1285
1286            // For captures we know that stores and clears alternate.
1287            // Other register, are never cleared, and if the occur
1288            // inside a loop, they might be assigned more than once.
1289            if (reg <= 1) {
1290              // Registers zero and one, aka "capture zero", is
1291              // always set correctly if we succeed. There is no
1292              // need to undo a setting on backtrack, because we
1293              // will set it again or fail.
1294              undo_action = IGNORE;
1295            } else {
1296              undo_action = pc->is_capture() ? CLEAR : RESTORE;
1297            }
1298            DCHECK(!absolute);
1299            DCHECK_EQ(value, 0);
1300            break;
1301          }
1302          case ActionNode::CLEAR_CAPTURES: {
1303            // Since we're scanning in reverse order, if we've already
1304            // set the position we have to ignore historically earlier
1305            // clearing operations.
1306            if (store_position == -1) {
1307              clear = true;
1308            }
1309            undo_action = RESTORE;
1310            DCHECK(!absolute);
1311            DCHECK_EQ(value, 0);
1312            break;
1313          }
1314          default:
1315            UNREACHABLE();
1316            break;
1317        }
1318      }
1319    }
1320    // Prepare for the undo-action (e.g., push if it's going to be popped).
1321    if (undo_action == RESTORE) {
1322      pushes++;
1323      RegExpMacroAssembler::StackCheckFlag stack_check =
1324          RegExpMacroAssembler::kNoStackLimitCheck;
1325      if (pushes == push_limit) {
1326        stack_check = RegExpMacroAssembler::kCheckStackLimit;
1327        pushes = 0;
1328      }
1329
1330      assembler->PushRegister(reg, stack_check);
1331      registers_to_pop->Set(reg, zone);
1332    } else if (undo_action == CLEAR) {
1333      registers_to_clear->Set(reg, zone);
1334    }
1335    // Perform the chronologically last action (or accumulated increment)
1336    // for the register.
1337    if (store_position != -1) {
1338      assembler->WriteCurrentPositionToRegister(reg, store_position);
1339    } else if (clear) {
1340      assembler->ClearRegisters(reg, reg);
1341    } else if (absolute) {
1342      assembler->SetRegister(reg, value);
1343    } else if (value != 0) {
1344      assembler->AdvanceRegister(reg, value);
1345    }
1346  }
1347}
1348
1349
1350// This is called as we come into a loop choice node and some other tricky
1351// nodes.  It normalizes the state of the code generator to ensure we can
1352// generate generic code.
1353void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1354  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1355
1356  DCHECK(!is_trivial());
1357
1358  if (actions_ == NULL && backtrack() == NULL) {
1359    // Here we just have some deferred cp advances to fix and we are back to
1360    // a normal situation.  We may also have to forget some information gained
1361    // through a quick check that was already performed.
1362    if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1363    // Create a new trivial state and generate the node with that.
1364    Trace new_state;
1365    successor->Emit(compiler, &new_state);
1366    return;
1367  }
1368
1369  // Generate deferred actions here along with code to undo them again.
1370  OutSet affected_registers;
1371
1372  if (backtrack() != NULL) {
1373    // Here we have a concrete backtrack location.  These are set up by choice
1374    // nodes and so they indicate that we have a deferred save of the current
1375    // position which we may need to emit here.
1376    assembler->PushCurrentPosition();
1377  }
1378
1379  int max_register = FindAffectedRegisters(&affected_registers,
1380                                           compiler->zone());
1381  OutSet registers_to_pop;
1382  OutSet registers_to_clear;
1383  PerformDeferredActions(assembler,
1384                         max_register,
1385                         affected_registers,
1386                         &registers_to_pop,
1387                         &registers_to_clear,
1388                         compiler->zone());
1389  if (cp_offset_ != 0) {
1390    assembler->AdvanceCurrentPosition(cp_offset_);
1391  }
1392
1393  // Create a new trivial state and generate the node with that.
1394  Label undo;
1395  assembler->PushBacktrack(&undo);
1396  Trace new_state;
1397  successor->Emit(compiler, &new_state);
1398
1399  // On backtrack we need to restore state.
1400  assembler->Bind(&undo);
1401  RestoreAffectedRegisters(assembler,
1402                           max_register,
1403                           registers_to_pop,
1404                           registers_to_clear);
1405  if (backtrack() == NULL) {
1406    assembler->Backtrack();
1407  } else {
1408    assembler->PopCurrentPosition();
1409    assembler->GoTo(backtrack());
1410  }
1411}
1412
1413
1414void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1415  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1416
1417  // Omit flushing the trace. We discard the entire stack frame anyway.
1418
1419  if (!label()->is_bound()) {
1420    // We are completely independent of the trace, since we ignore it,
1421    // so this code can be used as the generic version.
1422    assembler->Bind(label());
1423  }
1424
1425  // Throw away everything on the backtrack stack since the start
1426  // of the negative submatch and restore the character position.
1427  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1428  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1429  if (clear_capture_count_ > 0) {
1430    // Clear any captures that might have been performed during the success
1431    // of the body of the negative look-ahead.
1432    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1433    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1434  }
1435  // Now that we have unwound the stack we find at the top of the stack the
1436  // backtrack that the BeginSubmatch node got.
1437  assembler->Backtrack();
1438}
1439
1440
1441void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1442  if (!trace->is_trivial()) {
1443    trace->Flush(compiler, this);
1444    return;
1445  }
1446  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1447  if (!label()->is_bound()) {
1448    assembler->Bind(label());
1449  }
1450  switch (action_) {
1451    case ACCEPT:
1452      assembler->Succeed();
1453      return;
1454    case BACKTRACK:
1455      assembler->GoTo(trace->backtrack());
1456      return;
1457    case NEGATIVE_SUBMATCH_SUCCESS:
1458      // This case is handled in a different virtual method.
1459      UNREACHABLE();
1460  }
1461  UNIMPLEMENTED();
1462}
1463
1464
1465void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1466  if (guards_ == NULL)
1467    guards_ = new(zone) ZoneList<Guard*>(1, zone);
1468  guards_->Add(guard, zone);
1469}
1470
1471
1472ActionNode* ActionNode::SetRegister(int reg,
1473                                    int val,
1474                                    RegExpNode* on_success) {
1475  ActionNode* result =
1476      new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1477  result->data_.u_store_register.reg = reg;
1478  result->data_.u_store_register.value = val;
1479  return result;
1480}
1481
1482
1483ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1484  ActionNode* result =
1485      new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1486  result->data_.u_increment_register.reg = reg;
1487  return result;
1488}
1489
1490
1491ActionNode* ActionNode::StorePosition(int reg,
1492                                      bool is_capture,
1493                                      RegExpNode* on_success) {
1494  ActionNode* result =
1495      new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1496  result->data_.u_position_register.reg = reg;
1497  result->data_.u_position_register.is_capture = is_capture;
1498  return result;
1499}
1500
1501
1502ActionNode* ActionNode::ClearCaptures(Interval range,
1503                                      RegExpNode* on_success) {
1504  ActionNode* result =
1505      new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1506  result->data_.u_clear_captures.range_from = range.from();
1507  result->data_.u_clear_captures.range_to = range.to();
1508  return result;
1509}
1510
1511
1512ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1513                                      int position_reg,
1514                                      RegExpNode* on_success) {
1515  ActionNode* result =
1516      new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1517  result->data_.u_submatch.stack_pointer_register = stack_reg;
1518  result->data_.u_submatch.current_position_register = position_reg;
1519  return result;
1520}
1521
1522
1523ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1524                                                int position_reg,
1525                                                int clear_register_count,
1526                                                int clear_register_from,
1527                                                RegExpNode* on_success) {
1528  ActionNode* result =
1529      new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1530  result->data_.u_submatch.stack_pointer_register = stack_reg;
1531  result->data_.u_submatch.current_position_register = position_reg;
1532  result->data_.u_submatch.clear_register_count = clear_register_count;
1533  result->data_.u_submatch.clear_register_from = clear_register_from;
1534  return result;
1535}
1536
1537
1538ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1539                                        int repetition_register,
1540                                        int repetition_limit,
1541                                        RegExpNode* on_success) {
1542  ActionNode* result =
1543      new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1544  result->data_.u_empty_match_check.start_register = start_register;
1545  result->data_.u_empty_match_check.repetition_register = repetition_register;
1546  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1547  return result;
1548}
1549
1550
1551#define DEFINE_ACCEPT(Type)                                          \
1552  void Type##Node::Accept(NodeVisitor* visitor) {                    \
1553    visitor->Visit##Type(this);                                      \
1554  }
1555FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1556#undef DEFINE_ACCEPT
1557
1558
1559void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1560  visitor->VisitLoopChoice(this);
1561}
1562
1563
1564// -------------------------------------------------------------------
1565// Emit code.
1566
1567
1568void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1569                               Guard* guard,
1570                               Trace* trace) {
1571  switch (guard->op()) {
1572    case Guard::LT:
1573      DCHECK(!trace->mentions_reg(guard->reg()));
1574      macro_assembler->IfRegisterGE(guard->reg(),
1575                                    guard->value(),
1576                                    trace->backtrack());
1577      break;
1578    case Guard::GEQ:
1579      DCHECK(!trace->mentions_reg(guard->reg()));
1580      macro_assembler->IfRegisterLT(guard->reg(),
1581                                    guard->value(),
1582                                    trace->backtrack());
1583      break;
1584  }
1585}
1586
1587
1588// Returns the number of characters in the equivalence class, omitting those
1589// that cannot occur in the source string because it is ASCII.
1590static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1591                                     bool one_byte_subject,
1592                                     unibrow::uchar* letters) {
1593  int length =
1594      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1595  // Unibrow returns 0 or 1 for characters where case independence is
1596  // trivial.
1597  if (length == 0) {
1598    letters[0] = character;
1599    length = 1;
1600  }
1601  if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {
1602    return length;
1603  }
1604
1605  // The standard requires that non-ASCII characters cannot have ASCII
1606  // character codes in their equivalence class.
1607  // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
1608  // is it?  For example, \u00C5 is equivalent to \u212B.
1609  return 0;
1610}
1611
1612
1613static inline bool EmitSimpleCharacter(Isolate* isolate,
1614                                       RegExpCompiler* compiler,
1615                                       uc16 c,
1616                                       Label* on_failure,
1617                                       int cp_offset,
1618                                       bool check,
1619                                       bool preloaded) {
1620  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1621  bool bound_checked = false;
1622  if (!preloaded) {
1623    assembler->LoadCurrentCharacter(
1624        cp_offset,
1625        on_failure,
1626        check);
1627    bound_checked = true;
1628  }
1629  assembler->CheckNotCharacter(c, on_failure);
1630  return bound_checked;
1631}
1632
1633
1634// Only emits non-letters (things that don't have case).  Only used for case
1635// independent matches.
1636static inline bool EmitAtomNonLetter(Isolate* isolate,
1637                                     RegExpCompiler* compiler,
1638                                     uc16 c,
1639                                     Label* on_failure,
1640                                     int cp_offset,
1641                                     bool check,
1642                                     bool preloaded) {
1643  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1644  bool one_byte = compiler->one_byte();
1645  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1646  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1647  if (length < 1) {
1648    // This can't match.  Must be an one-byte subject and a non-one-byte
1649    // character.  We do not need to do anything since the one-byte pass
1650    // already handled this.
1651    return false;  // Bounds not checked.
1652  }
1653  bool checked = false;
1654  // We handle the length > 1 case in a later pass.
1655  if (length == 1) {
1656    if (one_byte && c > String::kMaxOneByteCharCodeU) {
1657      // Can't match - see above.
1658      return false;  // Bounds not checked.
1659    }
1660    if (!preloaded) {
1661      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1662      checked = check;
1663    }
1664    macro_assembler->CheckNotCharacter(c, on_failure);
1665  }
1666  return checked;
1667}
1668
1669
1670static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1671                                      bool one_byte, uc16 c1, uc16 c2,
1672                                      Label* on_failure) {
1673  uc16 char_mask;
1674  if (one_byte) {
1675    char_mask = String::kMaxOneByteCharCode;
1676  } else {
1677    char_mask = String::kMaxUtf16CodeUnit;
1678  }
1679  uc16 exor = c1 ^ c2;
1680  // Check whether exor has only one bit set.
1681  if (((exor - 1) & exor) == 0) {
1682    // If c1 and c2 differ only by one bit.
1683    // Ecma262UnCanonicalize always gives the highest number last.
1684    DCHECK(c2 > c1);
1685    uc16 mask = char_mask ^ exor;
1686    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1687    return true;
1688  }
1689  DCHECK(c2 > c1);
1690  uc16 diff = c2 - c1;
1691  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1692    // If the characters differ by 2^n but don't differ by one bit then
1693    // subtract the difference from the found character, then do the or
1694    // trick.  We avoid the theoretical case where negative numbers are
1695    // involved in order to simplify code generation.
1696    uc16 mask = char_mask ^ diff;
1697    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1698                                                    diff,
1699                                                    mask,
1700                                                    on_failure);
1701    return true;
1702  }
1703  return false;
1704}
1705
1706
1707typedef bool EmitCharacterFunction(Isolate* isolate,
1708                                   RegExpCompiler* compiler,
1709                                   uc16 c,
1710                                   Label* on_failure,
1711                                   int cp_offset,
1712                                   bool check,
1713                                   bool preloaded);
1714
1715// Only emits letters (things that have case).  Only used for case independent
1716// matches.
1717static inline bool EmitAtomLetter(Isolate* isolate,
1718                                  RegExpCompiler* compiler,
1719                                  uc16 c,
1720                                  Label* on_failure,
1721                                  int cp_offset,
1722                                  bool check,
1723                                  bool preloaded) {
1724  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1725  bool one_byte = compiler->one_byte();
1726  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1727  int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1728  if (length <= 1) return false;
1729  // We may not need to check against the end of the input string
1730  // if this character lies before a character that matched.
1731  if (!preloaded) {
1732    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1733  }
1734  Label ok;
1735  DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1736  switch (length) {
1737    case 2: {
1738      if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1739                                    chars[1], on_failure)) {
1740      } else {
1741        macro_assembler->CheckCharacter(chars[0], &ok);
1742        macro_assembler->CheckNotCharacter(chars[1], on_failure);
1743        macro_assembler->Bind(&ok);
1744      }
1745      break;
1746    }
1747    case 4:
1748      macro_assembler->CheckCharacter(chars[3], &ok);
1749      // Fall through!
1750    case 3:
1751      macro_assembler->CheckCharacter(chars[0], &ok);
1752      macro_assembler->CheckCharacter(chars[1], &ok);
1753      macro_assembler->CheckNotCharacter(chars[2], on_failure);
1754      macro_assembler->Bind(&ok);
1755      break;
1756    default:
1757      UNREACHABLE();
1758      break;
1759  }
1760  return true;
1761}
1762
1763
1764static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1765                             int border,
1766                             Label* fall_through,
1767                             Label* above_or_equal,
1768                             Label* below) {
1769  if (below != fall_through) {
1770    masm->CheckCharacterLT(border, below);
1771    if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1772  } else {
1773    masm->CheckCharacterGT(border - 1, above_or_equal);
1774  }
1775}
1776
1777
1778static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1779                                   int first,
1780                                   int last,
1781                                   Label* fall_through,
1782                                   Label* in_range,
1783                                   Label* out_of_range) {
1784  if (in_range == fall_through) {
1785    if (first == last) {
1786      masm->CheckNotCharacter(first, out_of_range);
1787    } else {
1788      masm->CheckCharacterNotInRange(first, last, out_of_range);
1789    }
1790  } else {
1791    if (first == last) {
1792      masm->CheckCharacter(first, in_range);
1793    } else {
1794      masm->CheckCharacterInRange(first, last, in_range);
1795    }
1796    if (out_of_range != fall_through) masm->GoTo(out_of_range);
1797  }
1798}
1799
1800
1801// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1802// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1803static void EmitUseLookupTable(
1804    RegExpMacroAssembler* masm,
1805    ZoneList<int>* ranges,
1806    int start_index,
1807    int end_index,
1808    int min_char,
1809    Label* fall_through,
1810    Label* even_label,
1811    Label* odd_label) {
1812  static const int kSize = RegExpMacroAssembler::kTableSize;
1813  static const int kMask = RegExpMacroAssembler::kTableMask;
1814
1815  int base = (min_char & ~kMask);
1816  USE(base);
1817
1818  // Assert that everything is on one kTableSize page.
1819  for (int i = start_index; i <= end_index; i++) {
1820    DCHECK_EQ(ranges->at(i) & ~kMask, base);
1821  }
1822  DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1823
1824  char templ[kSize];
1825  Label* on_bit_set;
1826  Label* on_bit_clear;
1827  int bit;
1828  if (even_label == fall_through) {
1829    on_bit_set = odd_label;
1830    on_bit_clear = even_label;
1831    bit = 1;
1832  } else {
1833    on_bit_set = even_label;
1834    on_bit_clear = odd_label;
1835    bit = 0;
1836  }
1837  for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1838    templ[i] = bit;
1839  }
1840  int j = 0;
1841  bit ^= 1;
1842  for (int i = start_index; i < end_index; i++) {
1843    for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1844      templ[j] = bit;
1845    }
1846    bit ^= 1;
1847  }
1848  for (int i = j; i < kSize; i++) {
1849    templ[i] = bit;
1850  }
1851  Factory* factory = masm->zone()->isolate()->factory();
1852  // TODO(erikcorry): Cache these.
1853  Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1854  for (int i = 0; i < kSize; i++) {
1855    ba->set(i, templ[i]);
1856  }
1857  masm->CheckBitInTable(ba, on_bit_set);
1858  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1859}
1860
1861
1862static void CutOutRange(RegExpMacroAssembler* masm,
1863                        ZoneList<int>* ranges,
1864                        int start_index,
1865                        int end_index,
1866                        int cut_index,
1867                        Label* even_label,
1868                        Label* odd_label) {
1869  bool odd = (((cut_index - start_index) & 1) == 1);
1870  Label* in_range_label = odd ? odd_label : even_label;
1871  Label dummy;
1872  EmitDoubleBoundaryTest(masm,
1873                         ranges->at(cut_index),
1874                         ranges->at(cut_index + 1) - 1,
1875                         &dummy,
1876                         in_range_label,
1877                         &dummy);
1878  DCHECK(!dummy.is_linked());
1879  // Cut out the single range by rewriting the array.  This creates a new
1880  // range that is a merger of the two ranges on either side of the one we
1881  // are cutting out.  The oddity of the labels is preserved.
1882  for (int j = cut_index; j > start_index; j--) {
1883    ranges->at(j) = ranges->at(j - 1);
1884  }
1885  for (int j = cut_index + 1; j < end_index; j++) {
1886    ranges->at(j) = ranges->at(j + 1);
1887  }
1888}
1889
1890
1891// Unicode case.  Split the search space into kSize spaces that are handled
1892// with recursion.
1893static void SplitSearchSpace(ZoneList<int>* ranges,
1894                             int start_index,
1895                             int end_index,
1896                             int* new_start_index,
1897                             int* new_end_index,
1898                             int* border) {
1899  static const int kSize = RegExpMacroAssembler::kTableSize;
1900  static const int kMask = RegExpMacroAssembler::kTableMask;
1901
1902  int first = ranges->at(start_index);
1903  int last = ranges->at(end_index) - 1;
1904
1905  *new_start_index = start_index;
1906  *border = (ranges->at(start_index) & ~kMask) + kSize;
1907  while (*new_start_index < end_index) {
1908    if (ranges->at(*new_start_index) > *border) break;
1909    (*new_start_index)++;
1910  }
1911  // new_start_index is the index of the first edge that is beyond the
1912  // current kSize space.
1913
1914  // For very large search spaces we do a binary chop search of the non-Latin1
1915  // space instead of just going to the end of the current kSize space.  The
1916  // heuristics are complicated a little by the fact that any 128-character
1917  // encoding space can be quickly tested with a table lookup, so we don't
1918  // wish to do binary chop search at a smaller granularity than that.  A
1919  // 128-character space can take up a lot of space in the ranges array if,
1920  // for example, we only want to match every second character (eg. the lower
1921  // case characters on some Unicode pages).
1922  int binary_chop_index = (end_index + start_index) / 2;
1923  // The first test ensures that we get to the code that handles the Latin1
1924  // range with a single not-taken branch, speeding up this important
1925  // character range (even non-Latin1 charset-based text has spaces and
1926  // punctuation).
1927  if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1928      end_index - start_index > (*new_start_index - start_index) * 2 &&
1929      last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1930      ranges->at(binary_chop_index) >= first + 2 * kSize) {
1931    int scan_forward_for_section_border = binary_chop_index;;
1932    int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1933
1934    while (scan_forward_for_section_border < end_index) {
1935      if (ranges->at(scan_forward_for_section_border) > new_border) {
1936        *new_start_index = scan_forward_for_section_border;
1937        *border = new_border;
1938        break;
1939      }
1940      scan_forward_for_section_border++;
1941    }
1942  }
1943
1944  DCHECK(*new_start_index > start_index);
1945  *new_end_index = *new_start_index - 1;
1946  if (ranges->at(*new_end_index) == *border) {
1947    (*new_end_index)--;
1948  }
1949  if (*border >= ranges->at(end_index)) {
1950    *border = ranges->at(end_index);
1951    *new_start_index = end_index;  // Won't be used.
1952    *new_end_index = end_index - 1;
1953  }
1954}
1955
1956
1957// Gets a series of segment boundaries representing a character class.  If the
1958// character is in the range between an even and an odd boundary (counting from
1959// start_index) then go to even_label, otherwise go to odd_label.  We already
1960// know that the character is in the range of min_char to max_char inclusive.
1961// Either label can be NULL indicating backtracking.  Either label can also be
1962// equal to the fall_through label.
1963static void GenerateBranches(RegExpMacroAssembler* masm,
1964                             ZoneList<int>* ranges,
1965                             int start_index,
1966                             int end_index,
1967                             uc16 min_char,
1968                             uc16 max_char,
1969                             Label* fall_through,
1970                             Label* even_label,
1971                             Label* odd_label) {
1972  int first = ranges->at(start_index);
1973  int last = ranges->at(end_index) - 1;
1974
1975  DCHECK_LT(min_char, first);
1976
1977  // Just need to test if the character is before or on-or-after
1978  // a particular character.
1979  if (start_index == end_index) {
1980    EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1981    return;
1982  }
1983
1984  // Another almost trivial case:  There is one interval in the middle that is
1985  // different from the end intervals.
1986  if (start_index + 1 == end_index) {
1987    EmitDoubleBoundaryTest(
1988        masm, first, last, fall_through, even_label, odd_label);
1989    return;
1990  }
1991
1992  // It's not worth using table lookup if there are very few intervals in the
1993  // character class.
1994  if (end_index - start_index <= 6) {
1995    // It is faster to test for individual characters, so we look for those
1996    // first, then try arbitrary ranges in the second round.
1997    static int kNoCutIndex = -1;
1998    int cut = kNoCutIndex;
1999    for (int i = start_index; i < end_index; i++) {
2000      if (ranges->at(i) == ranges->at(i + 1) - 1) {
2001        cut = i;
2002        break;
2003      }
2004    }
2005    if (cut == kNoCutIndex) cut = start_index;
2006    CutOutRange(
2007        masm, ranges, start_index, end_index, cut, even_label, odd_label);
2008    DCHECK_GE(end_index - start_index, 2);
2009    GenerateBranches(masm,
2010                     ranges,
2011                     start_index + 1,
2012                     end_index - 1,
2013                     min_char,
2014                     max_char,
2015                     fall_through,
2016                     even_label,
2017                     odd_label);
2018    return;
2019  }
2020
2021  // If there are a lot of intervals in the regexp, then we will use tables to
2022  // determine whether the character is inside or outside the character class.
2023  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2024
2025  if ((max_char >> kBits) == (min_char >> kBits)) {
2026    EmitUseLookupTable(masm,
2027                       ranges,
2028                       start_index,
2029                       end_index,
2030                       min_char,
2031                       fall_through,
2032                       even_label,
2033                       odd_label);
2034    return;
2035  }
2036
2037  if ((min_char >> kBits) != (first >> kBits)) {
2038    masm->CheckCharacterLT(first, odd_label);
2039    GenerateBranches(masm,
2040                     ranges,
2041                     start_index + 1,
2042                     end_index,
2043                     first,
2044                     max_char,
2045                     fall_through,
2046                     odd_label,
2047                     even_label);
2048    return;
2049  }
2050
2051  int new_start_index = 0;
2052  int new_end_index = 0;
2053  int border = 0;
2054
2055  SplitSearchSpace(ranges,
2056                   start_index,
2057                   end_index,
2058                   &new_start_index,
2059                   &new_end_index,
2060                   &border);
2061
2062  Label handle_rest;
2063  Label* above = &handle_rest;
2064  if (border == last + 1) {
2065    // We didn't find any section that started after the limit, so everything
2066    // above the border is one of the terminal labels.
2067    above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2068    DCHECK(new_end_index == end_index - 1);
2069  }
2070
2071  DCHECK_LE(start_index, new_end_index);
2072  DCHECK_LE(new_start_index, end_index);
2073  DCHECK_LT(start_index, new_start_index);
2074  DCHECK_LT(new_end_index, end_index);
2075  DCHECK(new_end_index + 1 == new_start_index ||
2076         (new_end_index + 2 == new_start_index &&
2077          border == ranges->at(new_end_index + 1)));
2078  DCHECK_LT(min_char, border - 1);
2079  DCHECK_LT(border, max_char);
2080  DCHECK_LT(ranges->at(new_end_index), border);
2081  DCHECK(border < ranges->at(new_start_index) ||
2082         (border == ranges->at(new_start_index) &&
2083          new_start_index == end_index &&
2084          new_end_index == end_index - 1 &&
2085          border == last + 1));
2086  DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2087
2088  masm->CheckCharacterGT(border - 1, above);
2089  Label dummy;
2090  GenerateBranches(masm,
2091                   ranges,
2092                   start_index,
2093                   new_end_index,
2094                   min_char,
2095                   border - 1,
2096                   &dummy,
2097                   even_label,
2098                   odd_label);
2099  if (handle_rest.is_linked()) {
2100    masm->Bind(&handle_rest);
2101    bool flip = (new_start_index & 1) != (start_index & 1);
2102    GenerateBranches(masm,
2103                     ranges,
2104                     new_start_index,
2105                     end_index,
2106                     border,
2107                     max_char,
2108                     &dummy,
2109                     flip ? odd_label : even_label,
2110                     flip ? even_label : odd_label);
2111  }
2112}
2113
2114
2115static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2116                          RegExpCharacterClass* cc, bool one_byte,
2117                          Label* on_failure, int cp_offset, bool check_offset,
2118                          bool preloaded, Zone* zone) {
2119  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2120  if (!CharacterRange::IsCanonical(ranges)) {
2121    CharacterRange::Canonicalize(ranges);
2122  }
2123
2124  int max_char;
2125  if (one_byte) {
2126    max_char = String::kMaxOneByteCharCode;
2127  } else {
2128    max_char = String::kMaxUtf16CodeUnit;
2129  }
2130
2131  int range_count = ranges->length();
2132
2133  int last_valid_range = range_count - 1;
2134  while (last_valid_range >= 0) {
2135    CharacterRange& range = ranges->at(last_valid_range);
2136    if (range.from() <= max_char) {
2137      break;
2138    }
2139    last_valid_range--;
2140  }
2141
2142  if (last_valid_range < 0) {
2143    if (!cc->is_negated()) {
2144      macro_assembler->GoTo(on_failure);
2145    }
2146    if (check_offset) {
2147      macro_assembler->CheckPosition(cp_offset, on_failure);
2148    }
2149    return;
2150  }
2151
2152  if (last_valid_range == 0 &&
2153      ranges->at(0).IsEverything(max_char)) {
2154    if (cc->is_negated()) {
2155      macro_assembler->GoTo(on_failure);
2156    } else {
2157      // This is a common case hit by non-anchored expressions.
2158      if (check_offset) {
2159        macro_assembler->CheckPosition(cp_offset, on_failure);
2160      }
2161    }
2162    return;
2163  }
2164  if (last_valid_range == 0 &&
2165      !cc->is_negated() &&
2166      ranges->at(0).IsEverything(max_char)) {
2167    // This is a common case hit by non-anchored expressions.
2168    if (check_offset) {
2169      macro_assembler->CheckPosition(cp_offset, on_failure);
2170    }
2171    return;
2172  }
2173
2174  if (!preloaded) {
2175    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2176  }
2177
2178  if (cc->is_standard(zone) &&
2179        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2180                                                    on_failure)) {
2181      return;
2182  }
2183
2184
2185  // A new list with ascending entries.  Each entry is a code unit
2186  // where there is a boundary between code units that are part of
2187  // the class and code units that are not.  Normally we insert an
2188  // entry at zero which goes to the failure label, but if there
2189  // was already one there we fall through for success on that entry.
2190  // Subsequent entries have alternating meaning (success/failure).
2191  ZoneList<int>* range_boundaries =
2192      new(zone) ZoneList<int>(last_valid_range, zone);
2193
2194  bool zeroth_entry_is_failure = !cc->is_negated();
2195
2196  for (int i = 0; i <= last_valid_range; i++) {
2197    CharacterRange& range = ranges->at(i);
2198    if (range.from() == 0) {
2199      DCHECK_EQ(i, 0);
2200      zeroth_entry_is_failure = !zeroth_entry_is_failure;
2201    } else {
2202      range_boundaries->Add(range.from(), zone);
2203    }
2204    range_boundaries->Add(range.to() + 1, zone);
2205  }
2206  int end_index = range_boundaries->length() - 1;
2207  if (range_boundaries->at(end_index) > max_char) {
2208    end_index--;
2209  }
2210
2211  Label fall_through;
2212  GenerateBranches(macro_assembler,
2213                   range_boundaries,
2214                   0,  // start_index.
2215                   end_index,
2216                   0,  // min_char.
2217                   max_char,
2218                   &fall_through,
2219                   zeroth_entry_is_failure ? &fall_through : on_failure,
2220                   zeroth_entry_is_failure ? on_failure : &fall_through);
2221  macro_assembler->Bind(&fall_through);
2222}
2223
2224
2225RegExpNode::~RegExpNode() {
2226}
2227
2228
2229RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2230                                                  Trace* trace) {
2231  // If we are generating a greedy loop then don't stop and don't reuse code.
2232  if (trace->stop_node() != NULL) {
2233    return CONTINUE;
2234  }
2235
2236  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2237  if (trace->is_trivial()) {
2238    if (label_.is_bound()) {
2239      // We are being asked to generate a generic version, but that's already
2240      // been done so just go to it.
2241      macro_assembler->GoTo(&label_);
2242      return DONE;
2243    }
2244    if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2245      // To avoid too deep recursion we push the node to the work queue and just
2246      // generate a goto here.
2247      compiler->AddWork(this);
2248      macro_assembler->GoTo(&label_);
2249      return DONE;
2250    }
2251    // Generate generic version of the node and bind the label for later use.
2252    macro_assembler->Bind(&label_);
2253    return CONTINUE;
2254  }
2255
2256  // We are being asked to make a non-generic version.  Keep track of how many
2257  // non-generic versions we generate so as not to overdo it.
2258  trace_count_++;
2259  if (FLAG_regexp_optimization &&
2260      trace_count_ < kMaxCopiesCodeGenerated &&
2261      compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2262    return CONTINUE;
2263  }
2264
2265  // If we get here code has been generated for this node too many times or
2266  // recursion is too deep.  Time to switch to a generic version.  The code for
2267  // generic versions above can handle deep recursion properly.
2268  trace->Flush(compiler, this);
2269  return DONE;
2270}
2271
2272
2273int ActionNode::EatsAtLeast(int still_to_find,
2274                            int budget,
2275                            bool not_at_start) {
2276  if (budget <= 0) return 0;
2277  if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2278  return on_success()->EatsAtLeast(still_to_find,
2279                                   budget - 1,
2280                                   not_at_start);
2281}
2282
2283
2284void ActionNode::FillInBMInfo(int offset,
2285                              int budget,
2286                              BoyerMooreLookahead* bm,
2287                              bool not_at_start) {
2288  if (action_type_ == BEGIN_SUBMATCH) {
2289    bm->SetRest(offset);
2290  } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2291    on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2292  }
2293  SaveBMInfo(bm, not_at_start, offset);
2294}
2295
2296
2297int AssertionNode::EatsAtLeast(int still_to_find,
2298                               int budget,
2299                               bool not_at_start) {
2300  if (budget <= 0) return 0;
2301  // If we know we are not at the start and we are asked "how many characters
2302  // will you match if you succeed?" then we can answer anything since false
2303  // implies false.  So lets just return the max answer (still_to_find) since
2304  // that won't prevent us from preloading a lot of characters for the other
2305  // branches in the node graph.
2306  if (assertion_type() == AT_START && not_at_start) return still_to_find;
2307  return on_success()->EatsAtLeast(still_to_find,
2308                                   budget - 1,
2309                                   not_at_start);
2310}
2311
2312
2313void AssertionNode::FillInBMInfo(int offset,
2314                                 int budget,
2315                                 BoyerMooreLookahead* bm,
2316                                 bool not_at_start) {
2317  // Match the behaviour of EatsAtLeast on this node.
2318  if (assertion_type() == AT_START && not_at_start) return;
2319  on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2320  SaveBMInfo(bm, not_at_start, offset);
2321}
2322
2323
2324int BackReferenceNode::EatsAtLeast(int still_to_find,
2325                                   int budget,
2326                                   bool not_at_start) {
2327  if (budget <= 0) return 0;
2328  return on_success()->EatsAtLeast(still_to_find,
2329                                   budget - 1,
2330                                   not_at_start);
2331}
2332
2333
2334int TextNode::EatsAtLeast(int still_to_find,
2335                          int budget,
2336                          bool not_at_start) {
2337  int answer = Length();
2338  if (answer >= still_to_find) return answer;
2339  if (budget <= 0) return answer;
2340  // We are not at start after this node so we set the last argument to 'true'.
2341  return answer + on_success()->EatsAtLeast(still_to_find - answer,
2342                                            budget - 1,
2343                                            true);
2344}
2345
2346
2347int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
2348                                             int budget,
2349                                             bool not_at_start) {
2350  if (budget <= 0) return 0;
2351  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2352  // afterwards.
2353  RegExpNode* node = alternatives_->at(1).node();
2354  return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2355}
2356
2357
2358void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2359    QuickCheckDetails* details,
2360    RegExpCompiler* compiler,
2361    int filled_in,
2362    bool not_at_start) {
2363  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2364  // afterwards.
2365  RegExpNode* node = alternatives_->at(1).node();
2366  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2367}
2368
2369
2370int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2371                                  int budget,
2372                                  RegExpNode* ignore_this_node,
2373                                  bool not_at_start) {
2374  if (budget <= 0) return 0;
2375  int min = 100;
2376  int choice_count = alternatives_->length();
2377  budget = (budget - 1) / choice_count;
2378  for (int i = 0; i < choice_count; i++) {
2379    RegExpNode* node = alternatives_->at(i).node();
2380    if (node == ignore_this_node) continue;
2381    int node_eats_at_least =
2382        node->EatsAtLeast(still_to_find, budget, not_at_start);
2383    if (node_eats_at_least < min) min = node_eats_at_least;
2384    if (min == 0) return 0;
2385  }
2386  return min;
2387}
2388
2389
2390int LoopChoiceNode::EatsAtLeast(int still_to_find,
2391                                int budget,
2392                                bool not_at_start) {
2393  return EatsAtLeastHelper(still_to_find,
2394                           budget - 1,
2395                           loop_node_,
2396                           not_at_start);
2397}
2398
2399
2400int ChoiceNode::EatsAtLeast(int still_to_find,
2401                            int budget,
2402                            bool not_at_start) {
2403  return EatsAtLeastHelper(still_to_find,
2404                           budget,
2405                           NULL,
2406                           not_at_start);
2407}
2408
2409
2410// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2411static inline uint32_t SmearBitsRight(uint32_t v) {
2412  v |= v >> 1;
2413  v |= v >> 2;
2414  v |= v >> 4;
2415  v |= v >> 8;
2416  v |= v >> 16;
2417  return v;
2418}
2419
2420
2421bool QuickCheckDetails::Rationalize(bool asc) {
2422  bool found_useful_op = false;
2423  uint32_t char_mask;
2424  if (asc) {
2425    char_mask = String::kMaxOneByteCharCode;
2426  } else {
2427    char_mask = String::kMaxUtf16CodeUnit;
2428  }
2429  mask_ = 0;
2430  value_ = 0;
2431  int char_shift = 0;
2432  for (int i = 0; i < characters_; i++) {
2433    Position* pos = &positions_[i];
2434    if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2435      found_useful_op = true;
2436    }
2437    mask_ |= (pos->mask & char_mask) << char_shift;
2438    value_ |= (pos->value & char_mask) << char_shift;
2439    char_shift += asc ? 8 : 16;
2440  }
2441  return found_useful_op;
2442}
2443
2444
2445bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2446                                Trace* bounds_check_trace,
2447                                Trace* trace,
2448                                bool preload_has_checked_bounds,
2449                                Label* on_possible_success,
2450                                QuickCheckDetails* details,
2451                                bool fall_through_on_failure) {
2452  if (details->characters() == 0) return false;
2453  GetQuickCheckDetails(
2454      details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2455  if (details->cannot_match()) return false;
2456  if (!details->Rationalize(compiler->one_byte())) return false;
2457  DCHECK(details->characters() == 1 ||
2458         compiler->macro_assembler()->CanReadUnaligned());
2459  uint32_t mask = details->mask();
2460  uint32_t value = details->value();
2461
2462  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2463
2464  if (trace->characters_preloaded() != details->characters()) {
2465    DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2466    // We are attempting to preload the minimum number of characters
2467    // any choice would eat, so if the bounds check fails, then none of the
2468    // choices can succeed, so we can just immediately backtrack, rather
2469    // than go to the next choice.
2470    assembler->LoadCurrentCharacter(trace->cp_offset(),
2471                                    bounds_check_trace->backtrack(),
2472                                    !preload_has_checked_bounds,
2473                                    details->characters());
2474  }
2475
2476
2477  bool need_mask = true;
2478
2479  if (details->characters() == 1) {
2480    // If number of characters preloaded is 1 then we used a byte or 16 bit
2481    // load so the value is already masked down.
2482    uint32_t char_mask;
2483    if (compiler->one_byte()) {
2484      char_mask = String::kMaxOneByteCharCode;
2485    } else {
2486      char_mask = String::kMaxUtf16CodeUnit;
2487    }
2488    if ((mask & char_mask) == char_mask) need_mask = false;
2489    mask &= char_mask;
2490  } else {
2491    // For 2-character preloads in one-byte mode or 1-character preloads in
2492    // two-byte mode we also use a 16 bit load with zero extend.
2493    if (details->characters() == 2 && compiler->one_byte()) {
2494      if ((mask & 0xffff) == 0xffff) need_mask = false;
2495    } else if (details->characters() == 1 && !compiler->one_byte()) {
2496      if ((mask & 0xffff) == 0xffff) need_mask = false;
2497    } else {
2498      if (mask == 0xffffffff) need_mask = false;
2499    }
2500  }
2501
2502  if (fall_through_on_failure) {
2503    if (need_mask) {
2504      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2505    } else {
2506      assembler->CheckCharacter(value, on_possible_success);
2507    }
2508  } else {
2509    if (need_mask) {
2510      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2511    } else {
2512      assembler->CheckNotCharacter(value, trace->backtrack());
2513    }
2514  }
2515  return true;
2516}
2517
2518
2519// Here is the meat of GetQuickCheckDetails (see also the comment on the
2520// super-class in the .h file).
2521//
2522// We iterate along the text object, building up for each character a
2523// mask and value that can be used to test for a quick failure to match.
2524// The masks and values for the positions will be combined into a single
2525// machine word for the current character width in order to be used in
2526// generating a quick check.
2527void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2528                                    RegExpCompiler* compiler,
2529                                    int characters_filled_in,
2530                                    bool not_at_start) {
2531  Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2532  DCHECK(characters_filled_in < details->characters());
2533  int characters = details->characters();
2534  int char_mask;
2535  if (compiler->one_byte()) {
2536    char_mask = String::kMaxOneByteCharCode;
2537  } else {
2538    char_mask = String::kMaxUtf16CodeUnit;
2539  }
2540  for (int k = 0; k < elms_->length(); k++) {
2541    TextElement elm = elms_->at(k);
2542    if (elm.text_type() == TextElement::ATOM) {
2543      Vector<const uc16> quarks = elm.atom()->data();
2544      for (int i = 0; i < characters && i < quarks.length(); i++) {
2545        QuickCheckDetails::Position* pos =
2546            details->positions(characters_filled_in);
2547        uc16 c = quarks[i];
2548        if (c > char_mask) {
2549          // If we expect a non-Latin1 character from an one-byte string,
2550          // there is no way we can match. Not even case-independent
2551          // matching can turn an Latin1 character into non-Latin1 or
2552          // vice versa.
2553          // TODO(dcarney): issue 3550.  Verify that this works as expected.
2554          // For example, \u0178 is uppercase of \u00ff (y-umlaut).
2555          details->set_cannot_match();
2556          pos->determines_perfectly = false;
2557          return;
2558        }
2559        if (compiler->ignore_case()) {
2560          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2561          int length = GetCaseIndependentLetters(isolate, c,
2562                                                 compiler->one_byte(), chars);
2563          DCHECK(length != 0);  // Can only happen if c > char_mask (see above).
2564          if (length == 1) {
2565            // This letter has no case equivalents, so it's nice and simple
2566            // and the mask-compare will determine definitely whether we have
2567            // a match at this character position.
2568            pos->mask = char_mask;
2569            pos->value = c;
2570            pos->determines_perfectly = true;
2571          } else {
2572            uint32_t common_bits = char_mask;
2573            uint32_t bits = chars[0];
2574            for (int j = 1; j < length; j++) {
2575              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2576              common_bits ^= differing_bits;
2577              bits &= common_bits;
2578            }
2579            // If length is 2 and common bits has only one zero in it then
2580            // our mask and compare instruction will determine definitely
2581            // whether we have a match at this character position.  Otherwise
2582            // it can only be an approximate check.
2583            uint32_t one_zero = (common_bits | ~char_mask);
2584            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2585              pos->determines_perfectly = true;
2586            }
2587            pos->mask = common_bits;
2588            pos->value = bits;
2589          }
2590        } else {
2591          // Don't ignore case.  Nice simple case where the mask-compare will
2592          // determine definitely whether we have a match at this character
2593          // position.
2594          pos->mask = char_mask;
2595          pos->value = c;
2596          pos->determines_perfectly = true;
2597        }
2598        characters_filled_in++;
2599        DCHECK(characters_filled_in <= details->characters());
2600        if (characters_filled_in == details->characters()) {
2601          return;
2602        }
2603      }
2604    } else {
2605      QuickCheckDetails::Position* pos =
2606          details->positions(characters_filled_in);
2607      RegExpCharacterClass* tree = elm.char_class();
2608      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2609      if (tree->is_negated()) {
2610        // A quick check uses multi-character mask and compare.  There is no
2611        // useful way to incorporate a negative char class into this scheme
2612        // so we just conservatively create a mask and value that will always
2613        // succeed.
2614        pos->mask = 0;
2615        pos->value = 0;
2616      } else {
2617        int first_range = 0;
2618        while (ranges->at(first_range).from() > char_mask) {
2619          first_range++;
2620          if (first_range == ranges->length()) {
2621            details->set_cannot_match();
2622            pos->determines_perfectly = false;
2623            return;
2624          }
2625        }
2626        CharacterRange range = ranges->at(first_range);
2627        uc16 from = range.from();
2628        uc16 to = range.to();
2629        if (to > char_mask) {
2630          to = char_mask;
2631        }
2632        uint32_t differing_bits = (from ^ to);
2633        // A mask and compare is only perfect if the differing bits form a
2634        // number like 00011111 with one single block of trailing 1s.
2635        if ((differing_bits & (differing_bits + 1)) == 0 &&
2636             from + differing_bits == to) {
2637          pos->determines_perfectly = true;
2638        }
2639        uint32_t common_bits = ~SmearBitsRight(differing_bits);
2640        uint32_t bits = (from & common_bits);
2641        for (int i = first_range + 1; i < ranges->length(); i++) {
2642          CharacterRange range = ranges->at(i);
2643          uc16 from = range.from();
2644          uc16 to = range.to();
2645          if (from > char_mask) continue;
2646          if (to > char_mask) to = char_mask;
2647          // Here we are combining more ranges into the mask and compare
2648          // value.  With each new range the mask becomes more sparse and
2649          // so the chances of a false positive rise.  A character class
2650          // with multiple ranges is assumed never to be equivalent to a
2651          // mask and compare operation.
2652          pos->determines_perfectly = false;
2653          uint32_t new_common_bits = (from ^ to);
2654          new_common_bits = ~SmearBitsRight(new_common_bits);
2655          common_bits &= new_common_bits;
2656          bits &= new_common_bits;
2657          uint32_t differing_bits = (from & common_bits) ^ bits;
2658          common_bits ^= differing_bits;
2659          bits &= common_bits;
2660        }
2661        pos->mask = common_bits;
2662        pos->value = bits;
2663      }
2664      characters_filled_in++;
2665      DCHECK(characters_filled_in <= details->characters());
2666      if (characters_filled_in == details->characters()) {
2667        return;
2668      }
2669    }
2670  }
2671  DCHECK(characters_filled_in != details->characters());
2672  if (!details->cannot_match()) {
2673    on_success()-> GetQuickCheckDetails(details,
2674                                        compiler,
2675                                        characters_filled_in,
2676                                        true);
2677  }
2678}
2679
2680
2681void QuickCheckDetails::Clear() {
2682  for (int i = 0; i < characters_; i++) {
2683    positions_[i].mask = 0;
2684    positions_[i].value = 0;
2685    positions_[i].determines_perfectly = false;
2686  }
2687  characters_ = 0;
2688}
2689
2690
2691void QuickCheckDetails::Advance(int by, bool one_byte) {
2692  DCHECK(by >= 0);
2693  if (by >= characters_) {
2694    Clear();
2695    return;
2696  }
2697  for (int i = 0; i < characters_ - by; i++) {
2698    positions_[i] = positions_[by + i];
2699  }
2700  for (int i = characters_ - by; i < characters_; i++) {
2701    positions_[i].mask = 0;
2702    positions_[i].value = 0;
2703    positions_[i].determines_perfectly = false;
2704  }
2705  characters_ -= by;
2706  // We could change mask_ and value_ here but we would never advance unless
2707  // they had already been used in a check and they won't be used again because
2708  // it would gain us nothing.  So there's no point.
2709}
2710
2711
2712void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2713  DCHECK(characters_ == other->characters_);
2714  if (other->cannot_match_) {
2715    return;
2716  }
2717  if (cannot_match_) {
2718    *this = *other;
2719    return;
2720  }
2721  for (int i = from_index; i < characters_; i++) {
2722    QuickCheckDetails::Position* pos = positions(i);
2723    QuickCheckDetails::Position* other_pos = other->positions(i);
2724    if (pos->mask != other_pos->mask ||
2725        pos->value != other_pos->value ||
2726        !other_pos->determines_perfectly) {
2727      // Our mask-compare operation will be approximate unless we have the
2728      // exact same operation on both sides of the alternation.
2729      pos->determines_perfectly = false;
2730    }
2731    pos->mask &= other_pos->mask;
2732    pos->value &= pos->mask;
2733    other_pos->value &= pos->mask;
2734    uc16 differing_bits = (pos->value ^ other_pos->value);
2735    pos->mask &= ~differing_bits;
2736    pos->value &= pos->mask;
2737  }
2738}
2739
2740
2741class VisitMarker {
2742 public:
2743  explicit VisitMarker(NodeInfo* info) : info_(info) {
2744    DCHECK(!info->visited);
2745    info->visited = true;
2746  }
2747  ~VisitMarker() {
2748    info_->visited = false;
2749  }
2750 private:
2751  NodeInfo* info_;
2752};
2753
2754
2755RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2756  if (info()->replacement_calculated) return replacement();
2757  if (depth < 0) return this;
2758  DCHECK(!info()->visited);
2759  VisitMarker marker(info());
2760  return FilterSuccessor(depth - 1, ignore_case);
2761}
2762
2763
2764RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2765  RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2766  if (next == NULL) return set_replacement(NULL);
2767  on_success_ = next;
2768  return set_replacement(this);
2769}
2770
2771
2772// We need to check for the following characters: 0x39c 0x3bc 0x178.
2773static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2774  // TODO(dcarney): this could be a lot more efficient.
2775  return range.Contains(0x39c) ||
2776      range.Contains(0x3bc) || range.Contains(0x178);
2777}
2778
2779
2780static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2781  for (int i = 0; i < ranges->length(); i++) {
2782    // TODO(dcarney): this could be a lot more efficient.
2783    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2784  }
2785  return false;
2786}
2787
2788
2789RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2790  if (info()->replacement_calculated) return replacement();
2791  if (depth < 0) return this;
2792  DCHECK(!info()->visited);
2793  VisitMarker marker(info());
2794  int element_count = elms_->length();
2795  for (int i = 0; i < element_count; i++) {
2796    TextElement elm = elms_->at(i);
2797    if (elm.text_type() == TextElement::ATOM) {
2798      Vector<const uc16> quarks = elm.atom()->data();
2799      for (int j = 0; j < quarks.length(); j++) {
2800        uint16_t c = quarks[j];
2801        if (c <= String::kMaxOneByteCharCode) continue;
2802        if (!ignore_case) return set_replacement(NULL);
2803        // Here, we need to check for characters whose upper and lower cases
2804        // are outside the Latin-1 range.
2805        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2806        // Character is outside Latin-1 completely
2807        if (converted == 0) return set_replacement(NULL);
2808        // Convert quark to Latin-1 in place.
2809        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2810        copy[j] = converted;
2811      }
2812    } else {
2813      DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2814      RegExpCharacterClass* cc = elm.char_class();
2815      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2816      if (!CharacterRange::IsCanonical(ranges)) {
2817        CharacterRange::Canonicalize(ranges);
2818      }
2819      // Now they are in order so we only need to look at the first.
2820      int range_count = ranges->length();
2821      if (cc->is_negated()) {
2822        if (range_count != 0 &&
2823            ranges->at(0).from() == 0 &&
2824            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2825          // This will be handled in a later filter.
2826          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2827          return set_replacement(NULL);
2828        }
2829      } else {
2830        if (range_count == 0 ||
2831            ranges->at(0).from() > String::kMaxOneByteCharCode) {
2832          // This will be handled in a later filter.
2833          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2834          return set_replacement(NULL);
2835        }
2836      }
2837    }
2838  }
2839  return FilterSuccessor(depth - 1, ignore_case);
2840}
2841
2842
2843RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2844  if (info()->replacement_calculated) return replacement();
2845  if (depth < 0) return this;
2846  if (info()->visited) return this;
2847  {
2848    VisitMarker marker(info());
2849
2850    RegExpNode* continue_replacement =
2851        continue_node_->FilterOneByte(depth - 1, ignore_case);
2852    // If we can't continue after the loop then there is no sense in doing the
2853    // loop.
2854    if (continue_replacement == NULL) return set_replacement(NULL);
2855  }
2856
2857  return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2858}
2859
2860
2861RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2862  if (info()->replacement_calculated) return replacement();
2863  if (depth < 0) return this;
2864  if (info()->visited) return this;
2865  VisitMarker marker(info());
2866  int choice_count = alternatives_->length();
2867
2868  for (int i = 0; i < choice_count; i++) {
2869    GuardedAlternative alternative = alternatives_->at(i);
2870    if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2871      set_replacement(this);
2872      return this;
2873    }
2874  }
2875
2876  int surviving = 0;
2877  RegExpNode* survivor = NULL;
2878  for (int i = 0; i < choice_count; i++) {
2879    GuardedAlternative alternative = alternatives_->at(i);
2880    RegExpNode* replacement =
2881        alternative.node()->FilterOneByte(depth - 1, ignore_case);
2882    DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2883    if (replacement != NULL) {
2884      alternatives_->at(i).set_node(replacement);
2885      surviving++;
2886      survivor = replacement;
2887    }
2888  }
2889  if (surviving < 2) return set_replacement(survivor);
2890
2891  set_replacement(this);
2892  if (surviving == choice_count) {
2893    return this;
2894  }
2895  // Only some of the nodes survived the filtering.  We need to rebuild the
2896  // alternatives list.
2897  ZoneList<GuardedAlternative>* new_alternatives =
2898      new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2899  for (int i = 0; i < choice_count; i++) {
2900    RegExpNode* replacement =
2901        alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2902    if (replacement != NULL) {
2903      alternatives_->at(i).set_node(replacement);
2904      new_alternatives->Add(alternatives_->at(i), zone());
2905    }
2906  }
2907  alternatives_ = new_alternatives;
2908  return this;
2909}
2910
2911
2912RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth,
2913                                                       bool ignore_case) {
2914  if (info()->replacement_calculated) return replacement();
2915  if (depth < 0) return this;
2916  if (info()->visited) return this;
2917  VisitMarker marker(info());
2918  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2919  // afterwards.
2920  RegExpNode* node = alternatives_->at(1).node();
2921  RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2922  if (replacement == NULL) return set_replacement(NULL);
2923  alternatives_->at(1).set_node(replacement);
2924
2925  RegExpNode* neg_node = alternatives_->at(0).node();
2926  RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2927  // If the negative lookahead is always going to fail then
2928  // we don't need to check it.
2929  if (neg_replacement == NULL) return set_replacement(replacement);
2930  alternatives_->at(0).set_node(neg_replacement);
2931  return set_replacement(this);
2932}
2933
2934
2935void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2936                                          RegExpCompiler* compiler,
2937                                          int characters_filled_in,
2938                                          bool not_at_start) {
2939  if (body_can_be_zero_length_ || info()->visited) return;
2940  VisitMarker marker(info());
2941  return ChoiceNode::GetQuickCheckDetails(details,
2942                                          compiler,
2943                                          characters_filled_in,
2944                                          not_at_start);
2945}
2946
2947
2948void LoopChoiceNode::FillInBMInfo(int offset,
2949                                  int budget,
2950                                  BoyerMooreLookahead* bm,
2951                                  bool not_at_start) {
2952  if (body_can_be_zero_length_ || budget <= 0) {
2953    bm->SetRest(offset);
2954    SaveBMInfo(bm, not_at_start, offset);
2955    return;
2956  }
2957  ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2958  SaveBMInfo(bm, not_at_start, offset);
2959}
2960
2961
2962void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2963                                      RegExpCompiler* compiler,
2964                                      int characters_filled_in,
2965                                      bool not_at_start) {
2966  not_at_start = (not_at_start || not_at_start_);
2967  int choice_count = alternatives_->length();
2968  DCHECK(choice_count > 0);
2969  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2970                                                    compiler,
2971                                                    characters_filled_in,
2972                                                    not_at_start);
2973  for (int i = 1; i < choice_count; i++) {
2974    QuickCheckDetails new_details(details->characters());
2975    RegExpNode* node = alternatives_->at(i).node();
2976    node->GetQuickCheckDetails(&new_details, compiler,
2977                               characters_filled_in,
2978                               not_at_start);
2979    // Here we merge the quick match details of the two branches.
2980    details->Merge(&new_details, characters_filled_in);
2981  }
2982}
2983
2984
2985// Check for [0-9A-Z_a-z].
2986static void EmitWordCheck(RegExpMacroAssembler* assembler,
2987                          Label* word,
2988                          Label* non_word,
2989                          bool fall_through_on_word) {
2990  if (assembler->CheckSpecialCharacterClass(
2991          fall_through_on_word ? 'w' : 'W',
2992          fall_through_on_word ? non_word : word)) {
2993    // Optimized implementation available.
2994    return;
2995  }
2996  assembler->CheckCharacterGT('z', non_word);
2997  assembler->CheckCharacterLT('0', non_word);
2998  assembler->CheckCharacterGT('a' - 1, word);
2999  assembler->CheckCharacterLT('9' + 1, word);
3000  assembler->CheckCharacterLT('A', non_word);
3001  assembler->CheckCharacterLT('Z' + 1, word);
3002  if (fall_through_on_word) {
3003    assembler->CheckNotCharacter('_', non_word);
3004  } else {
3005    assembler->CheckCharacter('_', word);
3006  }
3007}
3008
3009
3010// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3011// that matches newline or the start of input).
3012static void EmitHat(RegExpCompiler* compiler,
3013                    RegExpNode* on_success,
3014                    Trace* trace) {
3015  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3016  // We will be loading the previous character into the current character
3017  // register.
3018  Trace new_trace(*trace);
3019  new_trace.InvalidateCurrentCharacter();
3020
3021  Label ok;
3022  if (new_trace.cp_offset() == 0) {
3023    // The start of input counts as a newline in this context, so skip to
3024    // ok if we are at the start.
3025    assembler->CheckAtStart(&ok);
3026  }
3027  // We already checked that we are not at the start of input so it must be
3028  // OK to load the previous character.
3029  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3030                                  new_trace.backtrack(),
3031                                  false);
3032  if (!assembler->CheckSpecialCharacterClass('n',
3033                                             new_trace.backtrack())) {
3034    // Newline means \n, \r, 0x2028 or 0x2029.
3035    if (!compiler->one_byte()) {
3036      assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3037    }
3038    assembler->CheckCharacter('\n', &ok);
3039    assembler->CheckNotCharacter('\r', new_trace.backtrack());
3040  }
3041  assembler->Bind(&ok);
3042  on_success->Emit(compiler, &new_trace);
3043}
3044
3045
3046// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3047void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3048  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3049  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3050  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3051  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3052  if (lookahead == NULL) {
3053    int eats_at_least =
3054        Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3055                                                    kRecursionBudget,
3056                                                    not_at_start));
3057    if (eats_at_least >= 1) {
3058      BoyerMooreLookahead* bm =
3059          new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3060      FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3061      if (bm->at(0)->is_non_word())
3062        next_is_word_character = Trace::FALSE_VALUE;
3063      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3064    }
3065  } else {
3066    if (lookahead->at(0)->is_non_word())
3067      next_is_word_character = Trace::FALSE_VALUE;
3068    if (lookahead->at(0)->is_word())
3069      next_is_word_character = Trace::TRUE_VALUE;
3070  }
3071  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3072  if (next_is_word_character == Trace::UNKNOWN) {
3073    Label before_non_word;
3074    Label before_word;
3075    if (trace->characters_preloaded() != 1) {
3076      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3077    }
3078    // Fall through on non-word.
3079    EmitWordCheck(assembler, &before_word, &before_non_word, false);
3080    // Next character is not a word character.
3081    assembler->Bind(&before_non_word);
3082    Label ok;
3083    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3084    assembler->GoTo(&ok);
3085
3086    assembler->Bind(&before_word);
3087    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3088    assembler->Bind(&ok);
3089  } else if (next_is_word_character == Trace::TRUE_VALUE) {
3090    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3091  } else {
3092    DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3093    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3094  }
3095}
3096
3097
3098void AssertionNode::BacktrackIfPrevious(
3099    RegExpCompiler* compiler,
3100    Trace* trace,
3101    AssertionNode::IfPrevious backtrack_if_previous) {
3102  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3103  Trace new_trace(*trace);
3104  new_trace.InvalidateCurrentCharacter();
3105
3106  Label fall_through, dummy;
3107
3108  Label* non_word = backtrack_if_previous == kIsNonWord ?
3109                    new_trace.backtrack() :
3110                    &fall_through;
3111  Label* word = backtrack_if_previous == kIsNonWord ?
3112                &fall_through :
3113                new_trace.backtrack();
3114
3115  if (new_trace.cp_offset() == 0) {
3116    // The start of input counts as a non-word character, so the question is
3117    // decided if we are at the start.
3118    assembler->CheckAtStart(non_word);
3119  }
3120  // We already checked that we are not at the start of input so it must be
3121  // OK to load the previous character.
3122  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3123  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3124
3125  assembler->Bind(&fall_through);
3126  on_success()->Emit(compiler, &new_trace);
3127}
3128
3129
3130void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3131                                         RegExpCompiler* compiler,
3132                                         int filled_in,
3133                                         bool not_at_start) {
3134  if (assertion_type_ == AT_START && not_at_start) {
3135    details->set_cannot_match();
3136    return;
3137  }
3138  return on_success()->GetQuickCheckDetails(details,
3139                                            compiler,
3140                                            filled_in,
3141                                            not_at_start);
3142}
3143
3144
3145void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3146  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3147  switch (assertion_type_) {
3148    case AT_END: {
3149      Label ok;
3150      assembler->CheckPosition(trace->cp_offset(), &ok);
3151      assembler->GoTo(trace->backtrack());
3152      assembler->Bind(&ok);
3153      break;
3154    }
3155    case AT_START: {
3156      if (trace->at_start() == Trace::FALSE_VALUE) {
3157        assembler->GoTo(trace->backtrack());
3158        return;
3159      }
3160      if (trace->at_start() == Trace::UNKNOWN) {
3161        assembler->CheckNotAtStart(trace->backtrack());
3162        Trace at_start_trace = *trace;
3163        at_start_trace.set_at_start(true);
3164        on_success()->Emit(compiler, &at_start_trace);
3165        return;
3166      }
3167    }
3168    break;
3169    case AFTER_NEWLINE:
3170      EmitHat(compiler, on_success(), trace);
3171      return;
3172    case AT_BOUNDARY:
3173    case AT_NON_BOUNDARY: {
3174      EmitBoundaryCheck(compiler, trace);
3175      return;
3176    }
3177  }
3178  on_success()->Emit(compiler, trace);
3179}
3180
3181
3182static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3183  if (quick_check == NULL) return false;
3184  if (offset >= quick_check->characters()) return false;
3185  return quick_check->positions(offset)->determines_perfectly;
3186}
3187
3188
3189static void UpdateBoundsCheck(int index, int* checked_up_to) {
3190  if (index > *checked_up_to) {
3191    *checked_up_to = index;
3192  }
3193}
3194
3195
3196// We call this repeatedly to generate code for each pass over the text node.
3197// The passes are in increasing order of difficulty because we hope one
3198// of the first passes will fail in which case we are saved the work of the
3199// later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3200// we will check the '%' in the first pass, the case independent 'a' in the
3201// second pass and the character class in the last pass.
3202//
3203// The passes are done from right to left, so for example to test for /bar/
3204// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3205// and then a 'b' with offset 0.  This means we can avoid the end-of-input
3206// bounds check most of the time.  In the example we only need to check for
3207// end-of-input when loading the putative 'r'.
3208//
3209// A slight complication involves the fact that the first character may already
3210// be fetched into a register by the previous node.  In this case we want to
3211// do the test for that character first.  We do this in separate passes.  The
3212// 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3213// pass has been performed then subsequent passes will have true in
3214// first_element_checked to indicate that that character does not need to be
3215// checked again.
3216//
3217// In addition to all this we are passed a Trace, which can
3218// contain an AlternativeGeneration object.  In this AlternativeGeneration
3219// object we can see details of any quick check that was already passed in
3220// order to get to the code we are now generating.  The quick check can involve
3221// loading characters, which means we do not need to recheck the bounds
3222// up to the limit the quick check already checked.  In addition the quick
3223// check can have involved a mask and compare operation which may simplify
3224// or obviate the need for further checks at some character positions.
3225void TextNode::TextEmitPass(RegExpCompiler* compiler,
3226                            TextEmitPassType pass,
3227                            bool preloaded,
3228                            Trace* trace,
3229                            bool first_element_checked,
3230                            int* checked_up_to) {
3231  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3232  Isolate* isolate = assembler->zone()->isolate();
3233  bool one_byte = compiler->one_byte();
3234  Label* backtrack = trace->backtrack();
3235  QuickCheckDetails* quick_check = trace->quick_check_performed();
3236  int element_count = elms_->length();
3237  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3238    TextElement elm = elms_->at(i);
3239    int cp_offset = trace->cp_offset() + elm.cp_offset();
3240    if (elm.text_type() == TextElement::ATOM) {
3241      Vector<const uc16> quarks = elm.atom()->data();
3242      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3243        if (first_element_checked && i == 0 && j == 0) continue;
3244        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3245        EmitCharacterFunction* emit_function = NULL;
3246        switch (pass) {
3247          case NON_LATIN1_MATCH:
3248            DCHECK(one_byte);
3249            if (quarks[j] > String::kMaxOneByteCharCode) {
3250              assembler->GoTo(backtrack);
3251              return;
3252            }
3253            break;
3254          case NON_LETTER_CHARACTER_MATCH:
3255            emit_function = &EmitAtomNonLetter;
3256            break;
3257          case SIMPLE_CHARACTER_MATCH:
3258            emit_function = &EmitSimpleCharacter;
3259            break;
3260          case CASE_CHARACTER_MATCH:
3261            emit_function = &EmitAtomLetter;
3262            break;
3263          default:
3264            break;
3265        }
3266        if (emit_function != NULL) {
3267          bool bound_checked = emit_function(isolate,
3268                                             compiler,
3269                                             quarks[j],
3270                                             backtrack,
3271                                             cp_offset + j,
3272                                             *checked_up_to < cp_offset + j,
3273                                             preloaded);
3274          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3275        }
3276      }
3277    } else {
3278      DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3279      if (pass == CHARACTER_CLASS_MATCH) {
3280        if (first_element_checked && i == 0) continue;
3281        if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3282        RegExpCharacterClass* cc = elm.char_class();
3283        EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3284                      *checked_up_to < cp_offset, preloaded, zone());
3285        UpdateBoundsCheck(cp_offset, checked_up_to);
3286      }
3287    }
3288  }
3289}
3290
3291
3292int TextNode::Length() {
3293  TextElement elm = elms_->last();
3294  DCHECK(elm.cp_offset() >= 0);
3295  return elm.cp_offset() + elm.length();
3296}
3297
3298
3299bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3300  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3301  if (ignore_case) {
3302    return pass == SIMPLE_CHARACTER_MATCH;
3303  } else {
3304    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3305  }
3306}
3307
3308
3309// This generates the code to match a text node.  A text node can contain
3310// straight character sequences (possibly to be matched in a case-independent
3311// way) and character classes.  For efficiency we do not do this in a single
3312// pass from left to right.  Instead we pass over the text node several times,
3313// emitting code for some character positions every time.  See the comment on
3314// TextEmitPass for details.
3315void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3316  LimitResult limit_result = LimitVersions(compiler, trace);
3317  if (limit_result == DONE) return;
3318  DCHECK(limit_result == CONTINUE);
3319
3320  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3321    compiler->SetRegExpTooBig();
3322    return;
3323  }
3324
3325  if (compiler->one_byte()) {
3326    int dummy = 0;
3327    TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3328  }
3329
3330  bool first_elt_done = false;
3331  int bound_checked_to = trace->cp_offset() - 1;
3332  bound_checked_to += trace->bound_checked_up_to();
3333
3334  // If a character is preloaded into the current character register then
3335  // check that now.
3336  if (trace->characters_preloaded() == 1) {
3337    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3338      if (!SkipPass(pass, compiler->ignore_case())) {
3339        TextEmitPass(compiler,
3340                     static_cast<TextEmitPassType>(pass),
3341                     true,
3342                     trace,
3343                     false,
3344                     &bound_checked_to);
3345      }
3346    }
3347    first_elt_done = true;
3348  }
3349
3350  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3351    if (!SkipPass(pass, compiler->ignore_case())) {
3352      TextEmitPass(compiler,
3353                   static_cast<TextEmitPassType>(pass),
3354                   false,
3355                   trace,
3356                   first_elt_done,
3357                   &bound_checked_to);
3358    }
3359  }
3360
3361  Trace successor_trace(*trace);
3362  successor_trace.set_at_start(false);
3363  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3364  RecursionCheck rc(compiler);
3365  on_success()->Emit(compiler, &successor_trace);
3366}
3367
3368
3369void Trace::InvalidateCurrentCharacter() {
3370  characters_preloaded_ = 0;
3371}
3372
3373
3374void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3375  DCHECK(by > 0);
3376  // We don't have an instruction for shifting the current character register
3377  // down or for using a shifted value for anything so lets just forget that
3378  // we preloaded any characters into it.
3379  characters_preloaded_ = 0;
3380  // Adjust the offsets of the quick check performed information.  This
3381  // information is used to find out what we already determined about the
3382  // characters by means of mask and compare.
3383  quick_check_performed_.Advance(by, compiler->one_byte());
3384  cp_offset_ += by;
3385  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3386    compiler->SetRegExpTooBig();
3387    cp_offset_ = 0;
3388  }
3389  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3390}
3391
3392
3393void TextNode::MakeCaseIndependent(bool is_one_byte) {
3394  int element_count = elms_->length();
3395  for (int i = 0; i < element_count; i++) {
3396    TextElement elm = elms_->at(i);
3397    if (elm.text_type() == TextElement::CHAR_CLASS) {
3398      RegExpCharacterClass* cc = elm.char_class();
3399      // None of the standard character classes is different in the case
3400      // independent case and it slows us down if we don't know that.
3401      if (cc->is_standard(zone())) continue;
3402      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3403      int range_count = ranges->length();
3404      for (int j = 0; j < range_count; j++) {
3405        ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone());
3406      }
3407    }
3408  }
3409}
3410
3411
3412int TextNode::GreedyLoopTextLength() {
3413  TextElement elm = elms_->at(elms_->length() - 1);
3414  return elm.cp_offset() + elm.length();
3415}
3416
3417
3418RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3419    RegExpCompiler* compiler) {
3420  if (elms_->length() != 1) return NULL;
3421  TextElement elm = elms_->at(0);
3422  if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3423  RegExpCharacterClass* node = elm.char_class();
3424  ZoneList<CharacterRange>* ranges = node->ranges(zone());
3425  if (!CharacterRange::IsCanonical(ranges)) {
3426    CharacterRange::Canonicalize(ranges);
3427  }
3428  if (node->is_negated()) {
3429    return ranges->length() == 0 ? on_success() : NULL;
3430  }
3431  if (ranges->length() != 1) return NULL;
3432  uint32_t max_char;
3433  if (compiler->one_byte()) {
3434    max_char = String::kMaxOneByteCharCode;
3435  } else {
3436    max_char = String::kMaxUtf16CodeUnit;
3437  }
3438  return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3439}
3440
3441
3442// Finds the fixed match length of a sequence of nodes that goes from
3443// this alternative and back to this choice node.  If there are variable
3444// length nodes or other complications in the way then return a sentinel
3445// value indicating that a greedy loop cannot be constructed.
3446int ChoiceNode::GreedyLoopTextLengthForAlternative(
3447    GuardedAlternative* alternative) {
3448  int length = 0;
3449  RegExpNode* node = alternative->node();
3450  // Later we will generate code for all these text nodes using recursion
3451  // so we have to limit the max number.
3452  int recursion_depth = 0;
3453  while (node != this) {
3454    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3455      return kNodeIsTooComplexForGreedyLoops;
3456    }
3457    int node_length = node->GreedyLoopTextLength();
3458    if (node_length == kNodeIsTooComplexForGreedyLoops) {
3459      return kNodeIsTooComplexForGreedyLoops;
3460    }
3461    length += node_length;
3462    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3463    node = seq_node->on_success();
3464  }
3465  return length;
3466}
3467
3468
3469void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3470  DCHECK_EQ(loop_node_, NULL);
3471  AddAlternative(alt);
3472  loop_node_ = alt.node();
3473}
3474
3475
3476void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3477  DCHECK_EQ(continue_node_, NULL);
3478  AddAlternative(alt);
3479  continue_node_ = alt.node();
3480}
3481
3482
3483void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3484  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3485  if (trace->stop_node() == this) {
3486    // Back edge of greedy optimized loop node graph.
3487    int text_length =
3488        GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3489    DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3490    // Update the counter-based backtracking info on the stack.  This is an
3491    // optimization for greedy loops (see below).
3492    DCHECK(trace->cp_offset() == text_length);
3493    macro_assembler->AdvanceCurrentPosition(text_length);
3494    macro_assembler->GoTo(trace->loop_label());
3495    return;
3496  }
3497  DCHECK(trace->stop_node() == NULL);
3498  if (!trace->is_trivial()) {
3499    trace->Flush(compiler, this);
3500    return;
3501  }
3502  ChoiceNode::Emit(compiler, trace);
3503}
3504
3505
3506int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3507                                           int eats_at_least) {
3508  int preload_characters = Min(4, eats_at_least);
3509  if (compiler->macro_assembler()->CanReadUnaligned()) {
3510    bool one_byte = compiler->one_byte();
3511    if (one_byte) {
3512      if (preload_characters > 4) preload_characters = 4;
3513      // We can't preload 3 characters because there is no machine instruction
3514      // to do that.  We can't just load 4 because we could be reading
3515      // beyond the end of the string, which could cause a memory fault.
3516      if (preload_characters == 3) preload_characters = 2;
3517    } else {
3518      if (preload_characters > 2) preload_characters = 2;
3519    }
3520  } else {
3521    if (preload_characters > 1) preload_characters = 1;
3522  }
3523  return preload_characters;
3524}
3525
3526
3527// This class is used when generating the alternatives in a choice node.  It
3528// records the way the alternative is being code generated.
3529class AlternativeGeneration: public Malloced {
3530 public:
3531  AlternativeGeneration()
3532      : possible_success(),
3533        expects_preload(false),
3534        after(),
3535        quick_check_details() { }
3536  Label possible_success;
3537  bool expects_preload;
3538  Label after;
3539  QuickCheckDetails quick_check_details;
3540};
3541
3542
3543// Creates a list of AlternativeGenerations.  If the list has a reasonable
3544// size then it is on the stack, otherwise the excess is on the heap.
3545class AlternativeGenerationList {
3546 public:
3547  AlternativeGenerationList(int count, Zone* zone)
3548      : alt_gens_(count, zone) {
3549    for (int i = 0; i < count && i < kAFew; i++) {
3550      alt_gens_.Add(a_few_alt_gens_ + i, zone);
3551    }
3552    for (int i = kAFew; i < count; i++) {
3553      alt_gens_.Add(new AlternativeGeneration(), zone);
3554    }
3555  }
3556  ~AlternativeGenerationList() {
3557    for (int i = kAFew; i < alt_gens_.length(); i++) {
3558      delete alt_gens_[i];
3559      alt_gens_[i] = NULL;
3560    }
3561  }
3562
3563  AlternativeGeneration* at(int i) {
3564    return alt_gens_[i];
3565  }
3566
3567 private:
3568  static const int kAFew = 10;
3569  ZoneList<AlternativeGeneration*> alt_gens_;
3570  AlternativeGeneration a_few_alt_gens_[kAFew];
3571};
3572
3573
3574// The '2' variant is has inclusive from and exclusive to.
3575// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3576// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3577static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3578    0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3579    0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3580    0xFEFF, 0xFF00, 0x10000 };
3581static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3582
3583static const int kWordRanges[] = {
3584    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3585static const int kWordRangeCount = arraysize(kWordRanges);
3586static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3587static const int kDigitRangeCount = arraysize(kDigitRanges);
3588static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3589static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3590static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3591    0x2028, 0x202A, 0x10000 };
3592static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3593
3594
3595void BoyerMoorePositionInfo::Set(int character) {
3596  SetInterval(Interval(character, character));
3597}
3598
3599
3600void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3601  s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3602  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3603  d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3604  surrogate_ =
3605      AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3606  if (interval.to() - interval.from() >= kMapSize - 1) {
3607    if (map_count_ != kMapSize) {
3608      map_count_ = kMapSize;
3609      for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3610    }
3611    return;
3612  }
3613  for (int i = interval.from(); i <= interval.to(); i++) {
3614    int mod_character = (i & kMask);
3615    if (!map_->at(mod_character)) {
3616      map_count_++;
3617      map_->at(mod_character) = true;
3618    }
3619    if (map_count_ == kMapSize) return;
3620  }
3621}
3622
3623
3624void BoyerMoorePositionInfo::SetAll() {
3625  s_ = w_ = d_ = kLatticeUnknown;
3626  if (map_count_ != kMapSize) {
3627    map_count_ = kMapSize;
3628    for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3629  }
3630}
3631
3632
3633BoyerMooreLookahead::BoyerMooreLookahead(
3634    int length, RegExpCompiler* compiler, Zone* zone)
3635    : length_(length),
3636      compiler_(compiler) {
3637  if (compiler->one_byte()) {
3638    max_char_ = String::kMaxOneByteCharCode;
3639  } else {
3640    max_char_ = String::kMaxUtf16CodeUnit;
3641  }
3642  bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3643  for (int i = 0; i < length; i++) {
3644    bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3645  }
3646}
3647
3648
3649// Find the longest range of lookahead that has the fewest number of different
3650// characters that can occur at a given position.  Since we are optimizing two
3651// different parameters at once this is a tradeoff.
3652bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3653  int biggest_points = 0;
3654  // If more than 32 characters out of 128 can occur it is unlikely that we can
3655  // be lucky enough to step forwards much of the time.
3656  const int kMaxMax = 32;
3657  for (int max_number_of_chars = 4;
3658       max_number_of_chars < kMaxMax;
3659       max_number_of_chars *= 2) {
3660    biggest_points =
3661        FindBestInterval(max_number_of_chars, biggest_points, from, to);
3662  }
3663  if (biggest_points == 0) return false;
3664  return true;
3665}
3666
3667
3668// Find the highest-points range between 0 and length_ where the character
3669// information is not too vague.  'Too vague' means that there are more than
3670// max_number_of_chars that can occur at this position.  Calculates the number
3671// of points as the product of width-of-the-range and
3672// probability-of-finding-one-of-the-characters, where the probability is
3673// calculated using the frequency distribution of the sample subject string.
3674int BoyerMooreLookahead::FindBestInterval(
3675    int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3676  int biggest_points = old_biggest_points;
3677  static const int kSize = RegExpMacroAssembler::kTableSize;
3678  for (int i = 0; i < length_; ) {
3679    while (i < length_ && Count(i) > max_number_of_chars) i++;
3680    if (i == length_) break;
3681    int remembered_from = i;
3682    bool union_map[kSize];
3683    for (int j = 0; j < kSize; j++) union_map[j] = false;
3684    while (i < length_ && Count(i) <= max_number_of_chars) {
3685      BoyerMoorePositionInfo* map = bitmaps_->at(i);
3686      for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3687      i++;
3688    }
3689    int frequency = 0;
3690    for (int j = 0; j < kSize; j++) {
3691      if (union_map[j]) {
3692        // Add 1 to the frequency to give a small per-character boost for
3693        // the cases where our sampling is not good enough and many
3694        // characters have a frequency of zero.  This means the frequency
3695        // can theoretically be up to 2*kSize though we treat it mostly as
3696        // a fraction of kSize.
3697        frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3698      }
3699    }
3700    // We use the probability of skipping times the distance we are skipping to
3701    // judge the effectiveness of this.  Actually we have a cut-off:  By
3702    // dividing by 2 we switch off the skipping if the probability of skipping
3703    // is less than 50%.  This is because the multibyte mask-and-compare
3704    // skipping in quickcheck is more likely to do well on this case.
3705    bool in_quickcheck_range =
3706        ((i - remembered_from < 4) ||
3707         (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3708    // Called 'probability' but it is only a rough estimate and can actually
3709    // be outside the 0-kSize range.
3710    int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3711    int points = (i - remembered_from) * probability;
3712    if (points > biggest_points) {
3713      *from = remembered_from;
3714      *to = i - 1;
3715      biggest_points = points;
3716    }
3717  }
3718  return biggest_points;
3719}
3720
3721
3722// Take all the characters that will not prevent a successful match if they
3723// occur in the subject string in the range between min_lookahead and
3724// max_lookahead (inclusive) measured from the current position.  If the
3725// character at max_lookahead offset is not one of these characters, then we
3726// can safely skip forwards by the number of characters in the range.
3727int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3728                                      int max_lookahead,
3729                                      Handle<ByteArray> boolean_skip_table) {
3730  const int kSize = RegExpMacroAssembler::kTableSize;
3731
3732  const int kSkipArrayEntry = 0;
3733  const int kDontSkipArrayEntry = 1;
3734
3735  for (int i = 0; i < kSize; i++) {
3736    boolean_skip_table->set(i, kSkipArrayEntry);
3737  }
3738  int skip = max_lookahead + 1 - min_lookahead;
3739
3740  for (int i = max_lookahead; i >= min_lookahead; i--) {
3741    BoyerMoorePositionInfo* map = bitmaps_->at(i);
3742    for (int j = 0; j < kSize; j++) {
3743      if (map->at(j)) {
3744        boolean_skip_table->set(j, kDontSkipArrayEntry);
3745      }
3746    }
3747  }
3748
3749  return skip;
3750}
3751
3752
3753// See comment above on the implementation of GetSkipTable.
3754void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3755  const int kSize = RegExpMacroAssembler::kTableSize;
3756
3757  int min_lookahead = 0;
3758  int max_lookahead = 0;
3759
3760  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3761
3762  bool found_single_character = false;
3763  int single_character = 0;
3764  for (int i = max_lookahead; i >= min_lookahead; i--) {
3765    BoyerMoorePositionInfo* map = bitmaps_->at(i);
3766    if (map->map_count() > 1 ||
3767        (found_single_character && map->map_count() != 0)) {
3768      found_single_character = false;
3769      break;
3770    }
3771    for (int j = 0; j < kSize; j++) {
3772      if (map->at(j)) {
3773        found_single_character = true;
3774        single_character = j;
3775        break;
3776      }
3777    }
3778  }
3779
3780  int lookahead_width = max_lookahead + 1 - min_lookahead;
3781
3782  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3783    // The mask-compare can probably handle this better.
3784    return;
3785  }
3786
3787  if (found_single_character) {
3788    Label cont, again;
3789    masm->Bind(&again);
3790    masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3791    if (max_char_ > kSize) {
3792      masm->CheckCharacterAfterAnd(single_character,
3793                                   RegExpMacroAssembler::kTableMask,
3794                                   &cont);
3795    } else {
3796      masm->CheckCharacter(single_character, &cont);
3797    }
3798    masm->AdvanceCurrentPosition(lookahead_width);
3799    masm->GoTo(&again);
3800    masm->Bind(&cont);
3801    return;
3802  }
3803
3804  Factory* factory = masm->zone()->isolate()->factory();
3805  Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3806  int skip_distance = GetSkipTable(
3807      min_lookahead, max_lookahead, boolean_skip_table);
3808  DCHECK(skip_distance != 0);
3809
3810  Label cont, again;
3811  masm->Bind(&again);
3812  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3813  masm->CheckBitInTable(boolean_skip_table, &cont);
3814  masm->AdvanceCurrentPosition(skip_distance);
3815  masm->GoTo(&again);
3816  masm->Bind(&cont);
3817}
3818
3819
3820/* Code generation for choice nodes.
3821 *
3822 * We generate quick checks that do a mask and compare to eliminate a
3823 * choice.  If the quick check succeeds then it jumps to the continuation to
3824 * do slow checks and check subsequent nodes.  If it fails (the common case)
3825 * it falls through to the next choice.
3826 *
3827 * Here is the desired flow graph.  Nodes directly below each other imply
3828 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3829 * 3 doesn't have a quick check so we have to call the slow check.
3830 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3831 * regexp continuation is generated directly after the Sn node, up to the
3832 * next GoTo if we decide to reuse some already generated code.  Some
3833 * nodes expect preload_characters to be preloaded into the current
3834 * character register.  R nodes do this preloading.  Vertices are marked
3835 * F for failures and S for success (possible success in the case of quick
3836 * nodes).  L, V, < and > are used as arrow heads.
3837 *
3838 * ----------> R
3839 *             |
3840 *             V
3841 *            Q1 -----> S1
3842 *             |   S   /
3843 *            F|      /
3844 *             |    F/
3845 *             |    /
3846 *             |   R
3847 *             |  /
3848 *             V L
3849 *            Q2 -----> S2
3850 *             |   S   /
3851 *            F|      /
3852 *             |    F/
3853 *             |    /
3854 *             |   R
3855 *             |  /
3856 *             V L
3857 *            S3
3858 *             |
3859 *            F|
3860 *             |
3861 *             R
3862 *             |
3863 * backtrack   V
3864 * <----------Q4
3865 *   \    F    |
3866 *    \        |S
3867 *     \   F   V
3868 *      \-----S4
3869 *
3870 * For greedy loops we push the current position, then generate the code that
3871 * eats the input specially in EmitGreedyLoop.  The other choice (the
3872 * continuation) is generated by the normal code in EmitChoices, and steps back
3873 * in the input to the starting position when it fails to match.  The loop code
3874 * looks like this (U is the unwind code that steps back in the greedy loop).
3875 *
3876 *              _____
3877 *             /     \
3878 *             V     |
3879 * ----------> S1    |
3880 *            /|     |
3881 *           / |S    |
3882 *         F/  \_____/
3883 *         /
3884 *        |<-----
3885 *        |      \
3886 *        V       |S
3887 *        Q2 ---> U----->backtrack
3888 *        |  F   /
3889 *       S|     /
3890 *        V  F /
3891 *        S2--/
3892 */
3893
3894GreedyLoopState::GreedyLoopState(bool not_at_start) {
3895  counter_backtrack_trace_.set_backtrack(&label_);
3896  if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3897}
3898
3899
3900void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3901#ifdef DEBUG
3902  int choice_count = alternatives_->length();
3903  for (int i = 0; i < choice_count - 1; i++) {
3904    GuardedAlternative alternative = alternatives_->at(i);
3905    ZoneList<Guard*>* guards = alternative.guards();
3906    int guard_count = (guards == NULL) ? 0 : guards->length();
3907    for (int j = 0; j < guard_count; j++) {
3908      DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3909    }
3910  }
3911#endif
3912}
3913
3914
3915void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3916                              Trace* current_trace,
3917                              PreloadState* state) {
3918    if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3919      // Save some time by looking at most one machine word ahead.
3920      state->eats_at_least_ =
3921          EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3922                      current_trace->at_start() == Trace::FALSE_VALUE);
3923    }
3924    state->preload_characters_ =
3925        CalculatePreloadCharacters(compiler, state->eats_at_least_);
3926
3927    state->preload_is_current_ =
3928        (current_trace->characters_preloaded() == state->preload_characters_);
3929    state->preload_has_checked_bounds_ = state->preload_is_current_;
3930}
3931
3932
3933void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3934  int choice_count = alternatives_->length();
3935
3936  AssertGuardsMentionRegisters(trace);
3937
3938  LimitResult limit_result = LimitVersions(compiler, trace);
3939  if (limit_result == DONE) return;
3940  DCHECK(limit_result == CONTINUE);
3941
3942  // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3943  // other choice nodes we only flush if we are out of code size budget.
3944  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3945    trace->Flush(compiler, this);
3946    return;
3947  }
3948
3949  RecursionCheck rc(compiler);
3950
3951  PreloadState preload;
3952  preload.init();
3953  GreedyLoopState greedy_loop_state(not_at_start());
3954
3955  int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3956  AlternativeGenerationList alt_gens(choice_count, zone());
3957
3958  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3959    trace = EmitGreedyLoop(compiler,
3960                           trace,
3961                           &alt_gens,
3962                           &preload,
3963                           &greedy_loop_state,
3964                           text_length);
3965  } else {
3966    // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3967    // match the traces produced pre-cleanup.
3968    Label second_choice;
3969    compiler->macro_assembler()->Bind(&second_choice);
3970
3971    preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3972
3973    EmitChoices(compiler,
3974                &alt_gens,
3975                0,
3976                trace,
3977                &preload);
3978  }
3979
3980  // At this point we need to generate slow checks for the alternatives where
3981  // the quick check was inlined.  We can recognize these because the associated
3982  // label was bound.
3983  int new_flush_budget = trace->flush_budget() / choice_count;
3984  for (int i = 0; i < choice_count; i++) {
3985    AlternativeGeneration* alt_gen = alt_gens.at(i);
3986    Trace new_trace(*trace);
3987    // If there are actions to be flushed we have to limit how many times
3988    // they are flushed.  Take the budget of the parent trace and distribute
3989    // it fairly amongst the children.
3990    if (new_trace.actions() != NULL) {
3991      new_trace.set_flush_budget(new_flush_budget);
3992    }
3993    bool next_expects_preload =
3994        i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3995    EmitOutOfLineContinuation(compiler,
3996                              &new_trace,
3997                              alternatives_->at(i),
3998                              alt_gen,
3999                              preload.preload_characters_,
4000                              next_expects_preload);
4001  }
4002}
4003
4004
4005Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4006                                  Trace* trace,
4007                                  AlternativeGenerationList* alt_gens,
4008                                  PreloadState* preload,
4009                                  GreedyLoopState* greedy_loop_state,
4010                                  int text_length) {
4011  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4012  // Here we have special handling for greedy loops containing only text nodes
4013  // and other simple nodes.  These are handled by pushing the current
4014  // position on the stack and then incrementing the current position each
4015  // time around the switch.  On backtrack we decrement the current position
4016  // and check it against the pushed value.  This avoids pushing backtrack
4017  // information for each iteration of the loop, which could take up a lot of
4018  // space.
4019  DCHECK(trace->stop_node() == NULL);
4020  macro_assembler->PushCurrentPosition();
4021  Label greedy_match_failed;
4022  Trace greedy_match_trace;
4023  if (not_at_start()) greedy_match_trace.set_at_start(false);
4024  greedy_match_trace.set_backtrack(&greedy_match_failed);
4025  Label loop_label;
4026  macro_assembler->Bind(&loop_label);
4027  greedy_match_trace.set_stop_node(this);
4028  greedy_match_trace.set_loop_label(&loop_label);
4029  alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4030  macro_assembler->Bind(&greedy_match_failed);
4031
4032  Label second_choice;  // For use in greedy matches.
4033  macro_assembler->Bind(&second_choice);
4034
4035  Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4036
4037  EmitChoices(compiler,
4038              alt_gens,
4039              1,
4040              new_trace,
4041              preload);
4042
4043  macro_assembler->Bind(greedy_loop_state->label());
4044  // If we have unwound to the bottom then backtrack.
4045  macro_assembler->CheckGreedyLoop(trace->backtrack());
4046  // Otherwise try the second priority at an earlier position.
4047  macro_assembler->AdvanceCurrentPosition(-text_length);
4048  macro_assembler->GoTo(&second_choice);
4049  return new_trace;
4050}
4051
4052int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4053                                              Trace* trace) {
4054  int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4055  if (alternatives_->length() != 2) return eats_at_least;
4056
4057  GuardedAlternative alt1 = alternatives_->at(1);
4058  if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4059    return eats_at_least;
4060  }
4061  RegExpNode* eats_anything_node = alt1.node();
4062  if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4063    return eats_at_least;
4064  }
4065
4066  // Really we should be creating a new trace when we execute this function,
4067  // but there is no need, because the code it generates cannot backtrack, and
4068  // we always arrive here with a trivial trace (since it's the entry to a
4069  // loop.  That also implies that there are no preloaded characters, which is
4070  // good, because it means we won't be violating any assumptions by
4071  // overwriting those characters with new load instructions.
4072  DCHECK(trace->is_trivial());
4073
4074  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4075  // At this point we know that we are at a non-greedy loop that will eat
4076  // any character one at a time.  Any non-anchored regexp has such a
4077  // loop prepended to it in order to find where it starts.  We look for
4078  // a pattern of the form ...abc... where we can look 6 characters ahead
4079  // and step forwards 3 if the character is not one of abc.  Abc need
4080  // not be atoms, they can be any reasonably limited character class or
4081  // small alternation.
4082  BoyerMooreLookahead* bm = bm_info(false);
4083  if (bm == NULL) {
4084    eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4085                        EatsAtLeast(kMaxLookaheadForBoyerMoore,
4086                                    kRecursionBudget,
4087                                    false));
4088    if (eats_at_least >= 1) {
4089      bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4090                                           compiler,
4091                                           zone());
4092      GuardedAlternative alt0 = alternatives_->at(0);
4093      alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
4094    }
4095  }
4096  if (bm != NULL) {
4097    bm->EmitSkipInstructions(macro_assembler);
4098  }
4099  return eats_at_least;
4100}
4101
4102
4103void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4104                             AlternativeGenerationList* alt_gens,
4105                             int first_choice,
4106                             Trace* trace,
4107                             PreloadState* preload) {
4108  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4109  SetUpPreLoad(compiler, trace, preload);
4110
4111  // For now we just call all choices one after the other.  The idea ultimately
4112  // is to use the Dispatch table to try only the relevant ones.
4113  int choice_count = alternatives_->length();
4114
4115  int new_flush_budget = trace->flush_budget() / choice_count;
4116
4117  for (int i = first_choice; i < choice_count; i++) {
4118    bool is_last = i == choice_count - 1;
4119    bool fall_through_on_failure = !is_last;
4120    GuardedAlternative alternative = alternatives_->at(i);
4121    AlternativeGeneration* alt_gen = alt_gens->at(i);
4122    alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4123    ZoneList<Guard*>* guards = alternative.guards();
4124    int guard_count = (guards == NULL) ? 0 : guards->length();
4125    Trace new_trace(*trace);
4126    new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4127                                         preload->preload_characters_ :
4128                                         0);
4129    if (preload->preload_has_checked_bounds_) {
4130      new_trace.set_bound_checked_up_to(preload->preload_characters_);
4131    }
4132    new_trace.quick_check_performed()->Clear();
4133    if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4134    if (!is_last) {
4135      new_trace.set_backtrack(&alt_gen->after);
4136    }
4137    alt_gen->expects_preload = preload->preload_is_current_;
4138    bool generate_full_check_inline = false;
4139    if (FLAG_regexp_optimization &&
4140        try_to_emit_quick_check_for_alternative(i == 0) &&
4141        alternative.node()->EmitQuickCheck(compiler,
4142                                           trace,
4143                                           &new_trace,
4144                                           preload->preload_has_checked_bounds_,
4145                                           &alt_gen->possible_success,
4146                                           &alt_gen->quick_check_details,
4147                                           fall_through_on_failure)) {
4148      // Quick check was generated for this choice.
4149      preload->preload_is_current_ = true;
4150      preload->preload_has_checked_bounds_ = true;
4151      // If we generated the quick check to fall through on possible success,
4152      // we now need to generate the full check inline.
4153      if (!fall_through_on_failure) {
4154        macro_assembler->Bind(&alt_gen->possible_success);
4155        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4156        new_trace.set_characters_preloaded(preload->preload_characters_);
4157        new_trace.set_bound_checked_up_to(preload->preload_characters_);
4158        generate_full_check_inline = true;
4159      }
4160    } else if (alt_gen->quick_check_details.cannot_match()) {
4161      if (!fall_through_on_failure) {
4162        macro_assembler->GoTo(trace->backtrack());
4163      }
4164      continue;
4165    } else {
4166      // No quick check was generated.  Put the full code here.
4167      // If this is not the first choice then there could be slow checks from
4168      // previous cases that go here when they fail.  There's no reason to
4169      // insist that they preload characters since the slow check we are about
4170      // to generate probably can't use it.
4171      if (i != first_choice) {
4172        alt_gen->expects_preload = false;
4173        new_trace.InvalidateCurrentCharacter();
4174      }
4175      generate_full_check_inline = true;
4176    }
4177    if (generate_full_check_inline) {
4178      if (new_trace.actions() != NULL) {
4179        new_trace.set_flush_budget(new_flush_budget);
4180      }
4181      for (int j = 0; j < guard_count; j++) {
4182        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4183      }
4184      alternative.node()->Emit(compiler, &new_trace);
4185      preload->preload_is_current_ = false;
4186    }
4187    macro_assembler->Bind(&alt_gen->after);
4188  }
4189}
4190
4191
4192void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4193                                           Trace* trace,
4194                                           GuardedAlternative alternative,
4195                                           AlternativeGeneration* alt_gen,
4196                                           int preload_characters,
4197                                           bool next_expects_preload) {
4198  if (!alt_gen->possible_success.is_linked()) return;
4199
4200  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4201  macro_assembler->Bind(&alt_gen->possible_success);
4202  Trace out_of_line_trace(*trace);
4203  out_of_line_trace.set_characters_preloaded(preload_characters);
4204  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4205  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4206  ZoneList<Guard*>* guards = alternative.guards();
4207  int guard_count = (guards == NULL) ? 0 : guards->length();
4208  if (next_expects_preload) {
4209    Label reload_current_char;
4210    out_of_line_trace.set_backtrack(&reload_current_char);
4211    for (int j = 0; j < guard_count; j++) {
4212      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4213    }
4214    alternative.node()->Emit(compiler, &out_of_line_trace);
4215    macro_assembler->Bind(&reload_current_char);
4216    // Reload the current character, since the next quick check expects that.
4217    // We don't need to check bounds here because we only get into this
4218    // code through a quick check which already did the checked load.
4219    macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4220                                          NULL,
4221                                          false,
4222                                          preload_characters);
4223    macro_assembler->GoTo(&(alt_gen->after));
4224  } else {
4225    out_of_line_trace.set_backtrack(&(alt_gen->after));
4226    for (int j = 0; j < guard_count; j++) {
4227      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4228    }
4229    alternative.node()->Emit(compiler, &out_of_line_trace);
4230  }
4231}
4232
4233
4234void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4235  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4236  LimitResult limit_result = LimitVersions(compiler, trace);
4237  if (limit_result == DONE) return;
4238  DCHECK(limit_result == CONTINUE);
4239
4240  RecursionCheck rc(compiler);
4241
4242  switch (action_type_) {
4243    case STORE_POSITION: {
4244      Trace::DeferredCapture
4245          new_capture(data_.u_position_register.reg,
4246                      data_.u_position_register.is_capture,
4247                      trace);
4248      Trace new_trace = *trace;
4249      new_trace.add_action(&new_capture);
4250      on_success()->Emit(compiler, &new_trace);
4251      break;
4252    }
4253    case INCREMENT_REGISTER: {
4254      Trace::DeferredIncrementRegister
4255          new_increment(data_.u_increment_register.reg);
4256      Trace new_trace = *trace;
4257      new_trace.add_action(&new_increment);
4258      on_success()->Emit(compiler, &new_trace);
4259      break;
4260    }
4261    case SET_REGISTER: {
4262      Trace::DeferredSetRegister
4263          new_set(data_.u_store_register.reg, data_.u_store_register.value);
4264      Trace new_trace = *trace;
4265      new_trace.add_action(&new_set);
4266      on_success()->Emit(compiler, &new_trace);
4267      break;
4268    }
4269    case CLEAR_CAPTURES: {
4270      Trace::DeferredClearCaptures
4271        new_capture(Interval(data_.u_clear_captures.range_from,
4272                             data_.u_clear_captures.range_to));
4273      Trace new_trace = *trace;
4274      new_trace.add_action(&new_capture);
4275      on_success()->Emit(compiler, &new_trace);
4276      break;
4277    }
4278    case BEGIN_SUBMATCH:
4279      if (!trace->is_trivial()) {
4280        trace->Flush(compiler, this);
4281      } else {
4282        assembler->WriteCurrentPositionToRegister(
4283            data_.u_submatch.current_position_register, 0);
4284        assembler->WriteStackPointerToRegister(
4285            data_.u_submatch.stack_pointer_register);
4286        on_success()->Emit(compiler, trace);
4287      }
4288      break;
4289    case EMPTY_MATCH_CHECK: {
4290      int start_pos_reg = data_.u_empty_match_check.start_register;
4291      int stored_pos = 0;
4292      int rep_reg = data_.u_empty_match_check.repetition_register;
4293      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4294      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4295      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4296        // If we know we haven't advanced and there is no minimum we
4297        // can just backtrack immediately.
4298        assembler->GoTo(trace->backtrack());
4299      } else if (know_dist && stored_pos < trace->cp_offset()) {
4300        // If we know we've advanced we can generate the continuation
4301        // immediately.
4302        on_success()->Emit(compiler, trace);
4303      } else if (!trace->is_trivial()) {
4304        trace->Flush(compiler, this);
4305      } else {
4306        Label skip_empty_check;
4307        // If we have a minimum number of repetitions we check the current
4308        // number first and skip the empty check if it's not enough.
4309        if (has_minimum) {
4310          int limit = data_.u_empty_match_check.repetition_limit;
4311          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4312        }
4313        // If the match is empty we bail out, otherwise we fall through
4314        // to the on-success continuation.
4315        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4316                                   trace->backtrack());
4317        assembler->Bind(&skip_empty_check);
4318        on_success()->Emit(compiler, trace);
4319      }
4320      break;
4321    }
4322    case POSITIVE_SUBMATCH_SUCCESS: {
4323      if (!trace->is_trivial()) {
4324        trace->Flush(compiler, this);
4325        return;
4326      }
4327      assembler->ReadCurrentPositionFromRegister(
4328          data_.u_submatch.current_position_register);
4329      assembler->ReadStackPointerFromRegister(
4330          data_.u_submatch.stack_pointer_register);
4331      int clear_register_count = data_.u_submatch.clear_register_count;
4332      if (clear_register_count == 0) {
4333        on_success()->Emit(compiler, trace);
4334        return;
4335      }
4336      int clear_registers_from = data_.u_submatch.clear_register_from;
4337      Label clear_registers_backtrack;
4338      Trace new_trace = *trace;
4339      new_trace.set_backtrack(&clear_registers_backtrack);
4340      on_success()->Emit(compiler, &new_trace);
4341
4342      assembler->Bind(&clear_registers_backtrack);
4343      int clear_registers_to = clear_registers_from + clear_register_count - 1;
4344      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4345
4346      DCHECK(trace->backtrack() == NULL);
4347      assembler->Backtrack();
4348      return;
4349    }
4350    default:
4351      UNREACHABLE();
4352  }
4353}
4354
4355
4356void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4357  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4358  if (!trace->is_trivial()) {
4359    trace->Flush(compiler, this);
4360    return;
4361  }
4362
4363  LimitResult limit_result = LimitVersions(compiler, trace);
4364  if (limit_result == DONE) return;
4365  DCHECK(limit_result == CONTINUE);
4366
4367  RecursionCheck rc(compiler);
4368
4369  DCHECK_EQ(start_reg_ + 1, end_reg_);
4370  if (compiler->ignore_case()) {
4371    assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4372                                               trace->backtrack());
4373  } else {
4374    assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4375  }
4376  on_success()->Emit(compiler, trace);
4377}
4378
4379
4380// -------------------------------------------------------------------
4381// Dot/dotty output
4382
4383
4384#ifdef DEBUG
4385
4386
4387class DotPrinter: public NodeVisitor {
4388 public:
4389  DotPrinter(OStream& os, bool ignore_case)  // NOLINT
4390      : os_(os),
4391        ignore_case_(ignore_case) {}
4392  void PrintNode(const char* label, RegExpNode* node);
4393  void Visit(RegExpNode* node);
4394  void PrintAttributes(RegExpNode* from);
4395  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4396#define DECLARE_VISIT(Type)                                          \
4397  virtual void Visit##Type(Type##Node* that);
4398FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4399#undef DECLARE_VISIT
4400 private:
4401  OStream& os_;
4402  bool ignore_case_;
4403};
4404
4405
4406void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4407  os_ << "digraph G {\n  graph [label=\"";
4408  for (int i = 0; label[i]; i++) {
4409    switch (label[i]) {
4410      case '\\':
4411        os_ << "\\\\";
4412        break;
4413      case '"':
4414        os_ << "\"";
4415        break;
4416      default:
4417        os_ << label[i];
4418        break;
4419    }
4420  }
4421  os_ << "\"];\n";
4422  Visit(node);
4423  os_ << "}" << endl;
4424}
4425
4426
4427void DotPrinter::Visit(RegExpNode* node) {
4428  if (node->info()->visited) return;
4429  node->info()->visited = true;
4430  node->Accept(this);
4431}
4432
4433
4434void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4435  os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
4436  Visit(on_failure);
4437}
4438
4439
4440class TableEntryBodyPrinter {
4441 public:
4442  TableEntryBodyPrinter(OStream& os, ChoiceNode* choice)  // NOLINT
4443      : os_(os),
4444        choice_(choice) {}
4445  void Call(uc16 from, DispatchTable::Entry entry) {
4446    OutSet* out_set = entry.out_set();
4447    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4448      if (out_set->Get(i)) {
4449        os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
4450            << choice()->alternatives()->at(i).node() << ";\n";
4451      }
4452    }
4453  }
4454 private:
4455  ChoiceNode* choice() { return choice_; }
4456  OStream& os_;
4457  ChoiceNode* choice_;
4458};
4459
4460
4461class TableEntryHeaderPrinter {
4462 public:
4463  explicit TableEntryHeaderPrinter(OStream& os)  // NOLINT
4464      : first_(true),
4465        os_(os) {}
4466  void Call(uc16 from, DispatchTable::Entry entry) {
4467    if (first_) {
4468      first_ = false;
4469    } else {
4470      os_ << "|";
4471    }
4472    os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4473    OutSet* out_set = entry.out_set();
4474    int priority = 0;
4475    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4476      if (out_set->Get(i)) {
4477        if (priority > 0) os_ << "|";
4478        os_ << "<s" << from << "o" << i << "> " << priority;
4479        priority++;
4480      }
4481    }
4482    os_ << "}}";
4483  }
4484
4485 private:
4486  bool first_;
4487  OStream& os_;
4488};
4489
4490
4491class AttributePrinter {
4492 public:
4493  explicit AttributePrinter(OStream& os)  // NOLINT
4494      : os_(os),
4495        first_(true) {}
4496  void PrintSeparator() {
4497    if (first_) {
4498      first_ = false;
4499    } else {
4500      os_ << "|";
4501    }
4502  }
4503  void PrintBit(const char* name, bool value) {
4504    if (!value) return;
4505    PrintSeparator();
4506    os_ << "{" << name << "}";
4507  }
4508  void PrintPositive(const char* name, int value) {
4509    if (value < 0) return;
4510    PrintSeparator();
4511    os_ << "{" << name << "|" << value << "}";
4512  }
4513
4514 private:
4515  OStream& os_;
4516  bool first_;
4517};
4518
4519
4520void DotPrinter::PrintAttributes(RegExpNode* that) {
4521  os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4522      << "margin=0.1, fontsize=10, label=\"{";
4523  AttributePrinter printer(os_);
4524  NodeInfo* info = that->info();
4525  printer.PrintBit("NI", info->follows_newline_interest);
4526  printer.PrintBit("WI", info->follows_word_interest);
4527  printer.PrintBit("SI", info->follows_start_interest);
4528  Label* label = that->label();
4529  if (label->is_bound())
4530    printer.PrintPositive("@", label->pos());
4531  os_ << "}\"];\n"
4532      << "  a" << that << " -> n" << that
4533      << " [style=dashed, color=grey, arrowhead=none];\n";
4534}
4535
4536
4537static const bool kPrintDispatchTable = false;
4538void DotPrinter::VisitChoice(ChoiceNode* that) {
4539  if (kPrintDispatchTable) {
4540    os_ << "  n" << that << " [shape=Mrecord, label=\"";
4541    TableEntryHeaderPrinter header_printer(os_);
4542    that->GetTable(ignore_case_)->ForEach(&header_printer);
4543    os_ << "\"]\n";
4544    PrintAttributes(that);
4545    TableEntryBodyPrinter body_printer(os_, that);
4546    that->GetTable(ignore_case_)->ForEach(&body_printer);
4547  } else {
4548    os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
4549    for (int i = 0; i < that->alternatives()->length(); i++) {
4550      GuardedAlternative alt = that->alternatives()->at(i);
4551      os_ << "  n" << that << " -> n" << alt.node();
4552    }
4553  }
4554  for (int i = 0; i < that->alternatives()->length(); i++) {
4555    GuardedAlternative alt = that->alternatives()->at(i);
4556    alt.node()->Accept(this);
4557  }
4558}
4559
4560
4561void DotPrinter::VisitText(TextNode* that) {
4562  Zone* zone = that->zone();
4563  os_ << "  n" << that << " [label=\"";
4564  for (int i = 0; i < that->elements()->length(); i++) {
4565    if (i > 0) os_ << " ";
4566    TextElement elm = that->elements()->at(i);
4567    switch (elm.text_type()) {
4568      case TextElement::ATOM: {
4569        Vector<const uc16> data = elm.atom()->data();
4570        for (int i = 0; i < data.length(); i++) {
4571          os_ << static_cast<char>(data[i]);
4572        }
4573        break;
4574      }
4575      case TextElement::CHAR_CLASS: {
4576        RegExpCharacterClass* node = elm.char_class();
4577        os_ << "[";
4578        if (node->is_negated()) os_ << "^";
4579        for (int j = 0; j < node->ranges(zone)->length(); j++) {
4580          CharacterRange range = node->ranges(zone)->at(j);
4581          os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4582        }
4583        os_ << "]";
4584        break;
4585      }
4586      default:
4587        UNREACHABLE();
4588    }
4589  }
4590  os_ << "\", shape=box, peripheries=2];\n";
4591  PrintAttributes(that);
4592  os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4593  Visit(that->on_success());
4594}
4595
4596
4597void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4598  os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
4599      << that->end_register() << "\", shape=doubleoctagon];\n";
4600  PrintAttributes(that);
4601  os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4602  Visit(that->on_success());
4603}
4604
4605
4606void DotPrinter::VisitEnd(EndNode* that) {
4607  os_ << "  n" << that << " [style=bold, shape=point];\n";
4608  PrintAttributes(that);
4609}
4610
4611
4612void DotPrinter::VisitAssertion(AssertionNode* that) {
4613  os_ << "  n" << that << " [";
4614  switch (that->assertion_type()) {
4615    case AssertionNode::AT_END:
4616      os_ << "label=\"$\", shape=septagon";
4617      break;
4618    case AssertionNode::AT_START:
4619      os_ << "label=\"^\", shape=septagon";
4620      break;
4621    case AssertionNode::AT_BOUNDARY:
4622      os_ << "label=\"\\b\", shape=septagon";
4623      break;
4624    case AssertionNode::AT_NON_BOUNDARY:
4625      os_ << "label=\"\\B\", shape=septagon";
4626      break;
4627    case AssertionNode::AFTER_NEWLINE:
4628      os_ << "label=\"(?<=\\n)\", shape=septagon";
4629      break;
4630  }
4631  os_ << "];\n";
4632  PrintAttributes(that);
4633  RegExpNode* successor = that->on_success();
4634  os_ << "  n" << that << " -> n" << successor << ";\n";
4635  Visit(successor);
4636}
4637
4638
4639void DotPrinter::VisitAction(ActionNode* that) {
4640  os_ << "  n" << that << " [";
4641  switch (that->action_type_) {
4642    case ActionNode::SET_REGISTER:
4643      os_ << "label=\"$" << that->data_.u_store_register.reg
4644          << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4645      break;
4646    case ActionNode::INCREMENT_REGISTER:
4647      os_ << "label=\"$" << that->data_.u_increment_register.reg
4648          << "++\", shape=octagon";
4649      break;
4650    case ActionNode::STORE_POSITION:
4651      os_ << "label=\"$" << that->data_.u_position_register.reg
4652          << ":=$pos\", shape=octagon";
4653      break;
4654    case ActionNode::BEGIN_SUBMATCH:
4655      os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4656          << ":=$pos,begin\", shape=septagon";
4657      break;
4658    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4659      os_ << "label=\"escape\", shape=septagon";
4660      break;
4661    case ActionNode::EMPTY_MATCH_CHECK:
4662      os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4663          << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4664          << "<" << that->data_.u_empty_match_check.repetition_limit
4665          << "?\", shape=septagon";
4666      break;
4667    case ActionNode::CLEAR_CAPTURES: {
4668      os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4669          << " to $" << that->data_.u_clear_captures.range_to
4670          << "\", shape=septagon";
4671      break;
4672    }
4673  }
4674  os_ << "];\n";
4675  PrintAttributes(that);
4676  RegExpNode* successor = that->on_success();
4677  os_ << "  n" << that << " -> n" << successor << ";\n";
4678  Visit(successor);
4679}
4680
4681
4682class DispatchTableDumper {
4683 public:
4684  explicit DispatchTableDumper(OStream& os) : os_(os) {}
4685  void Call(uc16 key, DispatchTable::Entry entry);
4686 private:
4687  OStream& os_;
4688};
4689
4690
4691void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4692  os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4693  OutSet* set = entry.out_set();
4694  bool first = true;
4695  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4696    if (set->Get(i)) {
4697      if (first) {
4698        first = false;
4699      } else {
4700        os_ << ", ";
4701      }
4702      os_ << i;
4703    }
4704  }
4705  os_ << "}\n";
4706}
4707
4708
4709void DispatchTable::Dump() {
4710  OFStream os(stderr);
4711  DispatchTableDumper dumper(os);
4712  tree()->ForEach(&dumper);
4713}
4714
4715
4716void RegExpEngine::DotPrint(const char* label,
4717                            RegExpNode* node,
4718                            bool ignore_case) {
4719  OFStream os(stdout);
4720  DotPrinter printer(os, ignore_case);
4721  printer.PrintNode(label, node);
4722}
4723
4724
4725#endif  // DEBUG
4726
4727
4728// -------------------------------------------------------------------
4729// Tree to graph conversion
4730
4731RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4732                               RegExpNode* on_success) {
4733  ZoneList<TextElement>* elms =
4734      new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4735  elms->Add(TextElement::Atom(this), compiler->zone());
4736  return new(compiler->zone()) TextNode(elms, on_success);
4737}
4738
4739
4740RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4741                               RegExpNode* on_success) {
4742  return new(compiler->zone()) TextNode(elements(), on_success);
4743}
4744
4745
4746static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4747                                 const int* special_class,
4748                                 int length) {
4749  length--;  // Remove final 0x10000.
4750  DCHECK(special_class[length] == 0x10000);
4751  DCHECK(ranges->length() != 0);
4752  DCHECK(length != 0);
4753  DCHECK(special_class[0] != 0);
4754  if (ranges->length() != (length >> 1) + 1) {
4755    return false;
4756  }
4757  CharacterRange range = ranges->at(0);
4758  if (range.from() != 0) {
4759    return false;
4760  }
4761  for (int i = 0; i < length; i += 2) {
4762    if (special_class[i] != (range.to() + 1)) {
4763      return false;
4764    }
4765    range = ranges->at((i >> 1) + 1);
4766    if (special_class[i+1] != range.from()) {
4767      return false;
4768    }
4769  }
4770  if (range.to() != 0xffff) {
4771    return false;
4772  }
4773  return true;
4774}
4775
4776
4777static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4778                          const int* special_class,
4779                          int length) {
4780  length--;  // Remove final 0x10000.
4781  DCHECK(special_class[length] == 0x10000);
4782  if (ranges->length() * 2 != length) {
4783    return false;
4784  }
4785  for (int i = 0; i < length; i += 2) {
4786    CharacterRange range = ranges->at(i >> 1);
4787    if (range.from() != special_class[i] ||
4788        range.to() != special_class[i + 1] - 1) {
4789      return false;
4790    }
4791  }
4792  return true;
4793}
4794
4795
4796bool RegExpCharacterClass::is_standard(Zone* zone) {
4797  // TODO(lrn): Remove need for this function, by not throwing away information
4798  // along the way.
4799  if (is_negated_) {
4800    return false;
4801  }
4802  if (set_.is_standard()) {
4803    return true;
4804  }
4805  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4806    set_.set_standard_set_type('s');
4807    return true;
4808  }
4809  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4810    set_.set_standard_set_type('S');
4811    return true;
4812  }
4813  if (CompareInverseRanges(set_.ranges(zone),
4814                           kLineTerminatorRanges,
4815                           kLineTerminatorRangeCount)) {
4816    set_.set_standard_set_type('.');
4817    return true;
4818  }
4819  if (CompareRanges(set_.ranges(zone),
4820                    kLineTerminatorRanges,
4821                    kLineTerminatorRangeCount)) {
4822    set_.set_standard_set_type('n');
4823    return true;
4824  }
4825  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4826    set_.set_standard_set_type('w');
4827    return true;
4828  }
4829  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4830    set_.set_standard_set_type('W');
4831    return true;
4832  }
4833  return false;
4834}
4835
4836
4837RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4838                                         RegExpNode* on_success) {
4839  return new(compiler->zone()) TextNode(this, on_success);
4840}
4841
4842
4843RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4844                                      RegExpNode* on_success) {
4845  ZoneList<RegExpTree*>* alternatives = this->alternatives();
4846  int length = alternatives->length();
4847  ChoiceNode* result =
4848      new(compiler->zone()) ChoiceNode(length, compiler->zone());
4849  for (int i = 0; i < length; i++) {
4850    GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4851                                                               on_success));
4852    result->AddAlternative(alternative);
4853  }
4854  return result;
4855}
4856
4857
4858RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4859                                     RegExpNode* on_success) {
4860  return ToNode(min(),
4861                max(),
4862                is_greedy(),
4863                body(),
4864                compiler,
4865                on_success);
4866}
4867
4868
4869// Scoped object to keep track of how much we unroll quantifier loops in the
4870// regexp graph generator.
4871class RegExpExpansionLimiter {
4872 public:
4873  static const int kMaxExpansionFactor = 6;
4874  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4875      : compiler_(compiler),
4876        saved_expansion_factor_(compiler->current_expansion_factor()),
4877        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4878    DCHECK(factor > 0);
4879    if (ok_to_expand_) {
4880      if (factor > kMaxExpansionFactor) {
4881        // Avoid integer overflow of the current expansion factor.
4882        ok_to_expand_ = false;
4883        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4884      } else {
4885        int new_factor = saved_expansion_factor_ * factor;
4886        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4887        compiler->set_current_expansion_factor(new_factor);
4888      }
4889    }
4890  }
4891
4892  ~RegExpExpansionLimiter() {
4893    compiler_->set_current_expansion_factor(saved_expansion_factor_);
4894  }
4895
4896  bool ok_to_expand() { return ok_to_expand_; }
4897
4898 private:
4899  RegExpCompiler* compiler_;
4900  int saved_expansion_factor_;
4901  bool ok_to_expand_;
4902
4903  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4904};
4905
4906
4907RegExpNode* RegExpQuantifier::ToNode(int min,
4908                                     int max,
4909                                     bool is_greedy,
4910                                     RegExpTree* body,
4911                                     RegExpCompiler* compiler,
4912                                     RegExpNode* on_success,
4913                                     bool not_at_start) {
4914  // x{f, t} becomes this:
4915  //
4916  //             (r++)<-.
4917  //               |     `
4918  //               |     (x)
4919  //               v     ^
4920  //      (r=0)-->(?)---/ [if r < t]
4921  //               |
4922  //   [if r >= f] \----> ...
4923  //
4924
4925  // 15.10.2.5 RepeatMatcher algorithm.
4926  // The parser has already eliminated the case where max is 0.  In the case
4927  // where max_match is zero the parser has removed the quantifier if min was
4928  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
4929
4930  // If we know that we cannot match zero length then things are a little
4931  // simpler since we don't need to make the special zero length match check
4932  // from step 2.1.  If the min and max are small we can unroll a little in
4933  // this case.
4934  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
4935  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
4936  if (max == 0) return on_success;  // This can happen due to recursion.
4937  bool body_can_be_empty = (body->min_match() == 0);
4938  int body_start_reg = RegExpCompiler::kNoRegister;
4939  Interval capture_registers = body->CaptureRegisters();
4940  bool needs_capture_clearing = !capture_registers.is_empty();
4941  Zone* zone = compiler->zone();
4942
4943  if (body_can_be_empty) {
4944    body_start_reg = compiler->AllocateRegister();
4945  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4946    // Only unroll if there are no captures and the body can't be
4947    // empty.
4948    {
4949      RegExpExpansionLimiter limiter(
4950          compiler, min + ((max != min) ? 1 : 0));
4951      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4952        int new_max = (max == kInfinity) ? max : max - min;
4953        // Recurse once to get the loop or optional matches after the fixed
4954        // ones.
4955        RegExpNode* answer = ToNode(
4956            0, new_max, is_greedy, body, compiler, on_success, true);
4957        // Unroll the forced matches from 0 to min.  This can cause chains of
4958        // TextNodes (which the parser does not generate).  These should be
4959        // combined if it turns out they hinder good code generation.
4960        for (int i = 0; i < min; i++) {
4961          answer = body->ToNode(compiler, answer);
4962        }
4963        return answer;
4964      }
4965    }
4966    if (max <= kMaxUnrolledMaxMatches && min == 0) {
4967      DCHECK(max > 0);  // Due to the 'if' above.
4968      RegExpExpansionLimiter limiter(compiler, max);
4969      if (limiter.ok_to_expand()) {
4970        // Unroll the optional matches up to max.
4971        RegExpNode* answer = on_success;
4972        for (int i = 0; i < max; i++) {
4973          ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4974          if (is_greedy) {
4975            alternation->AddAlternative(
4976                GuardedAlternative(body->ToNode(compiler, answer)));
4977            alternation->AddAlternative(GuardedAlternative(on_success));
4978          } else {
4979            alternation->AddAlternative(GuardedAlternative(on_success));
4980            alternation->AddAlternative(
4981                GuardedAlternative(body->ToNode(compiler, answer)));
4982          }
4983          answer = alternation;
4984          if (not_at_start) alternation->set_not_at_start();
4985        }
4986        return answer;
4987      }
4988    }
4989  }
4990  bool has_min = min > 0;
4991  bool has_max = max < RegExpTree::kInfinity;
4992  bool needs_counter = has_min || has_max;
4993  int reg_ctr = needs_counter
4994      ? compiler->AllocateRegister()
4995      : RegExpCompiler::kNoRegister;
4996  LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4997                                                    zone);
4998  if (not_at_start) center->set_not_at_start();
4999  RegExpNode* loop_return = needs_counter
5000      ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5001      : static_cast<RegExpNode*>(center);
5002  if (body_can_be_empty) {
5003    // If the body can be empty we need to check if it was and then
5004    // backtrack.
5005    loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5006                                              reg_ctr,
5007                                              min,
5008                                              loop_return);
5009  }
5010  RegExpNode* body_node = body->ToNode(compiler, loop_return);
5011  if (body_can_be_empty) {
5012    // If the body can be empty we need to store the start position
5013    // so we can bail out if it was empty.
5014    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5015  }
5016  if (needs_capture_clearing) {
5017    // Before entering the body of this loop we need to clear captures.
5018    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5019  }
5020  GuardedAlternative body_alt(body_node);
5021  if (has_max) {
5022    Guard* body_guard =
5023        new(zone) Guard(reg_ctr, Guard::LT, max);
5024    body_alt.AddGuard(body_guard, zone);
5025  }
5026  GuardedAlternative rest_alt(on_success);
5027  if (has_min) {
5028    Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5029    rest_alt.AddGuard(rest_guard, zone);
5030  }
5031  if (is_greedy) {
5032    center->AddLoopAlternative(body_alt);
5033    center->AddContinueAlternative(rest_alt);
5034  } else {
5035    center->AddContinueAlternative(rest_alt);
5036    center->AddLoopAlternative(body_alt);
5037  }
5038  if (needs_counter) {
5039    return ActionNode::SetRegister(reg_ctr, 0, center);
5040  } else {
5041    return center;
5042  }
5043}
5044
5045
5046RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5047                                    RegExpNode* on_success) {
5048  NodeInfo info;
5049  Zone* zone = compiler->zone();
5050
5051  switch (assertion_type()) {
5052    case START_OF_LINE:
5053      return AssertionNode::AfterNewline(on_success);
5054    case START_OF_INPUT:
5055      return AssertionNode::AtStart(on_success);
5056    case BOUNDARY:
5057      return AssertionNode::AtBoundary(on_success);
5058    case NON_BOUNDARY:
5059      return AssertionNode::AtNonBoundary(on_success);
5060    case END_OF_INPUT:
5061      return AssertionNode::AtEnd(on_success);
5062    case END_OF_LINE: {
5063      // Compile $ in multiline regexps as an alternation with a positive
5064      // lookahead in one side and an end-of-input on the other side.
5065      // We need two registers for the lookahead.
5066      int stack_pointer_register = compiler->AllocateRegister();
5067      int position_register = compiler->AllocateRegister();
5068      // The ChoiceNode to distinguish between a newline and end-of-input.
5069      ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5070      // Create a newline atom.
5071      ZoneList<CharacterRange>* newline_ranges =
5072          new(zone) ZoneList<CharacterRange>(3, zone);
5073      CharacterRange::AddClassEscape('n', newline_ranges, zone);
5074      RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5075      TextNode* newline_matcher = new(zone) TextNode(
5076         newline_atom,
5077         ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5078                                             position_register,
5079                                             0,  // No captures inside.
5080                                             -1,  // Ignored if no captures.
5081                                             on_success));
5082      // Create an end-of-input matcher.
5083      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5084          stack_pointer_register,
5085          position_register,
5086          newline_matcher);
5087      // Add the two alternatives to the ChoiceNode.
5088      GuardedAlternative eol_alternative(end_of_line);
5089      result->AddAlternative(eol_alternative);
5090      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5091      result->AddAlternative(end_alternative);
5092      return result;
5093    }
5094    default:
5095      UNREACHABLE();
5096  }
5097  return on_success;
5098}
5099
5100
5101RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5102                                        RegExpNode* on_success) {
5103  return new(compiler->zone())
5104      BackReferenceNode(RegExpCapture::StartRegister(index()),
5105                        RegExpCapture::EndRegister(index()),
5106                        on_success);
5107}
5108
5109
5110RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5111                                RegExpNode* on_success) {
5112  return on_success;
5113}
5114
5115
5116RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5117                                    RegExpNode* on_success) {
5118  int stack_pointer_register = compiler->AllocateRegister();
5119  int position_register = compiler->AllocateRegister();
5120
5121  const int registers_per_capture = 2;
5122  const int register_of_first_capture = 2;
5123  int register_count = capture_count_ * registers_per_capture;
5124  int register_start =
5125    register_of_first_capture + capture_from_ * registers_per_capture;
5126
5127  RegExpNode* success;
5128  if (is_positive()) {
5129    RegExpNode* node = ActionNode::BeginSubmatch(
5130        stack_pointer_register,
5131        position_register,
5132        body()->ToNode(
5133            compiler,
5134            ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5135                                                position_register,
5136                                                register_count,
5137                                                register_start,
5138                                                on_success)));
5139    return node;
5140  } else {
5141    // We use a ChoiceNode for a negative lookahead because it has most of
5142    // the characteristics we need.  It has the body of the lookahead as its
5143    // first alternative and the expression after the lookahead of the second
5144    // alternative.  If the first alternative succeeds then the
5145    // NegativeSubmatchSuccess will unwind the stack including everything the
5146    // choice node set up and backtrack.  If the first alternative fails then
5147    // the second alternative is tried, which is exactly the desired result
5148    // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
5149    // ChoiceNode that knows to ignore the first exit when calculating quick
5150    // checks.
5151    Zone* zone = compiler->zone();
5152
5153    GuardedAlternative body_alt(
5154        body()->ToNode(
5155            compiler,
5156            success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5157                                                        position_register,
5158                                                        register_count,
5159                                                        register_start,
5160                                                        zone)));
5161    ChoiceNode* choice_node =
5162        new(zone) NegativeLookaheadChoiceNode(body_alt,
5163                                              GuardedAlternative(on_success),
5164                                              zone);
5165    return ActionNode::BeginSubmatch(stack_pointer_register,
5166                                     position_register,
5167                                     choice_node);
5168  }
5169}
5170
5171
5172RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5173                                  RegExpNode* on_success) {
5174  return ToNode(body(), index(), compiler, on_success);
5175}
5176
5177
5178RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5179                                  int index,
5180                                  RegExpCompiler* compiler,
5181                                  RegExpNode* on_success) {
5182  int start_reg = RegExpCapture::StartRegister(index);
5183  int end_reg = RegExpCapture::EndRegister(index);
5184  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5185  RegExpNode* body_node = body->ToNode(compiler, store_end);
5186  return ActionNode::StorePosition(start_reg, true, body_node);
5187}
5188
5189
5190RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5191                                      RegExpNode* on_success) {
5192  ZoneList<RegExpTree*>* children = nodes();
5193  RegExpNode* current = on_success;
5194  for (int i = children->length() - 1; i >= 0; i--) {
5195    current = children->at(i)->ToNode(compiler, current);
5196  }
5197  return current;
5198}
5199
5200
5201static void AddClass(const int* elmv,
5202                     int elmc,
5203                     ZoneList<CharacterRange>* ranges,
5204                     Zone* zone) {
5205  elmc--;
5206  DCHECK(elmv[elmc] == 0x10000);
5207  for (int i = 0; i < elmc; i += 2) {
5208    DCHECK(elmv[i] < elmv[i + 1]);
5209    ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5210  }
5211}
5212
5213
5214static void AddClassNegated(const int *elmv,
5215                            int elmc,
5216                            ZoneList<CharacterRange>* ranges,
5217                            Zone* zone) {
5218  elmc--;
5219  DCHECK(elmv[elmc] == 0x10000);
5220  DCHECK(elmv[0] != 0x0000);
5221  DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5222  uc16 last = 0x0000;
5223  for (int i = 0; i < elmc; i += 2) {
5224    DCHECK(last <= elmv[i] - 1);
5225    DCHECK(elmv[i] < elmv[i + 1]);
5226    ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5227    last = elmv[i + 1];
5228  }
5229  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5230}
5231
5232
5233void CharacterRange::AddClassEscape(uc16 type,
5234                                    ZoneList<CharacterRange>* ranges,
5235                                    Zone* zone) {
5236  switch (type) {
5237    case 's':
5238      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5239      break;
5240    case 'S':
5241      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5242      break;
5243    case 'w':
5244      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5245      break;
5246    case 'W':
5247      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5248      break;
5249    case 'd':
5250      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5251      break;
5252    case 'D':
5253      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5254      break;
5255    case '.':
5256      AddClassNegated(kLineTerminatorRanges,
5257                      kLineTerminatorRangeCount,
5258                      ranges,
5259                      zone);
5260      break;
5261    // This is not a character range as defined by the spec but a
5262    // convenient shorthand for a character class that matches any
5263    // character.
5264    case '*':
5265      ranges->Add(CharacterRange::Everything(), zone);
5266      break;
5267    // This is the set of characters matched by the $ and ^ symbols
5268    // in multiline mode.
5269    case 'n':
5270      AddClass(kLineTerminatorRanges,
5271               kLineTerminatorRangeCount,
5272               ranges,
5273               zone);
5274      break;
5275    default:
5276      UNREACHABLE();
5277  }
5278}
5279
5280
5281Vector<const int> CharacterRange::GetWordBounds() {
5282  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5283}
5284
5285
5286class CharacterRangeSplitter {
5287 public:
5288  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5289                         ZoneList<CharacterRange>** excluded,
5290                         Zone* zone)
5291      : included_(included),
5292        excluded_(excluded),
5293        zone_(zone) { }
5294  void Call(uc16 from, DispatchTable::Entry entry);
5295
5296  static const int kInBase = 0;
5297  static const int kInOverlay = 1;
5298
5299 private:
5300  ZoneList<CharacterRange>** included_;
5301  ZoneList<CharacterRange>** excluded_;
5302  Zone* zone_;
5303};
5304
5305
5306void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5307  if (!entry.out_set()->Get(kInBase)) return;
5308  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5309    ? included_
5310    : excluded_;
5311  if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5312  (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5313}
5314
5315
5316void CharacterRange::Split(ZoneList<CharacterRange>* base,
5317                           Vector<const int> overlay,
5318                           ZoneList<CharacterRange>** included,
5319                           ZoneList<CharacterRange>** excluded,
5320                           Zone* zone) {
5321  DCHECK_EQ(NULL, *included);
5322  DCHECK_EQ(NULL, *excluded);
5323  DispatchTable table(zone);
5324  for (int i = 0; i < base->length(); i++)
5325    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5326  for (int i = 0; i < overlay.length(); i += 2) {
5327    table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5328                   CharacterRangeSplitter::kInOverlay, zone);
5329  }
5330  CharacterRangeSplitter callback(included, excluded, zone);
5331  table.ForEach(&callback);
5332}
5333
5334
5335void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
5336                                        bool is_one_byte, Zone* zone) {
5337  Isolate* isolate = zone->isolate();
5338  uc16 bottom = from();
5339  uc16 top = to();
5340  if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5341    if (bottom > String::kMaxOneByteCharCode) return;
5342    if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5343  }
5344  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5345  if (top == bottom) {
5346    // If this is a singleton we just expand the one character.
5347    int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5348    for (int i = 0; i < length; i++) {
5349      uc32 chr = chars[i];
5350      if (chr != bottom) {
5351        ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5352      }
5353    }
5354  } else {
5355    // If this is a range we expand the characters block by block,
5356    // expanding contiguous subranges (blocks) one at a time.
5357    // The approach is as follows.  For a given start character we
5358    // look up the remainder of the block that contains it (represented
5359    // by the end point), for instance we find 'z' if the character
5360    // is 'c'.  A block is characterized by the property
5361    // that all characters uncanonicalize in the same way, except that
5362    // each entry in the result is incremented by the distance from the first
5363    // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5364    // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5365    // Once we've found the end point we look up its uncanonicalization
5366    // and produce a range for each element.  For instance for [c-f]
5367    // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
5368    // add a range if it is not already contained in the input, so [c-f]
5369    // will be skipped but [C-F] will be added.  If this range is not
5370    // completely contained in a block we do this for all the blocks
5371    // covered by the range (handling characters that is not in a block
5372    // as a "singleton block").
5373    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5374    int pos = bottom;
5375    while (pos <= top) {
5376      int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5377      uc16 block_end;
5378      if (length == 0) {
5379        block_end = pos;
5380      } else {
5381        DCHECK_EQ(1, length);
5382        block_end = range[0];
5383      }
5384      int end = (block_end > top) ? top : block_end;
5385      length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5386      for (int i = 0; i < length; i++) {
5387        uc32 c = range[i];
5388        uc16 range_from = c - (block_end - pos);
5389        uc16 range_to = c - (block_end - end);
5390        if (!(bottom <= range_from && range_to <= top)) {
5391          ranges->Add(CharacterRange(range_from, range_to), zone);
5392        }
5393      }
5394      pos = end + 1;
5395    }
5396  }
5397}
5398
5399
5400bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5401  DCHECK_NOT_NULL(ranges);
5402  int n = ranges->length();
5403  if (n <= 1) return true;
5404  int max = ranges->at(0).to();
5405  for (int i = 1; i < n; i++) {
5406    CharacterRange next_range = ranges->at(i);
5407    if (next_range.from() <= max + 1) return false;
5408    max = next_range.to();
5409  }
5410  return true;
5411}
5412
5413
5414