1// Copyright 2012 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "compiler.h"
32#include "execution.h"
33#include "factory.h"
34#include "jsregexp.h"
35#include "jsregexp-inl.h"
36#include "platform.h"
37#include "string-search.h"
38#include "runtime.h"
39#include "compilation-cache.h"
40#include "string-stream.h"
41#include "parser.h"
42#include "regexp-macro-assembler.h"
43#include "regexp-macro-assembler-tracer.h"
44#include "regexp-macro-assembler-irregexp.h"
45#include "regexp-stack.h"
46
47#ifndef V8_INTERPRETED_REGEXP
48#if V8_TARGET_ARCH_IA32
49#include "ia32/regexp-macro-assembler-ia32.h"
50#elif V8_TARGET_ARCH_X64
51#include "x64/regexp-macro-assembler-x64.h"
52#elif V8_TARGET_ARCH_ARM
53#include "arm/regexp-macro-assembler-arm.h"
54#elif V8_TARGET_ARCH_MIPS
55#include "mips/regexp-macro-assembler-mips.h"
56#else
57#error Unsupported target architecture.
58#endif
59#endif
60
61#include "interpreter-irregexp.h"
62
63
64namespace v8 {
65namespace internal {
66
67Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
68                                               Handle<String> pattern,
69                                               Handle<String> flags,
70                                               bool* has_pending_exception) {
71  // Call the construct code with 2 arguments.
72  Handle<Object> argv[] = { pattern, flags };
73  return Execution::New(constructor, ARRAY_SIZE(argv), argv,
74                        has_pending_exception);
75}
76
77
78static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
79  int flags = JSRegExp::NONE;
80  for (int i = 0; i < str->length(); i++) {
81    switch (str->Get(i)) {
82      case 'i':
83        flags |= JSRegExp::IGNORE_CASE;
84        break;
85      case 'g':
86        flags |= JSRegExp::GLOBAL;
87        break;
88      case 'm':
89        flags |= JSRegExp::MULTILINE;
90        break;
91    }
92  }
93  return JSRegExp::Flags(flags);
94}
95
96
97static inline void ThrowRegExpException(Handle<JSRegExp> re,
98                                        Handle<String> pattern,
99                                        Handle<String> error_text,
100                                        const char* message) {
101  Isolate* isolate = re->GetIsolate();
102  Factory* factory = isolate->factory();
103  Handle<FixedArray> elements = factory->NewFixedArray(2);
104  elements->set(0, *pattern);
105  elements->set(1, *error_text);
106  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
107  Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
108  isolate->Throw(*regexp_err);
109}
110
111
112ContainedInLattice AddRange(ContainedInLattice containment,
113                            const int* ranges,
114                            int ranges_length,
115                            Interval new_range) {
116  ASSERT((ranges_length & 1) == 1);
117  ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
118  if (containment == kLatticeUnknown) return containment;
119  bool inside = false;
120  int last = 0;
121  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
122    // Consider the range from last to ranges[i].
123    // We haven't got to the new range yet.
124    if (ranges[i] <= new_range.from()) continue;
125    // New range is wholly inside last-ranges[i].  Note that new_range.to() is
126    // inclusive, but the values in ranges are not.
127    if (last <= new_range.from() && new_range.to() < ranges[i]) {
128      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
129    }
130    return kLatticeUnknown;
131  }
132  return containment;
133}
134
135
136// More makes code generation slower, less makes V8 benchmark score lower.
137const int kMaxLookaheadForBoyerMoore = 8;
138// In a 3-character pattern you can maximally step forwards 3 characters
139// at a time, which is not always enough to pay for the extra logic.
140const int kPatternTooShortForBoyerMoore = 2;
141
142
143// Identifies the sort of regexps where the regexp engine is faster
144// than the code used for atom matches.
145static bool HasFewDifferentCharacters(Handle<String> pattern) {
146  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
147  if (length <= kPatternTooShortForBoyerMoore) return false;
148  const int kMod = 128;
149  bool character_found[kMod];
150  int different = 0;
151  memset(&character_found[0], 0, sizeof(character_found));
152  for (int i = 0; i < length; i++) {
153    int ch = (pattern->Get(i) & (kMod - 1));
154    if (!character_found[ch]) {
155      character_found[ch] = true;
156      different++;
157      // We declare a regexp low-alphabet if it has at least 3 times as many
158      // characters as it has different characters.
159      if (different * 3 > length) return false;
160    }
161  }
162  return true;
163}
164
165
166// Generic RegExp methods. Dispatches to implementation specific methods.
167
168
169Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
170                                   Handle<String> pattern,
171                                   Handle<String> flag_str) {
172  Isolate* isolate = re->GetIsolate();
173  Zone zone(isolate);
174  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
175  CompilationCache* compilation_cache = isolate->compilation_cache();
176  Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
177  bool in_cache = !cached.is_null();
178  LOG(isolate, RegExpCompileEvent(re, in_cache));
179
180  Handle<Object> result;
181  if (in_cache) {
182    re->set_data(*cached);
183    return re;
184  }
185  pattern = FlattenGetString(pattern);
186  PostponeInterruptsScope postpone(isolate);
187  RegExpCompileData parse_result;
188  FlatStringReader reader(isolate, pattern);
189  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
190                                 &parse_result, &zone)) {
191    // Throw an exception if we fail to parse the pattern.
192    ThrowRegExpException(re,
193                         pattern,
194                         parse_result.error,
195                         "malformed_regexp");
196    return Handle<Object>::null();
197  }
198
199  bool has_been_compiled = false;
200
201  if (parse_result.simple &&
202      !flags.is_ignore_case() &&
203      !HasFewDifferentCharacters(pattern)) {
204    // Parse-tree is a single atom that is equal to the pattern.
205    AtomCompile(re, pattern, flags, pattern);
206    has_been_compiled = true;
207  } else if (parse_result.tree->IsAtom() &&
208      !flags.is_ignore_case() &&
209      parse_result.capture_count == 0) {
210    RegExpAtom* atom = parse_result.tree->AsAtom();
211    Vector<const uc16> atom_pattern = atom->data();
212    Handle<String> atom_string =
213        isolate->factory()->NewStringFromTwoByte(atom_pattern);
214    if (!HasFewDifferentCharacters(atom_string)) {
215      AtomCompile(re, pattern, flags, atom_string);
216      has_been_compiled = true;
217    }
218  }
219  if (!has_been_compiled) {
220    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
221  }
222  ASSERT(re->data()->IsFixedArray());
223  // Compilation succeeded so the data is set on the regexp
224  // and we can store it in the cache.
225  Handle<FixedArray> data(FixedArray::cast(re->data()));
226  compilation_cache->PutRegExp(pattern, flags, data);
227
228  return re;
229}
230
231
232Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
233                                Handle<String> subject,
234                                int index,
235                                Handle<JSArray> last_match_info) {
236  switch (regexp->TypeTag()) {
237    case JSRegExp::ATOM:
238      return AtomExec(regexp, subject, index, last_match_info);
239    case JSRegExp::IRREGEXP: {
240      Handle<Object> result =
241          IrregexpExec(regexp, subject, index, last_match_info);
242      ASSERT(!result.is_null() ||
243             regexp->GetIsolate()->has_pending_exception());
244      return result;
245    }
246    default:
247      UNREACHABLE();
248      return Handle<Object>::null();
249  }
250}
251
252
253// RegExp Atom implementation: Simple string search using indexOf.
254
255
256void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
257                             Handle<String> pattern,
258                             JSRegExp::Flags flags,
259                             Handle<String> match_pattern) {
260  re->GetIsolate()->factory()->SetRegExpAtomData(re,
261                                                 JSRegExp::ATOM,
262                                                 pattern,
263                                                 flags,
264                                                 match_pattern);
265}
266
267
268static void SetAtomLastCapture(FixedArray* array,
269                               String* subject,
270                               int from,
271                               int to) {
272  SealHandleScope shs(array->GetIsolate());
273  RegExpImpl::SetLastCaptureCount(array, 2);
274  RegExpImpl::SetLastSubject(array, subject);
275  RegExpImpl::SetLastInput(array, subject);
276  RegExpImpl::SetCapture(array, 0, from);
277  RegExpImpl::SetCapture(array, 1, to);
278}
279
280
281int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
282                            Handle<String> subject,
283                            int index,
284                            int32_t* output,
285                            int output_size) {
286  Isolate* isolate = regexp->GetIsolate();
287
288  ASSERT(0 <= index);
289  ASSERT(index <= subject->length());
290
291  if (!subject->IsFlat()) FlattenString(subject);
292  DisallowHeapAllocation no_gc;  // ensure vectors stay valid
293
294  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
295  int needle_len = needle->length();
296  ASSERT(needle->IsFlat());
297  ASSERT_LT(0, needle_len);
298
299  if (index + needle_len > subject->length()) {
300    return RegExpImpl::RE_FAILURE;
301  }
302
303  for (int i = 0; i < output_size; i += 2) {
304    String::FlatContent needle_content = needle->GetFlatContent();
305    String::FlatContent subject_content = subject->GetFlatContent();
306    ASSERT(needle_content.IsFlat());
307    ASSERT(subject_content.IsFlat());
308    // dispatch on type of strings
309    index = (needle_content.IsAscii()
310             ? (subject_content.IsAscii()
311                ? SearchString(isolate,
312                               subject_content.ToOneByteVector(),
313                               needle_content.ToOneByteVector(),
314                               index)
315                : SearchString(isolate,
316                               subject_content.ToUC16Vector(),
317                               needle_content.ToOneByteVector(),
318                               index))
319             : (subject_content.IsAscii()
320                ? SearchString(isolate,
321                               subject_content.ToOneByteVector(),
322                               needle_content.ToUC16Vector(),
323                               index)
324                : SearchString(isolate,
325                               subject_content.ToUC16Vector(),
326                               needle_content.ToUC16Vector(),
327                               index)));
328    if (index == -1) {
329      return i / 2;  // Return number of matches.
330    } else {
331      output[i] = index;
332      output[i+1] = index + needle_len;
333      index += needle_len;
334    }
335  }
336  return output_size / 2;
337}
338
339
340Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
341                                    Handle<String> subject,
342                                    int index,
343                                    Handle<JSArray> last_match_info) {
344  Isolate* isolate = re->GetIsolate();
345
346  static const int kNumRegisters = 2;
347  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
348  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
349
350  int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
351
352  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
353
354  ASSERT_EQ(res, RegExpImpl::RE_SUCCESS);
355  SealHandleScope shs(isolate);
356  FixedArray* array = FixedArray::cast(last_match_info->elements());
357  SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
358  return last_match_info;
359}
360
361
362// Irregexp implementation.
363
364// Ensures that the regexp object contains a compiled version of the
365// source for either ASCII or non-ASCII strings.
366// If the compiled version doesn't already exist, it is compiled
367// from the source pattern.
368// If compilation fails, an exception is thrown and this function
369// returns false.
370bool RegExpImpl::EnsureCompiledIrregexp(
371    Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
372  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
373#ifdef V8_INTERPRETED_REGEXP
374  if (compiled_code->IsByteArray()) return true;
375#else  // V8_INTERPRETED_REGEXP (RegExp native code)
376  if (compiled_code->IsCode()) return true;
377#endif
378  // We could potentially have marked this as flushable, but have kept
379  // a saved version if we did not flush it yet.
380  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
381  if (saved_code->IsCode()) {
382    // Reinstate the code in the original place.
383    re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
384    ASSERT(compiled_code->IsSmi());
385    return true;
386  }
387  return CompileIrregexp(re, sample_subject, is_ascii);
388}
389
390
391static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
392                                            bool is_ascii,
393                                            Handle<String> error_message,
394                                            Isolate* isolate) {
395  Factory* factory = isolate->factory();
396  Handle<FixedArray> elements = factory->NewFixedArray(2);
397  elements->set(0, re->Pattern());
398  elements->set(1, *error_message);
399  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
400  Handle<Object> regexp_err =
401      factory->NewSyntaxError("malformed_regexp", array);
402  isolate->Throw(*regexp_err);
403  return false;
404}
405
406
407bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
408                                 Handle<String> sample_subject,
409                                 bool is_ascii) {
410  // Compile the RegExp.
411  Isolate* isolate = re->GetIsolate();
412  Zone zone(isolate);
413  PostponeInterruptsScope postpone(isolate);
414  // If we had a compilation error the last time this is saved at the
415  // saved code index.
416  Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
417  // When arriving here entry can only be a smi, either representing an
418  // uncompiled regexp, a previous compilation error, or code that has
419  // been flushed.
420  ASSERT(entry->IsSmi());
421  int entry_value = Smi::cast(entry)->value();
422  ASSERT(entry_value == JSRegExp::kUninitializedValue ||
423         entry_value == JSRegExp::kCompilationErrorValue ||
424         (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
425
426  if (entry_value == JSRegExp::kCompilationErrorValue) {
427    // A previous compilation failed and threw an error which we store in
428    // the saved code index (we store the error message, not the actual
429    // error). Recreate the error object and throw it.
430    Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
431    ASSERT(error_string->IsString());
432    Handle<String> error_message(String::cast(error_string));
433    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
434    return false;
435  }
436
437  JSRegExp::Flags flags = re->GetFlags();
438
439  Handle<String> pattern(re->Pattern());
440  if (!pattern->IsFlat()) FlattenString(pattern);
441  RegExpCompileData compile_data;
442  FlatStringReader reader(isolate, pattern);
443  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
444                                 &compile_data,
445                                 &zone)) {
446    // Throw an exception if we fail to parse the pattern.
447    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
448    ThrowRegExpException(re,
449                         pattern,
450                         compile_data.error,
451                         "malformed_regexp");
452    return false;
453  }
454  RegExpEngine::CompilationResult result =
455      RegExpEngine::Compile(&compile_data,
456                            flags.is_ignore_case(),
457                            flags.is_global(),
458                            flags.is_multiline(),
459                            pattern,
460                            sample_subject,
461                            is_ascii,
462                            &zone);
463  if (result.error_message != NULL) {
464    // Unable to compile regexp.
465    Handle<String> error_message =
466        isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
467    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
468    return false;
469  }
470
471  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
472  data->set(JSRegExp::code_index(is_ascii), result.code);
473  int register_max = IrregexpMaxRegisterCount(*data);
474  if (result.num_registers > register_max) {
475    SetIrregexpMaxRegisterCount(*data, result.num_registers);
476  }
477
478  return true;
479}
480
481
482int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
483  return Smi::cast(
484      re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
485}
486
487
488void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
489  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
490}
491
492
493int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
494  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
495}
496
497
498int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
499  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
500}
501
502
503ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
504  return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
505}
506
507
508Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
509  return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
510}
511
512
513void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
514                                    Handle<String> pattern,
515                                    JSRegExp::Flags flags,
516                                    int capture_count) {
517  // Initialize compiled code entries to null.
518  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
519                                                     JSRegExp::IRREGEXP,
520                                                     pattern,
521                                                     flags,
522                                                     capture_count);
523}
524
525
526int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
527                                Handle<String> subject) {
528  if (!subject->IsFlat()) FlattenString(subject);
529
530  // Check the asciiness of the underlying storage.
531  bool is_ascii = subject->IsOneByteRepresentationUnderneath();
532  if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
533
534#ifdef V8_INTERPRETED_REGEXP
535  // Byte-code regexp needs space allocated for all its registers.
536  // The result captures are copied to the start of the registers array
537  // if the match succeeds.  This way those registers are not clobbered
538  // when we set the last match info from last successful match.
539  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
540         (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
541#else  // V8_INTERPRETED_REGEXP
542  // Native regexp only needs room to output captures. Registers are handled
543  // internally.
544  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
545#endif  // V8_INTERPRETED_REGEXP
546}
547
548
549int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
550                                Handle<String> subject,
551                                int index,
552                                int32_t* output,
553                                int output_size) {
554  Isolate* isolate = regexp->GetIsolate();
555
556  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
557
558  ASSERT(index >= 0);
559  ASSERT(index <= subject->length());
560  ASSERT(subject->IsFlat());
561
562  bool is_ascii = subject->IsOneByteRepresentationUnderneath();
563
564#ifndef V8_INTERPRETED_REGEXP
565  ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
566  do {
567    EnsureCompiledIrregexp(regexp, subject, is_ascii);
568    Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
569    // The stack is used to allocate registers for the compiled regexp code.
570    // This means that in case of failure, the output registers array is left
571    // untouched and contains the capture results from the previous successful
572    // match.  We can use that to set the last match info lazily.
573    NativeRegExpMacroAssembler::Result res =
574        NativeRegExpMacroAssembler::Match(code,
575                                          subject,
576                                          output,
577                                          output_size,
578                                          index,
579                                          isolate);
580    if (res != NativeRegExpMacroAssembler::RETRY) {
581      ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
582             isolate->has_pending_exception());
583      STATIC_ASSERT(
584          static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
585      STATIC_ASSERT(
586          static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
587      STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
588                    == RE_EXCEPTION);
589      return static_cast<IrregexpResult>(res);
590    }
591    // If result is RETRY, the string has changed representation, and we
592    // must restart from scratch.
593    // In this case, it means we must make sure we are prepared to handle
594    // the, potentially, different subject (the string can switch between
595    // being internal and external, and even between being ASCII and UC16,
596    // but the characters are always the same).
597    IrregexpPrepare(regexp, subject);
598    is_ascii = subject->IsOneByteRepresentationUnderneath();
599  } while (true);
600  UNREACHABLE();
601  return RE_EXCEPTION;
602#else  // V8_INTERPRETED_REGEXP
603
604  ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp));
605  // We must have done EnsureCompiledIrregexp, so we can get the number of
606  // registers.
607  int number_of_capture_registers =
608      (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
609  int32_t* raw_output = &output[number_of_capture_registers];
610  // We do not touch the actual capture result registers until we know there
611  // has been a match so that we can use those capture results to set the
612  // last match info.
613  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
614    raw_output[i] = -1;
615  }
616  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
617
618  IrregexpResult result = IrregexpInterpreter::Match(isolate,
619                                                     byte_codes,
620                                                     subject,
621                                                     raw_output,
622                                                     index);
623  if (result == RE_SUCCESS) {
624    // Copy capture results to the start of the registers array.
625    OS::MemCopy(
626        output, raw_output, number_of_capture_registers * sizeof(int32_t));
627  }
628  if (result == RE_EXCEPTION) {
629    ASSERT(!isolate->has_pending_exception());
630    isolate->StackOverflow();
631  }
632  return result;
633#endif  // V8_INTERPRETED_REGEXP
634}
635
636
637Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
638                                        Handle<String> subject,
639                                        int previous_index,
640                                        Handle<JSArray> last_match_info) {
641  Isolate* isolate = regexp->GetIsolate();
642  ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
643
644  // Prepare space for the return values.
645#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
646  if (FLAG_trace_regexp_bytecodes) {
647    String* pattern = regexp->Pattern();
648    PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
649    PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
650  }
651#endif
652  int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
653  if (required_registers < 0) {
654    // Compiling failed with an exception.
655    ASSERT(isolate->has_pending_exception());
656    return Handle<Object>::null();
657  }
658
659  int32_t* output_registers = NULL;
660  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
661    output_registers = NewArray<int32_t>(required_registers);
662  }
663  SmartArrayPointer<int32_t> auto_release(output_registers);
664  if (output_registers == NULL) {
665    output_registers = isolate->jsregexp_static_offsets_vector();
666  }
667
668  int res = RegExpImpl::IrregexpExecRaw(
669      regexp, subject, previous_index, output_registers, required_registers);
670  if (res == RE_SUCCESS) {
671    int capture_count =
672        IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
673    return SetLastMatchInfo(
674        last_match_info, subject, capture_count, output_registers);
675  }
676  if (res == RE_EXCEPTION) {
677    ASSERT(isolate->has_pending_exception());
678    return Handle<Object>::null();
679  }
680  ASSERT(res == RE_FAILURE);
681  return isolate->factory()->null_value();
682}
683
684
685Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
686                                             Handle<String> subject,
687                                             int capture_count,
688                                             int32_t* match) {
689  ASSERT(last_match_info->HasFastObjectElements());
690  int capture_register_count = (capture_count + 1) * 2;
691  last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
692  DisallowHeapAllocation no_allocation;
693  FixedArray* array = FixedArray::cast(last_match_info->elements());
694  if (match != NULL) {
695    for (int i = 0; i < capture_register_count; i += 2) {
696      SetCapture(array, i, match[i]);
697      SetCapture(array, i + 1, match[i + 1]);
698    }
699  }
700  SetLastCaptureCount(array, capture_register_count);
701  SetLastSubject(array, *subject);
702  SetLastInput(array, *subject);
703  return last_match_info;
704}
705
706
707RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
708                                     Handle<String> subject,
709                                     bool is_global,
710                                     Isolate* isolate)
711  : register_array_(NULL),
712    register_array_size_(0),
713    regexp_(regexp),
714    subject_(subject) {
715#ifdef V8_INTERPRETED_REGEXP
716  bool interpreted = true;
717#else
718  bool interpreted = false;
719#endif  // V8_INTERPRETED_REGEXP
720
721  if (regexp_->TypeTag() == JSRegExp::ATOM) {
722    static const int kAtomRegistersPerMatch = 2;
723    registers_per_match_ = kAtomRegistersPerMatch;
724    // There is no distinction between interpreted and native for atom regexps.
725    interpreted = false;
726  } else {
727    registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
728    if (registers_per_match_ < 0) {
729      num_matches_ = -1;  // Signal exception.
730      return;
731    }
732  }
733
734  if (is_global && !interpreted) {
735    register_array_size_ =
736        Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
737    max_matches_ = register_array_size_ / registers_per_match_;
738  } else {
739    // Global loop in interpreted regexp is not implemented.  We choose
740    // the size of the offsets vector so that it can only store one match.
741    register_array_size_ = registers_per_match_;
742    max_matches_ = 1;
743  }
744
745  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
746    register_array_ = NewArray<int32_t>(register_array_size_);
747  } else {
748    register_array_ = isolate->jsregexp_static_offsets_vector();
749  }
750
751  // Set state so that fetching the results the first time triggers a call
752  // to the compiled regexp.
753  current_match_index_ = max_matches_ - 1;
754  num_matches_ = max_matches_;
755  ASSERT(registers_per_match_ >= 2);  // Each match has at least one capture.
756  ASSERT_GE(register_array_size_, registers_per_match_);
757  int32_t* last_match =
758      &register_array_[current_match_index_ * registers_per_match_];
759  last_match[0] = -1;
760  last_match[1] = 0;
761}
762
763
764// -------------------------------------------------------------------
765// Implementation of the Irregexp regular expression engine.
766//
767// The Irregexp regular expression engine is intended to be a complete
768// implementation of ECMAScript regular expressions.  It generates either
769// bytecodes or native code.
770
771//   The Irregexp regexp engine is structured in three steps.
772//   1) The parser generates an abstract syntax tree.  See ast.cc.
773//   2) From the AST a node network is created.  The nodes are all
774//      subclasses of RegExpNode.  The nodes represent states when
775//      executing a regular expression.  Several optimizations are
776//      performed on the node network.
777//   3) From the nodes we generate either byte codes or native code
778//      that can actually execute the regular expression (perform
779//      the search).  The code generation step is described in more
780//      detail below.
781
782// Code generation.
783//
784//   The nodes are divided into four main categories.
785//   * Choice nodes
786//        These represent places where the regular expression can
787//        match in more than one way.  For example on entry to an
788//        alternation (foo|bar) or a repetition (*, +, ? or {}).
789//   * Action nodes
790//        These represent places where some action should be
791//        performed.  Examples include recording the current position
792//        in the input string to a register (in order to implement
793//        captures) or other actions on register for example in order
794//        to implement the counters needed for {} repetitions.
795//   * Matching nodes
796//        These attempt to match some element part of the input string.
797//        Examples of elements include character classes, plain strings
798//        or back references.
799//   * End nodes
800//        These are used to implement the actions required on finding
801//        a successful match or failing to find a match.
802//
803//   The code generated (whether as byte codes or native code) maintains
804//   some state as it runs.  This consists of the following elements:
805//
806//   * The capture registers.  Used for string captures.
807//   * Other registers.  Used for counters etc.
808//   * The current position.
809//   * The stack of backtracking information.  Used when a matching node
810//     fails to find a match and needs to try an alternative.
811//
812// Conceptual regular expression execution model:
813//
814//   There is a simple conceptual model of regular expression execution
815//   which will be presented first.  The actual code generated is a more
816//   efficient simulation of the simple conceptual model:
817//
818//   * Choice nodes are implemented as follows:
819//     For each choice except the last {
820//       push current position
821//       push backtrack code location
822//       <generate code to test for choice>
823//       backtrack code location:
824//       pop current position
825//     }
826//     <generate code to test for last choice>
827//
828//   * Actions nodes are generated as follows
829//     <push affected registers on backtrack stack>
830//     <generate code to perform action>
831//     push backtrack code location
832//     <generate code to test for following nodes>
833//     backtrack code location:
834//     <pop affected registers to restore their state>
835//     <pop backtrack location from stack and go to it>
836//
837//   * Matching nodes are generated as follows:
838//     if input string matches at current position
839//       update current position
840//       <generate code to test for following nodes>
841//     else
842//       <pop backtrack location from stack and go to it>
843//
844//   Thus it can be seen that the current position is saved and restored
845//   by the choice nodes, whereas the registers are saved and restored by
846//   by the action nodes that manipulate them.
847//
848//   The other interesting aspect of this model is that nodes are generated
849//   at the point where they are needed by a recursive call to Emit().  If
850//   the node has already been code generated then the Emit() call will
851//   generate a jump to the previously generated code instead.  In order to
852//   limit recursion it is possible for the Emit() function to put the node
853//   on a work list for later generation and instead generate a jump.  The
854//   destination of the jump is resolved later when the code is generated.
855//
856// Actual regular expression code generation.
857//
858//   Code generation is actually more complicated than the above.  In order
859//   to improve the efficiency of the generated code some optimizations are
860//   performed
861//
862//   * Choice nodes have 1-character lookahead.
863//     A choice node looks at the following character and eliminates some of
864//     the choices immediately based on that character.  This is not yet
865//     implemented.
866//   * Simple greedy loops store reduced backtracking information.
867//     A quantifier like /.*foo/m will greedily match the whole input.  It will
868//     then need to backtrack to a point where it can match "foo".  The naive
869//     implementation of this would push each character position onto the
870//     backtracking stack, then pop them off one by one.  This would use space
871//     proportional to the length of the input string.  However since the "."
872//     can only match in one way and always has a constant length (in this case
873//     of 1) it suffices to store the current position on the top of the stack
874//     once.  Matching now becomes merely incrementing the current position and
875//     backtracking becomes decrementing the current position and checking the
876//     result against the stored current position.  This is faster and saves
877//     space.
878//   * The current state is virtualized.
879//     This is used to defer expensive operations until it is clear that they
880//     are needed and to generate code for a node more than once, allowing
881//     specialized an efficient versions of the code to be created. This is
882//     explained in the section below.
883//
884// Execution state virtualization.
885//
886//   Instead of emitting code, nodes that manipulate the state can record their
887//   manipulation in an object called the Trace.  The Trace object can record a
888//   current position offset, an optional backtrack code location on the top of
889//   the virtualized backtrack stack and some register changes.  When a node is
890//   to be emitted it can flush the Trace or update it.  Flushing the Trace
891//   will emit code to bring the actual state into line with the virtual state.
892//   Avoiding flushing the state can postpone some work (e.g. updates of capture
893//   registers).  Postponing work can save time when executing the regular
894//   expression since it may be found that the work never has to be done as a
895//   failure to match can occur.  In addition it is much faster to jump to a
896//   known backtrack code location than it is to pop an unknown backtrack
897//   location from the stack and jump there.
898//
899//   The virtual state found in the Trace affects code generation.  For example
900//   the virtual state contains the difference between the actual current
901//   position and the virtual current position, and matching code needs to use
902//   this offset to attempt a match in the correct location of the input
903//   string.  Therefore code generated for a non-trivial trace is specialized
904//   to that trace.  The code generator therefore has the ability to generate
905//   code for each node several times.  In order to limit the size of the
906//   generated code there is an arbitrary limit on how many specialized sets of
907//   code may be generated for a given node.  If the limit is reached, the
908//   trace is flushed and a generic version of the code for a node is emitted.
909//   This is subsequently used for that node.  The code emitted for non-generic
910//   trace is not recorded in the node and so it cannot currently be reused in
911//   the event that code generation is requested for an identical trace.
912
913
914void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
915  UNREACHABLE();
916}
917
918
919void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
920  text->AddElement(TextElement::Atom(this), zone);
921}
922
923
924void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
925  text->AddElement(TextElement::CharClass(this), zone);
926}
927
928
929void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
930  for (int i = 0; i < elements()->length(); i++)
931    text->AddElement(elements()->at(i), zone);
932}
933
934
935TextElement TextElement::Atom(RegExpAtom* atom) {
936  return TextElement(ATOM, atom);
937}
938
939
940TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
941  return TextElement(CHAR_CLASS, char_class);
942}
943
944
945int TextElement::length() const {
946  switch (text_type()) {
947    case ATOM:
948      return atom()->length();
949
950    case CHAR_CLASS:
951      return 1;
952  }
953  UNREACHABLE();
954  return 0;
955}
956
957
958DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
959  if (table_ == NULL) {
960    table_ = new(zone()) DispatchTable(zone());
961    DispatchTableConstructor cons(table_, ignore_case, zone());
962    cons.BuildTable(this);
963  }
964  return table_;
965}
966
967
968class FrequencyCollator {
969 public:
970  FrequencyCollator() : total_samples_(0) {
971    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
972      frequencies_[i] = CharacterFrequency(i);
973    }
974  }
975
976  void CountCharacter(int character) {
977    int index = (character & RegExpMacroAssembler::kTableMask);
978    frequencies_[index].Increment();
979    total_samples_++;
980  }
981
982  // Does not measure in percent, but rather per-128 (the table size from the
983  // regexp macro assembler).
984  int Frequency(int in_character) {
985    ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
986    if (total_samples_ < 1) return 1;  // Division by zero.
987    int freq_in_per128 =
988        (frequencies_[in_character].counter() * 128) / total_samples_;
989    return freq_in_per128;
990  }
991
992 private:
993  class CharacterFrequency {
994   public:
995    CharacterFrequency() : counter_(0), character_(-1) { }
996    explicit CharacterFrequency(int character)
997        : counter_(0), character_(character) { }
998
999    void Increment() { counter_++; }
1000    int counter() { return counter_; }
1001    int character() { return character_; }
1002
1003   private:
1004    int counter_;
1005    int character_;
1006  };
1007
1008
1009 private:
1010  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
1011  int total_samples_;
1012};
1013
1014
1015class RegExpCompiler {
1016 public:
1017  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
1018                 Zone* zone);
1019
1020  int AllocateRegister() {
1021    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1022      reg_exp_too_big_ = true;
1023      return next_register_;
1024    }
1025    return next_register_++;
1026  }
1027
1028  RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1029                                           RegExpNode* start,
1030                                           int capture_count,
1031                                           Handle<String> pattern);
1032
1033  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1034
1035  static const int kImplementationOffset = 0;
1036  static const int kNumberOfRegistersOffset = 0;
1037  static const int kCodeOffset = 1;
1038
1039  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1040  EndNode* accept() { return accept_; }
1041
1042  static const int kMaxRecursion = 100;
1043  inline int recursion_depth() { return recursion_depth_; }
1044  inline void IncrementRecursionDepth() { recursion_depth_++; }
1045  inline void DecrementRecursionDepth() { recursion_depth_--; }
1046
1047  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1048
1049  inline bool ignore_case() { return ignore_case_; }
1050  inline bool ascii() { return ascii_; }
1051  FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1052
1053  int current_expansion_factor() { return current_expansion_factor_; }
1054  void set_current_expansion_factor(int value) {
1055    current_expansion_factor_ = value;
1056  }
1057
1058  Zone* zone() const { return zone_; }
1059
1060  static const int kNoRegister = -1;
1061
1062 private:
1063  EndNode* accept_;
1064  int next_register_;
1065  List<RegExpNode*>* work_list_;
1066  int recursion_depth_;
1067  RegExpMacroAssembler* macro_assembler_;
1068  bool ignore_case_;
1069  bool ascii_;
1070  bool reg_exp_too_big_;
1071  int current_expansion_factor_;
1072  FrequencyCollator frequency_collator_;
1073  Zone* zone_;
1074};
1075
1076
1077class RecursionCheck {
1078 public:
1079  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1080    compiler->IncrementRecursionDepth();
1081  }
1082  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1083 private:
1084  RegExpCompiler* compiler_;
1085};
1086
1087
1088static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1089  return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1090}
1091
1092
1093// Attempts to compile the regexp using an Irregexp code generator.  Returns
1094// a fixed array or a null handle depending on whether it succeeded.
1095RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
1096                               Zone* zone)
1097    : next_register_(2 * (capture_count + 1)),
1098      work_list_(NULL),
1099      recursion_depth_(0),
1100      ignore_case_(ignore_case),
1101      ascii_(ascii),
1102      reg_exp_too_big_(false),
1103      current_expansion_factor_(1),
1104      frequency_collator_(),
1105      zone_(zone) {
1106  accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1107  ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1108}
1109
1110
1111RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1112    RegExpMacroAssembler* macro_assembler,
1113    RegExpNode* start,
1114    int capture_count,
1115    Handle<String> pattern) {
1116  Heap* heap = pattern->GetHeap();
1117
1118  bool use_slow_safe_regexp_compiler = false;
1119  if (heap->total_regexp_code_generated() >
1120          RegExpImpl::kRegWxpCompiledLimit &&
1121      heap->isolate()->memory_allocator()->SizeExecutable() >
1122          RegExpImpl::kRegExpExecutableMemoryLimit) {
1123    use_slow_safe_regexp_compiler = true;
1124  }
1125
1126  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1127
1128#ifdef DEBUG
1129  if (FLAG_trace_regexp_assembler)
1130    macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1131  else
1132#endif
1133    macro_assembler_ = macro_assembler;
1134
1135  List <RegExpNode*> work_list(0);
1136  work_list_ = &work_list;
1137  Label fail;
1138  macro_assembler_->PushBacktrack(&fail);
1139  Trace new_trace;
1140  start->Emit(this, &new_trace);
1141  macro_assembler_->Bind(&fail);
1142  macro_assembler_->Fail();
1143  while (!work_list.is_empty()) {
1144    work_list.RemoveLast()->Emit(this, &new_trace);
1145  }
1146  if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
1147
1148  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1149  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1150  work_list_ = NULL;
1151#ifdef DEBUG
1152  if (FLAG_print_code) {
1153    CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1154    Handle<Code>::cast(code)->Disassemble(*pattern->ToCString(),
1155                                          trace_scope.file());
1156  }
1157  if (FLAG_trace_regexp_assembler) {
1158    delete macro_assembler_;
1159  }
1160#endif
1161  return RegExpEngine::CompilationResult(*code, next_register_);
1162}
1163
1164
1165bool Trace::DeferredAction::Mentions(int that) {
1166  if (action_type() == ActionNode::CLEAR_CAPTURES) {
1167    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1168    return range.Contains(that);
1169  } else {
1170    return reg() == that;
1171  }
1172}
1173
1174
1175bool Trace::mentions_reg(int reg) {
1176  for (DeferredAction* action = actions_;
1177       action != NULL;
1178       action = action->next()) {
1179    if (action->Mentions(reg))
1180      return true;
1181  }
1182  return false;
1183}
1184
1185
1186bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1187  ASSERT_EQ(0, *cp_offset);
1188  for (DeferredAction* action = actions_;
1189       action != NULL;
1190       action = action->next()) {
1191    if (action->Mentions(reg)) {
1192      if (action->action_type() == ActionNode::STORE_POSITION) {
1193        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1194        return true;
1195      } else {
1196        return false;
1197      }
1198    }
1199  }
1200  return false;
1201}
1202
1203
1204int Trace::FindAffectedRegisters(OutSet* affected_registers,
1205                                 Zone* zone) {
1206  int max_register = RegExpCompiler::kNoRegister;
1207  for (DeferredAction* action = actions_;
1208       action != NULL;
1209       action = action->next()) {
1210    if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1211      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1212      for (int i = range.from(); i <= range.to(); i++)
1213        affected_registers->Set(i, zone);
1214      if (range.to() > max_register) max_register = range.to();
1215    } else {
1216      affected_registers->Set(action->reg(), zone);
1217      if (action->reg() > max_register) max_register = action->reg();
1218    }
1219  }
1220  return max_register;
1221}
1222
1223
1224void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1225                                     int max_register,
1226                                     OutSet& registers_to_pop,
1227                                     OutSet& registers_to_clear) {
1228  for (int reg = max_register; reg >= 0; reg--) {
1229    if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
1230    else if (registers_to_clear.Get(reg)) {
1231      int clear_to = reg;
1232      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1233        reg--;
1234      }
1235      assembler->ClearRegisters(reg, clear_to);
1236    }
1237  }
1238}
1239
1240
1241void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1242                                   int max_register,
1243                                   OutSet& affected_registers,
1244                                   OutSet* registers_to_pop,
1245                                   OutSet* registers_to_clear,
1246                                   Zone* zone) {
1247  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1248  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1249
1250  // Count pushes performed to force a stack limit check occasionally.
1251  int pushes = 0;
1252
1253  for (int reg = 0; reg <= max_register; reg++) {
1254    if (!affected_registers.Get(reg)) {
1255      continue;
1256    }
1257
1258    // The chronologically first deferred action in the trace
1259    // is used to infer the action needed to restore a register
1260    // to its previous state (or not, if it's safe to ignore it).
1261    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1262    DeferredActionUndoType undo_action = IGNORE;
1263
1264    int value = 0;
1265    bool absolute = false;
1266    bool clear = false;
1267    int store_position = -1;
1268    // This is a little tricky because we are scanning the actions in reverse
1269    // historical order (newest first).
1270    for (DeferredAction* action = actions_;
1271         action != NULL;
1272         action = action->next()) {
1273      if (action->Mentions(reg)) {
1274        switch (action->action_type()) {
1275          case ActionNode::SET_REGISTER: {
1276            Trace::DeferredSetRegister* psr =
1277                static_cast<Trace::DeferredSetRegister*>(action);
1278            if (!absolute) {
1279              value += psr->value();
1280              absolute = true;
1281            }
1282            // SET_REGISTER is currently only used for newly introduced loop
1283            // counters. They can have a significant previous value if they
1284            // occour in a loop. TODO(lrn): Propagate this information, so
1285            // we can set undo_action to IGNORE if we know there is no value to
1286            // restore.
1287            undo_action = RESTORE;
1288            ASSERT_EQ(store_position, -1);
1289            ASSERT(!clear);
1290            break;
1291          }
1292          case ActionNode::INCREMENT_REGISTER:
1293            if (!absolute) {
1294              value++;
1295            }
1296            ASSERT_EQ(store_position, -1);
1297            ASSERT(!clear);
1298            undo_action = RESTORE;
1299            break;
1300          case ActionNode::STORE_POSITION: {
1301            Trace::DeferredCapture* pc =
1302                static_cast<Trace::DeferredCapture*>(action);
1303            if (!clear && store_position == -1) {
1304              store_position = pc->cp_offset();
1305            }
1306
1307            // For captures we know that stores and clears alternate.
1308            // Other register, are never cleared, and if the occur
1309            // inside a loop, they might be assigned more than once.
1310            if (reg <= 1) {
1311              // Registers zero and one, aka "capture zero", is
1312              // always set correctly if we succeed. There is no
1313              // need to undo a setting on backtrack, because we
1314              // will set it again or fail.
1315              undo_action = IGNORE;
1316            } else {
1317              undo_action = pc->is_capture() ? CLEAR : RESTORE;
1318            }
1319            ASSERT(!absolute);
1320            ASSERT_EQ(value, 0);
1321            break;
1322          }
1323          case ActionNode::CLEAR_CAPTURES: {
1324            // Since we're scanning in reverse order, if we've already
1325            // set the position we have to ignore historically earlier
1326            // clearing operations.
1327            if (store_position == -1) {
1328              clear = true;
1329            }
1330            undo_action = RESTORE;
1331            ASSERT(!absolute);
1332            ASSERT_EQ(value, 0);
1333            break;
1334          }
1335          default:
1336            UNREACHABLE();
1337            break;
1338        }
1339      }
1340    }
1341    // Prepare for the undo-action (e.g., push if it's going to be popped).
1342    if (undo_action == RESTORE) {
1343      pushes++;
1344      RegExpMacroAssembler::StackCheckFlag stack_check =
1345          RegExpMacroAssembler::kNoStackLimitCheck;
1346      if (pushes == push_limit) {
1347        stack_check = RegExpMacroAssembler::kCheckStackLimit;
1348        pushes = 0;
1349      }
1350
1351      assembler->PushRegister(reg, stack_check);
1352      registers_to_pop->Set(reg, zone);
1353    } else if (undo_action == CLEAR) {
1354      registers_to_clear->Set(reg, zone);
1355    }
1356    // Perform the chronologically last action (or accumulated increment)
1357    // for the register.
1358    if (store_position != -1) {
1359      assembler->WriteCurrentPositionToRegister(reg, store_position);
1360    } else if (clear) {
1361      assembler->ClearRegisters(reg, reg);
1362    } else if (absolute) {
1363      assembler->SetRegister(reg, value);
1364    } else if (value != 0) {
1365      assembler->AdvanceRegister(reg, value);
1366    }
1367  }
1368}
1369
1370
1371// This is called as we come into a loop choice node and some other tricky
1372// nodes.  It normalizes the state of the code generator to ensure we can
1373// generate generic code.
1374void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1375  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1376
1377  ASSERT(!is_trivial());
1378
1379  if (actions_ == NULL && backtrack() == NULL) {
1380    // Here we just have some deferred cp advances to fix and we are back to
1381    // a normal situation.  We may also have to forget some information gained
1382    // through a quick check that was already performed.
1383    if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1384    // Create a new trivial state and generate the node with that.
1385    Trace new_state;
1386    successor->Emit(compiler, &new_state);
1387    return;
1388  }
1389
1390  // Generate deferred actions here along with code to undo them again.
1391  OutSet affected_registers;
1392
1393  if (backtrack() != NULL) {
1394    // Here we have a concrete backtrack location.  These are set up by choice
1395    // nodes and so they indicate that we have a deferred save of the current
1396    // position which we may need to emit here.
1397    assembler->PushCurrentPosition();
1398  }
1399
1400  int max_register = FindAffectedRegisters(&affected_registers,
1401                                           compiler->zone());
1402  OutSet registers_to_pop;
1403  OutSet registers_to_clear;
1404  PerformDeferredActions(assembler,
1405                         max_register,
1406                         affected_registers,
1407                         &registers_to_pop,
1408                         &registers_to_clear,
1409                         compiler->zone());
1410  if (cp_offset_ != 0) {
1411    assembler->AdvanceCurrentPosition(cp_offset_);
1412  }
1413
1414  // Create a new trivial state and generate the node with that.
1415  Label undo;
1416  assembler->PushBacktrack(&undo);
1417  Trace new_state;
1418  successor->Emit(compiler, &new_state);
1419
1420  // On backtrack we need to restore state.
1421  assembler->Bind(&undo);
1422  RestoreAffectedRegisters(assembler,
1423                           max_register,
1424                           registers_to_pop,
1425                           registers_to_clear);
1426  if (backtrack() == NULL) {
1427    assembler->Backtrack();
1428  } else {
1429    assembler->PopCurrentPosition();
1430    assembler->GoTo(backtrack());
1431  }
1432}
1433
1434
1435void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1436  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1437
1438  // Omit flushing the trace. We discard the entire stack frame anyway.
1439
1440  if (!label()->is_bound()) {
1441    // We are completely independent of the trace, since we ignore it,
1442    // so this code can be used as the generic version.
1443    assembler->Bind(label());
1444  }
1445
1446  // Throw away everything on the backtrack stack since the start
1447  // of the negative submatch and restore the character position.
1448  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1449  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1450  if (clear_capture_count_ > 0) {
1451    // Clear any captures that might have been performed during the success
1452    // of the body of the negative look-ahead.
1453    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1454    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1455  }
1456  // Now that we have unwound the stack we find at the top of the stack the
1457  // backtrack that the BeginSubmatch node got.
1458  assembler->Backtrack();
1459}
1460
1461
1462void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1463  if (!trace->is_trivial()) {
1464    trace->Flush(compiler, this);
1465    return;
1466  }
1467  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1468  if (!label()->is_bound()) {
1469    assembler->Bind(label());
1470  }
1471  switch (action_) {
1472    case ACCEPT:
1473      assembler->Succeed();
1474      return;
1475    case BACKTRACK:
1476      assembler->GoTo(trace->backtrack());
1477      return;
1478    case NEGATIVE_SUBMATCH_SUCCESS:
1479      // This case is handled in a different virtual method.
1480      UNREACHABLE();
1481  }
1482  UNIMPLEMENTED();
1483}
1484
1485
1486void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1487  if (guards_ == NULL)
1488    guards_ = new(zone) ZoneList<Guard*>(1, zone);
1489  guards_->Add(guard, zone);
1490}
1491
1492
1493ActionNode* ActionNode::SetRegister(int reg,
1494                                    int val,
1495                                    RegExpNode* on_success) {
1496  ActionNode* result =
1497      new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1498  result->data_.u_store_register.reg = reg;
1499  result->data_.u_store_register.value = val;
1500  return result;
1501}
1502
1503
1504ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1505  ActionNode* result =
1506      new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1507  result->data_.u_increment_register.reg = reg;
1508  return result;
1509}
1510
1511
1512ActionNode* ActionNode::StorePosition(int reg,
1513                                      bool is_capture,
1514                                      RegExpNode* on_success) {
1515  ActionNode* result =
1516      new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1517  result->data_.u_position_register.reg = reg;
1518  result->data_.u_position_register.is_capture = is_capture;
1519  return result;
1520}
1521
1522
1523ActionNode* ActionNode::ClearCaptures(Interval range,
1524                                      RegExpNode* on_success) {
1525  ActionNode* result =
1526      new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1527  result->data_.u_clear_captures.range_from = range.from();
1528  result->data_.u_clear_captures.range_to = range.to();
1529  return result;
1530}
1531
1532
1533ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1534                                      int position_reg,
1535                                      RegExpNode* on_success) {
1536  ActionNode* result =
1537      new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1538  result->data_.u_submatch.stack_pointer_register = stack_reg;
1539  result->data_.u_submatch.current_position_register = position_reg;
1540  return result;
1541}
1542
1543
1544ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1545                                                int position_reg,
1546                                                int clear_register_count,
1547                                                int clear_register_from,
1548                                                RegExpNode* on_success) {
1549  ActionNode* result =
1550      new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1551  result->data_.u_submatch.stack_pointer_register = stack_reg;
1552  result->data_.u_submatch.current_position_register = position_reg;
1553  result->data_.u_submatch.clear_register_count = clear_register_count;
1554  result->data_.u_submatch.clear_register_from = clear_register_from;
1555  return result;
1556}
1557
1558
1559ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1560                                        int repetition_register,
1561                                        int repetition_limit,
1562                                        RegExpNode* on_success) {
1563  ActionNode* result =
1564      new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1565  result->data_.u_empty_match_check.start_register = start_register;
1566  result->data_.u_empty_match_check.repetition_register = repetition_register;
1567  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1568  return result;
1569}
1570
1571
1572#define DEFINE_ACCEPT(Type)                                          \
1573  void Type##Node::Accept(NodeVisitor* visitor) {                    \
1574    visitor->Visit##Type(this);                                      \
1575  }
1576FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1577#undef DEFINE_ACCEPT
1578
1579
1580void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1581  visitor->VisitLoopChoice(this);
1582}
1583
1584
1585// -------------------------------------------------------------------
1586// Emit code.
1587
1588
1589void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1590                               Guard* guard,
1591                               Trace* trace) {
1592  switch (guard->op()) {
1593    case Guard::LT:
1594      ASSERT(!trace->mentions_reg(guard->reg()));
1595      macro_assembler->IfRegisterGE(guard->reg(),
1596                                    guard->value(),
1597                                    trace->backtrack());
1598      break;
1599    case Guard::GEQ:
1600      ASSERT(!trace->mentions_reg(guard->reg()));
1601      macro_assembler->IfRegisterLT(guard->reg(),
1602                                    guard->value(),
1603                                    trace->backtrack());
1604      break;
1605  }
1606}
1607
1608
1609// Returns the number of characters in the equivalence class, omitting those
1610// that cannot occur in the source string because it is ASCII.
1611static int GetCaseIndependentLetters(Isolate* isolate,
1612                                     uc16 character,
1613                                     bool ascii_subject,
1614                                     unibrow::uchar* letters) {
1615  int length =
1616      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1617  // Unibrow returns 0 or 1 for characters where case independence is
1618  // trivial.
1619  if (length == 0) {
1620    letters[0] = character;
1621    length = 1;
1622  }
1623  if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
1624    return length;
1625  }
1626  // The standard requires that non-ASCII characters cannot have ASCII
1627  // character codes in their equivalence class.
1628  return 0;
1629}
1630
1631
1632static inline bool EmitSimpleCharacter(Isolate* isolate,
1633                                       RegExpCompiler* compiler,
1634                                       uc16 c,
1635                                       Label* on_failure,
1636                                       int cp_offset,
1637                                       bool check,
1638                                       bool preloaded) {
1639  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1640  bool bound_checked = false;
1641  if (!preloaded) {
1642    assembler->LoadCurrentCharacter(
1643        cp_offset,
1644        on_failure,
1645        check);
1646    bound_checked = true;
1647  }
1648  assembler->CheckNotCharacter(c, on_failure);
1649  return bound_checked;
1650}
1651
1652
1653// Only emits non-letters (things that don't have case).  Only used for case
1654// independent matches.
1655static inline bool EmitAtomNonLetter(Isolate* isolate,
1656                                     RegExpCompiler* compiler,
1657                                     uc16 c,
1658                                     Label* on_failure,
1659                                     int cp_offset,
1660                                     bool check,
1661                                     bool preloaded) {
1662  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1663  bool ascii = compiler->ascii();
1664  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1665  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1666  if (length < 1) {
1667    // This can't match.  Must be an ASCII subject and a non-ASCII character.
1668    // We do not need to do anything since the ASCII pass already handled this.
1669    return false;  // Bounds not checked.
1670  }
1671  bool checked = false;
1672  // We handle the length > 1 case in a later pass.
1673  if (length == 1) {
1674    if (ascii && c > String::kMaxOneByteCharCodeU) {
1675      // Can't match - see above.
1676      return false;  // Bounds not checked.
1677    }
1678    if (!preloaded) {
1679      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1680      checked = check;
1681    }
1682    macro_assembler->CheckNotCharacter(c, on_failure);
1683  }
1684  return checked;
1685}
1686
1687
1688static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1689                                      bool ascii,
1690                                      uc16 c1,
1691                                      uc16 c2,
1692                                      Label* on_failure) {
1693  uc16 char_mask;
1694  if (ascii) {
1695    char_mask = String::kMaxOneByteCharCode;
1696  } else {
1697    char_mask = String::kMaxUtf16CodeUnit;
1698  }
1699  uc16 exor = c1 ^ c2;
1700  // Check whether exor has only one bit set.
1701  if (((exor - 1) & exor) == 0) {
1702    // If c1 and c2 differ only by one bit.
1703    // Ecma262UnCanonicalize always gives the highest number last.
1704    ASSERT(c2 > c1);
1705    uc16 mask = char_mask ^ exor;
1706    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1707    return true;
1708  }
1709  ASSERT(c2 > c1);
1710  uc16 diff = c2 - c1;
1711  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1712    // If the characters differ by 2^n but don't differ by one bit then
1713    // subtract the difference from the found character, then do the or
1714    // trick.  We avoid the theoretical case where negative numbers are
1715    // involved in order to simplify code generation.
1716    uc16 mask = char_mask ^ diff;
1717    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1718                                                    diff,
1719                                                    mask,
1720                                                    on_failure);
1721    return true;
1722  }
1723  return false;
1724}
1725
1726
1727typedef bool EmitCharacterFunction(Isolate* isolate,
1728                                   RegExpCompiler* compiler,
1729                                   uc16 c,
1730                                   Label* on_failure,
1731                                   int cp_offset,
1732                                   bool check,
1733                                   bool preloaded);
1734
1735// Only emits letters (things that have case).  Only used for case independent
1736// matches.
1737static inline bool EmitAtomLetter(Isolate* isolate,
1738                                  RegExpCompiler* compiler,
1739                                  uc16 c,
1740                                  Label* on_failure,
1741                                  int cp_offset,
1742                                  bool check,
1743                                  bool preloaded) {
1744  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1745  bool ascii = compiler->ascii();
1746  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1747  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1748  if (length <= 1) return false;
1749  // We may not need to check against the end of the input string
1750  // if this character lies before a character that matched.
1751  if (!preloaded) {
1752    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1753  }
1754  Label ok;
1755  ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1756  switch (length) {
1757    case 2: {
1758      if (ShortCutEmitCharacterPair(macro_assembler,
1759                                    ascii,
1760                                    chars[0],
1761                                    chars[1],
1762                                    on_failure)) {
1763      } else {
1764        macro_assembler->CheckCharacter(chars[0], &ok);
1765        macro_assembler->CheckNotCharacter(chars[1], on_failure);
1766        macro_assembler->Bind(&ok);
1767      }
1768      break;
1769    }
1770    case 4:
1771      macro_assembler->CheckCharacter(chars[3], &ok);
1772      // Fall through!
1773    case 3:
1774      macro_assembler->CheckCharacter(chars[0], &ok);
1775      macro_assembler->CheckCharacter(chars[1], &ok);
1776      macro_assembler->CheckNotCharacter(chars[2], on_failure);
1777      macro_assembler->Bind(&ok);
1778      break;
1779    default:
1780      UNREACHABLE();
1781      break;
1782  }
1783  return true;
1784}
1785
1786
1787static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1788                             int border,
1789                             Label* fall_through,
1790                             Label* above_or_equal,
1791                             Label* below) {
1792  if (below != fall_through) {
1793    masm->CheckCharacterLT(border, below);
1794    if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1795  } else {
1796    masm->CheckCharacterGT(border - 1, above_or_equal);
1797  }
1798}
1799
1800
1801static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1802                                   int first,
1803                                   int last,
1804                                   Label* fall_through,
1805                                   Label* in_range,
1806                                   Label* out_of_range) {
1807  if (in_range == fall_through) {
1808    if (first == last) {
1809      masm->CheckNotCharacter(first, out_of_range);
1810    } else {
1811      masm->CheckCharacterNotInRange(first, last, out_of_range);
1812    }
1813  } else {
1814    if (first == last) {
1815      masm->CheckCharacter(first, in_range);
1816    } else {
1817      masm->CheckCharacterInRange(first, last, in_range);
1818    }
1819    if (out_of_range != fall_through) masm->GoTo(out_of_range);
1820  }
1821}
1822
1823
1824// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1825// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1826static void EmitUseLookupTable(
1827    RegExpMacroAssembler* masm,
1828    ZoneList<int>* ranges,
1829    int start_index,
1830    int end_index,
1831    int min_char,
1832    Label* fall_through,
1833    Label* even_label,
1834    Label* odd_label) {
1835  static const int kSize = RegExpMacroAssembler::kTableSize;
1836  static const int kMask = RegExpMacroAssembler::kTableMask;
1837
1838  int base = (min_char & ~kMask);
1839  USE(base);
1840
1841  // Assert that everything is on one kTableSize page.
1842  for (int i = start_index; i <= end_index; i++) {
1843    ASSERT_EQ(ranges->at(i) & ~kMask, base);
1844  }
1845  ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1846
1847  char templ[kSize];
1848  Label* on_bit_set;
1849  Label* on_bit_clear;
1850  int bit;
1851  if (even_label == fall_through) {
1852    on_bit_set = odd_label;
1853    on_bit_clear = even_label;
1854    bit = 1;
1855  } else {
1856    on_bit_set = even_label;
1857    on_bit_clear = odd_label;
1858    bit = 0;
1859  }
1860  for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1861    templ[i] = bit;
1862  }
1863  int j = 0;
1864  bit ^= 1;
1865  for (int i = start_index; i < end_index; i++) {
1866    for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1867      templ[j] = bit;
1868    }
1869    bit ^= 1;
1870  }
1871  for (int i = j; i < kSize; i++) {
1872    templ[i] = bit;
1873  }
1874  Factory* factory = masm->zone()->isolate()->factory();
1875  // TODO(erikcorry): Cache these.
1876  Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1877  for (int i = 0; i < kSize; i++) {
1878    ba->set(i, templ[i]);
1879  }
1880  masm->CheckBitInTable(ba, on_bit_set);
1881  if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1882}
1883
1884
1885static void CutOutRange(RegExpMacroAssembler* masm,
1886                        ZoneList<int>* ranges,
1887                        int start_index,
1888                        int end_index,
1889                        int cut_index,
1890                        Label* even_label,
1891                        Label* odd_label) {
1892  bool odd = (((cut_index - start_index) & 1) == 1);
1893  Label* in_range_label = odd ? odd_label : even_label;
1894  Label dummy;
1895  EmitDoubleBoundaryTest(masm,
1896                         ranges->at(cut_index),
1897                         ranges->at(cut_index + 1) - 1,
1898                         &dummy,
1899                         in_range_label,
1900                         &dummy);
1901  ASSERT(!dummy.is_linked());
1902  // Cut out the single range by rewriting the array.  This creates a new
1903  // range that is a merger of the two ranges on either side of the one we
1904  // are cutting out.  The oddity of the labels is preserved.
1905  for (int j = cut_index; j > start_index; j--) {
1906    ranges->at(j) = ranges->at(j - 1);
1907  }
1908  for (int j = cut_index + 1; j < end_index; j++) {
1909    ranges->at(j) = ranges->at(j + 1);
1910  }
1911}
1912
1913
1914// Unicode case.  Split the search space into kSize spaces that are handled
1915// with recursion.
1916static void SplitSearchSpace(ZoneList<int>* ranges,
1917                             int start_index,
1918                             int end_index,
1919                             int* new_start_index,
1920                             int* new_end_index,
1921                             int* border) {
1922  static const int kSize = RegExpMacroAssembler::kTableSize;
1923  static const int kMask = RegExpMacroAssembler::kTableMask;
1924
1925  int first = ranges->at(start_index);
1926  int last = ranges->at(end_index) - 1;
1927
1928  *new_start_index = start_index;
1929  *border = (ranges->at(start_index) & ~kMask) + kSize;
1930  while (*new_start_index < end_index) {
1931    if (ranges->at(*new_start_index) > *border) break;
1932    (*new_start_index)++;
1933  }
1934  // new_start_index is the index of the first edge that is beyond the
1935  // current kSize space.
1936
1937  // For very large search spaces we do a binary chop search of the non-ASCII
1938  // space instead of just going to the end of the current kSize space.  The
1939  // heuristics are complicated a little by the fact that any 128-character
1940  // encoding space can be quickly tested with a table lookup, so we don't
1941  // wish to do binary chop search at a smaller granularity than that.  A
1942  // 128-character space can take up a lot of space in the ranges array if,
1943  // for example, we only want to match every second character (eg. the lower
1944  // case characters on some Unicode pages).
1945  int binary_chop_index = (end_index + start_index) / 2;
1946  // The first test ensures that we get to the code that handles the ASCII
1947  // range with a single not-taken branch, speeding up this important
1948  // character range (even non-ASCII charset-based text has spaces and
1949  // punctuation).
1950  if (*border - 1 > String::kMaxOneByteCharCode &&  // ASCII case.
1951      end_index - start_index > (*new_start_index - start_index) * 2 &&
1952      last - first > kSize * 2 &&
1953      binary_chop_index > *new_start_index &&
1954      ranges->at(binary_chop_index) >= first + 2 * kSize) {
1955    int scan_forward_for_section_border = binary_chop_index;;
1956    int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1957
1958    while (scan_forward_for_section_border < end_index) {
1959      if (ranges->at(scan_forward_for_section_border) > new_border) {
1960        *new_start_index = scan_forward_for_section_border;
1961        *border = new_border;
1962        break;
1963      }
1964      scan_forward_for_section_border++;
1965    }
1966  }
1967
1968  ASSERT(*new_start_index > start_index);
1969  *new_end_index = *new_start_index - 1;
1970  if (ranges->at(*new_end_index) == *border) {
1971    (*new_end_index)--;
1972  }
1973  if (*border >= ranges->at(end_index)) {
1974    *border = ranges->at(end_index);
1975    *new_start_index = end_index;  // Won't be used.
1976    *new_end_index = end_index - 1;
1977  }
1978}
1979
1980
1981// Gets a series of segment boundaries representing a character class.  If the
1982// character is in the range between an even and an odd boundary (counting from
1983// start_index) then go to even_label, otherwise go to odd_label.  We already
1984// know that the character is in the range of min_char to max_char inclusive.
1985// Either label can be NULL indicating backtracking.  Either label can also be
1986// equal to the fall_through label.
1987static void GenerateBranches(RegExpMacroAssembler* masm,
1988                             ZoneList<int>* ranges,
1989                             int start_index,
1990                             int end_index,
1991                             uc16 min_char,
1992                             uc16 max_char,
1993                             Label* fall_through,
1994                             Label* even_label,
1995                             Label* odd_label) {
1996  int first = ranges->at(start_index);
1997  int last = ranges->at(end_index) - 1;
1998
1999  ASSERT_LT(min_char, first);
2000
2001  // Just need to test if the character is before or on-or-after
2002  // a particular character.
2003  if (start_index == end_index) {
2004    EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2005    return;
2006  }
2007
2008  // Another almost trivial case:  There is one interval in the middle that is
2009  // different from the end intervals.
2010  if (start_index + 1 == end_index) {
2011    EmitDoubleBoundaryTest(
2012        masm, first, last, fall_through, even_label, odd_label);
2013    return;
2014  }
2015
2016  // It's not worth using table lookup if there are very few intervals in the
2017  // character class.
2018  if (end_index - start_index <= 6) {
2019    // It is faster to test for individual characters, so we look for those
2020    // first, then try arbitrary ranges in the second round.
2021    static int kNoCutIndex = -1;
2022    int cut = kNoCutIndex;
2023    for (int i = start_index; i < end_index; i++) {
2024      if (ranges->at(i) == ranges->at(i + 1) - 1) {
2025        cut = i;
2026        break;
2027      }
2028    }
2029    if (cut == kNoCutIndex) cut = start_index;
2030    CutOutRange(
2031        masm, ranges, start_index, end_index, cut, even_label, odd_label);
2032    ASSERT_GE(end_index - start_index, 2);
2033    GenerateBranches(masm,
2034                     ranges,
2035                     start_index + 1,
2036                     end_index - 1,
2037                     min_char,
2038                     max_char,
2039                     fall_through,
2040                     even_label,
2041                     odd_label);
2042    return;
2043  }
2044
2045  // If there are a lot of intervals in the regexp, then we will use tables to
2046  // determine whether the character is inside or outside the character class.
2047  static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2048
2049  if ((max_char >> kBits) == (min_char >> kBits)) {
2050    EmitUseLookupTable(masm,
2051                       ranges,
2052                       start_index,
2053                       end_index,
2054                       min_char,
2055                       fall_through,
2056                       even_label,
2057                       odd_label);
2058    return;
2059  }
2060
2061  if ((min_char >> kBits) != (first >> kBits)) {
2062    masm->CheckCharacterLT(first, odd_label);
2063    GenerateBranches(masm,
2064                     ranges,
2065                     start_index + 1,
2066                     end_index,
2067                     first,
2068                     max_char,
2069                     fall_through,
2070                     odd_label,
2071                     even_label);
2072    return;
2073  }
2074
2075  int new_start_index = 0;
2076  int new_end_index = 0;
2077  int border = 0;
2078
2079  SplitSearchSpace(ranges,
2080                   start_index,
2081                   end_index,
2082                   &new_start_index,
2083                   &new_end_index,
2084                   &border);
2085
2086  Label handle_rest;
2087  Label* above = &handle_rest;
2088  if (border == last + 1) {
2089    // We didn't find any section that started after the limit, so everything
2090    // above the border is one of the terminal labels.
2091    above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2092    ASSERT(new_end_index == end_index - 1);
2093  }
2094
2095  ASSERT_LE(start_index, new_end_index);
2096  ASSERT_LE(new_start_index, end_index);
2097  ASSERT_LT(start_index, new_start_index);
2098  ASSERT_LT(new_end_index, end_index);
2099  ASSERT(new_end_index + 1 == new_start_index ||
2100         (new_end_index + 2 == new_start_index &&
2101          border == ranges->at(new_end_index + 1)));
2102  ASSERT_LT(min_char, border - 1);
2103  ASSERT_LT(border, max_char);
2104  ASSERT_LT(ranges->at(new_end_index), border);
2105  ASSERT(border < ranges->at(new_start_index) ||
2106         (border == ranges->at(new_start_index) &&
2107          new_start_index == end_index &&
2108          new_end_index == end_index - 1 &&
2109          border == last + 1));
2110  ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2111
2112  masm->CheckCharacterGT(border - 1, above);
2113  Label dummy;
2114  GenerateBranches(masm,
2115                   ranges,
2116                   start_index,
2117                   new_end_index,
2118                   min_char,
2119                   border - 1,
2120                   &dummy,
2121                   even_label,
2122                   odd_label);
2123  if (handle_rest.is_linked()) {
2124    masm->Bind(&handle_rest);
2125    bool flip = (new_start_index & 1) != (start_index & 1);
2126    GenerateBranches(masm,
2127                     ranges,
2128                     new_start_index,
2129                     end_index,
2130                     border,
2131                     max_char,
2132                     &dummy,
2133                     flip ? odd_label : even_label,
2134                     flip ? even_label : odd_label);
2135  }
2136}
2137
2138
2139static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2140                          RegExpCharacterClass* cc,
2141                          bool ascii,
2142                          Label* on_failure,
2143                          int cp_offset,
2144                          bool check_offset,
2145                          bool preloaded,
2146                          Zone* zone) {
2147  ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2148  if (!CharacterRange::IsCanonical(ranges)) {
2149    CharacterRange::Canonicalize(ranges);
2150  }
2151
2152  int max_char;
2153  if (ascii) {
2154    max_char = String::kMaxOneByteCharCode;
2155  } else {
2156    max_char = String::kMaxUtf16CodeUnit;
2157  }
2158
2159  int range_count = ranges->length();
2160
2161  int last_valid_range = range_count - 1;
2162  while (last_valid_range >= 0) {
2163    CharacterRange& range = ranges->at(last_valid_range);
2164    if (range.from() <= max_char) {
2165      break;
2166    }
2167    last_valid_range--;
2168  }
2169
2170  if (last_valid_range < 0) {
2171    if (!cc->is_negated()) {
2172      macro_assembler->GoTo(on_failure);
2173    }
2174    if (check_offset) {
2175      macro_assembler->CheckPosition(cp_offset, on_failure);
2176    }
2177    return;
2178  }
2179
2180  if (last_valid_range == 0 &&
2181      ranges->at(0).IsEverything(max_char)) {
2182    if (cc->is_negated()) {
2183      macro_assembler->GoTo(on_failure);
2184    } else {
2185      // This is a common case hit by non-anchored expressions.
2186      if (check_offset) {
2187        macro_assembler->CheckPosition(cp_offset, on_failure);
2188      }
2189    }
2190    return;
2191  }
2192  if (last_valid_range == 0 &&
2193      !cc->is_negated() &&
2194      ranges->at(0).IsEverything(max_char)) {
2195    // This is a common case hit by non-anchored expressions.
2196    if (check_offset) {
2197      macro_assembler->CheckPosition(cp_offset, on_failure);
2198    }
2199    return;
2200  }
2201
2202  if (!preloaded) {
2203    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2204  }
2205
2206  if (cc->is_standard(zone) &&
2207        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2208                                                    on_failure)) {
2209      return;
2210  }
2211
2212
2213  // A new list with ascending entries.  Each entry is a code unit
2214  // where there is a boundary between code units that are part of
2215  // the class and code units that are not.  Normally we insert an
2216  // entry at zero which goes to the failure label, but if there
2217  // was already one there we fall through for success on that entry.
2218  // Subsequent entries have alternating meaning (success/failure).
2219  ZoneList<int>* range_boundaries =
2220      new(zone) ZoneList<int>(last_valid_range, zone);
2221
2222  bool zeroth_entry_is_failure = !cc->is_negated();
2223
2224  for (int i = 0; i <= last_valid_range; i++) {
2225    CharacterRange& range = ranges->at(i);
2226    if (range.from() == 0) {
2227      ASSERT_EQ(i, 0);
2228      zeroth_entry_is_failure = !zeroth_entry_is_failure;
2229    } else {
2230      range_boundaries->Add(range.from(), zone);
2231    }
2232    range_boundaries->Add(range.to() + 1, zone);
2233  }
2234  int end_index = range_boundaries->length() - 1;
2235  if (range_boundaries->at(end_index) > max_char) {
2236    end_index--;
2237  }
2238
2239  Label fall_through;
2240  GenerateBranches(macro_assembler,
2241                   range_boundaries,
2242                   0,  // start_index.
2243                   end_index,
2244                   0,  // min_char.
2245                   max_char,
2246                   &fall_through,
2247                   zeroth_entry_is_failure ? &fall_through : on_failure,
2248                   zeroth_entry_is_failure ? on_failure : &fall_through);
2249  macro_assembler->Bind(&fall_through);
2250}
2251
2252
2253RegExpNode::~RegExpNode() {
2254}
2255
2256
2257RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2258                                                  Trace* trace) {
2259  // If we are generating a greedy loop then don't stop and don't reuse code.
2260  if (trace->stop_node() != NULL) {
2261    return CONTINUE;
2262  }
2263
2264  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2265  if (trace->is_trivial()) {
2266    if (label_.is_bound()) {
2267      // We are being asked to generate a generic version, but that's already
2268      // been done so just go to it.
2269      macro_assembler->GoTo(&label_);
2270      return DONE;
2271    }
2272    if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2273      // To avoid too deep recursion we push the node to the work queue and just
2274      // generate a goto here.
2275      compiler->AddWork(this);
2276      macro_assembler->GoTo(&label_);
2277      return DONE;
2278    }
2279    // Generate generic version of the node and bind the label for later use.
2280    macro_assembler->Bind(&label_);
2281    return CONTINUE;
2282  }
2283
2284  // We are being asked to make a non-generic version.  Keep track of how many
2285  // non-generic versions we generate so as not to overdo it.
2286  trace_count_++;
2287  if (FLAG_regexp_optimization &&
2288      trace_count_ < kMaxCopiesCodeGenerated &&
2289      compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2290    return CONTINUE;
2291  }
2292
2293  // If we get here code has been generated for this node too many times or
2294  // recursion is too deep.  Time to switch to a generic version.  The code for
2295  // generic versions above can handle deep recursion properly.
2296  trace->Flush(compiler, this);
2297  return DONE;
2298}
2299
2300
2301int ActionNode::EatsAtLeast(int still_to_find,
2302                            int budget,
2303                            bool not_at_start) {
2304  if (budget <= 0) return 0;
2305  if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2306  return on_success()->EatsAtLeast(still_to_find,
2307                                   budget - 1,
2308                                   not_at_start);
2309}
2310
2311
2312void ActionNode::FillInBMInfo(int offset,
2313                              int budget,
2314                              BoyerMooreLookahead* bm,
2315                              bool not_at_start) {
2316  if (action_type_ == BEGIN_SUBMATCH) {
2317    bm->SetRest(offset);
2318  } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2319    on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2320  }
2321  SaveBMInfo(bm, not_at_start, offset);
2322}
2323
2324
2325int AssertionNode::EatsAtLeast(int still_to_find,
2326                               int budget,
2327                               bool not_at_start) {
2328  if (budget <= 0) return 0;
2329  // If we know we are not at the start and we are asked "how many characters
2330  // will you match if you succeed?" then we can answer anything since false
2331  // implies false.  So lets just return the max answer (still_to_find) since
2332  // that won't prevent us from preloading a lot of characters for the other
2333  // branches in the node graph.
2334  if (assertion_type() == AT_START && not_at_start) return still_to_find;
2335  return on_success()->EatsAtLeast(still_to_find,
2336                                   budget - 1,
2337                                   not_at_start);
2338}
2339
2340
2341void AssertionNode::FillInBMInfo(int offset,
2342                                 int budget,
2343                                 BoyerMooreLookahead* bm,
2344                                 bool not_at_start) {
2345  // Match the behaviour of EatsAtLeast on this node.
2346  if (assertion_type() == AT_START && not_at_start) return;
2347  on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2348  SaveBMInfo(bm, not_at_start, offset);
2349}
2350
2351
2352int BackReferenceNode::EatsAtLeast(int still_to_find,
2353                                   int budget,
2354                                   bool not_at_start) {
2355  if (budget <= 0) return 0;
2356  return on_success()->EatsAtLeast(still_to_find,
2357                                   budget - 1,
2358                                   not_at_start);
2359}
2360
2361
2362int TextNode::EatsAtLeast(int still_to_find,
2363                          int budget,
2364                          bool not_at_start) {
2365  int answer = Length();
2366  if (answer >= still_to_find) return answer;
2367  if (budget <= 0) return answer;
2368  // We are not at start after this node so we set the last argument to 'true'.
2369  return answer + on_success()->EatsAtLeast(still_to_find - answer,
2370                                            budget - 1,
2371                                            true);
2372}
2373
2374
2375int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
2376                                             int budget,
2377                                             bool not_at_start) {
2378  if (budget <= 0) return 0;
2379  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2380  // afterwards.
2381  RegExpNode* node = alternatives_->at(1).node();
2382  return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2383}
2384
2385
2386void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2387    QuickCheckDetails* details,
2388    RegExpCompiler* compiler,
2389    int filled_in,
2390    bool not_at_start) {
2391  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2392  // afterwards.
2393  RegExpNode* node = alternatives_->at(1).node();
2394  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2395}
2396
2397
2398int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2399                                  int budget,
2400                                  RegExpNode* ignore_this_node,
2401                                  bool not_at_start) {
2402  if (budget <= 0) return 0;
2403  int min = 100;
2404  int choice_count = alternatives_->length();
2405  budget = (budget - 1) / choice_count;
2406  for (int i = 0; i < choice_count; i++) {
2407    RegExpNode* node = alternatives_->at(i).node();
2408    if (node == ignore_this_node) continue;
2409    int node_eats_at_least =
2410        node->EatsAtLeast(still_to_find, budget, not_at_start);
2411    if (node_eats_at_least < min) min = node_eats_at_least;
2412    if (min == 0) return 0;
2413  }
2414  return min;
2415}
2416
2417
2418int LoopChoiceNode::EatsAtLeast(int still_to_find,
2419                                int budget,
2420                                bool not_at_start) {
2421  return EatsAtLeastHelper(still_to_find,
2422                           budget - 1,
2423                           loop_node_,
2424                           not_at_start);
2425}
2426
2427
2428int ChoiceNode::EatsAtLeast(int still_to_find,
2429                            int budget,
2430                            bool not_at_start) {
2431  return EatsAtLeastHelper(still_to_find,
2432                           budget,
2433                           NULL,
2434                           not_at_start);
2435}
2436
2437
2438// Takes the left-most 1-bit and smears it out, setting all bits to its right.
2439static inline uint32_t SmearBitsRight(uint32_t v) {
2440  v |= v >> 1;
2441  v |= v >> 2;
2442  v |= v >> 4;
2443  v |= v >> 8;
2444  v |= v >> 16;
2445  return v;
2446}
2447
2448
2449bool QuickCheckDetails::Rationalize(bool asc) {
2450  bool found_useful_op = false;
2451  uint32_t char_mask;
2452  if (asc) {
2453    char_mask = String::kMaxOneByteCharCode;
2454  } else {
2455    char_mask = String::kMaxUtf16CodeUnit;
2456  }
2457  mask_ = 0;
2458  value_ = 0;
2459  int char_shift = 0;
2460  for (int i = 0; i < characters_; i++) {
2461    Position* pos = &positions_[i];
2462    if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2463      found_useful_op = true;
2464    }
2465    mask_ |= (pos->mask & char_mask) << char_shift;
2466    value_ |= (pos->value & char_mask) << char_shift;
2467    char_shift += asc ? 8 : 16;
2468  }
2469  return found_useful_op;
2470}
2471
2472
2473bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2474                                Trace* trace,
2475                                bool preload_has_checked_bounds,
2476                                Label* on_possible_success,
2477                                QuickCheckDetails* details,
2478                                bool fall_through_on_failure) {
2479  if (details->characters() == 0) return false;
2480  GetQuickCheckDetails(
2481      details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2482  if (details->cannot_match()) return false;
2483  if (!details->Rationalize(compiler->ascii())) return false;
2484  ASSERT(details->characters() == 1 ||
2485         compiler->macro_assembler()->CanReadUnaligned());
2486  uint32_t mask = details->mask();
2487  uint32_t value = details->value();
2488
2489  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2490
2491  if (trace->characters_preloaded() != details->characters()) {
2492    assembler->LoadCurrentCharacter(trace->cp_offset(),
2493                                    trace->backtrack(),
2494                                    !preload_has_checked_bounds,
2495                                    details->characters());
2496  }
2497
2498
2499  bool need_mask = true;
2500
2501  if (details->characters() == 1) {
2502    // If number of characters preloaded is 1 then we used a byte or 16 bit
2503    // load so the value is already masked down.
2504    uint32_t char_mask;
2505    if (compiler->ascii()) {
2506      char_mask = String::kMaxOneByteCharCode;
2507    } else {
2508      char_mask = String::kMaxUtf16CodeUnit;
2509    }
2510    if ((mask & char_mask) == char_mask) need_mask = false;
2511    mask &= char_mask;
2512  } else {
2513    // For 2-character preloads in ASCII mode or 1-character preloads in
2514    // TWO_BYTE mode we also use a 16 bit load with zero extend.
2515    if (details->characters() == 2 && compiler->ascii()) {
2516      if ((mask & 0xffff) == 0xffff) need_mask = false;
2517    } else if (details->characters() == 1 && !compiler->ascii()) {
2518      if ((mask & 0xffff) == 0xffff) need_mask = false;
2519    } else {
2520      if (mask == 0xffffffff) need_mask = false;
2521    }
2522  }
2523
2524  if (fall_through_on_failure) {
2525    if (need_mask) {
2526      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2527    } else {
2528      assembler->CheckCharacter(value, on_possible_success);
2529    }
2530  } else {
2531    if (need_mask) {
2532      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2533    } else {
2534      assembler->CheckNotCharacter(value, trace->backtrack());
2535    }
2536  }
2537  return true;
2538}
2539
2540
2541// Here is the meat of GetQuickCheckDetails (see also the comment on the
2542// super-class in the .h file).
2543//
2544// We iterate along the text object, building up for each character a
2545// mask and value that can be used to test for a quick failure to match.
2546// The masks and values for the positions will be combined into a single
2547// machine word for the current character width in order to be used in
2548// generating a quick check.
2549void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2550                                    RegExpCompiler* compiler,
2551                                    int characters_filled_in,
2552                                    bool not_at_start) {
2553  Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2554  ASSERT(characters_filled_in < details->characters());
2555  int characters = details->characters();
2556  int char_mask;
2557  if (compiler->ascii()) {
2558    char_mask = String::kMaxOneByteCharCode;
2559  } else {
2560    char_mask = String::kMaxUtf16CodeUnit;
2561  }
2562  for (int k = 0; k < elms_->length(); k++) {
2563    TextElement elm = elms_->at(k);
2564    if (elm.text_type() == TextElement::ATOM) {
2565      Vector<const uc16> quarks = elm.atom()->data();
2566      for (int i = 0; i < characters && i < quarks.length(); i++) {
2567        QuickCheckDetails::Position* pos =
2568            details->positions(characters_filled_in);
2569        uc16 c = quarks[i];
2570        if (c > char_mask) {
2571          // If we expect a non-ASCII character from an ASCII string,
2572          // there is no way we can match. Not even case independent
2573          // matching can turn an ASCII character into non-ASCII or
2574          // vice versa.
2575          details->set_cannot_match();
2576          pos->determines_perfectly = false;
2577          return;
2578        }
2579        if (compiler->ignore_case()) {
2580          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2581          int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2582                                                 chars);
2583          ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
2584          if (length == 1) {
2585            // This letter has no case equivalents, so it's nice and simple
2586            // and the mask-compare will determine definitely whether we have
2587            // a match at this character position.
2588            pos->mask = char_mask;
2589            pos->value = c;
2590            pos->determines_perfectly = true;
2591          } else {
2592            uint32_t common_bits = char_mask;
2593            uint32_t bits = chars[0];
2594            for (int j = 1; j < length; j++) {
2595              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2596              common_bits ^= differing_bits;
2597              bits &= common_bits;
2598            }
2599            // If length is 2 and common bits has only one zero in it then
2600            // our mask and compare instruction will determine definitely
2601            // whether we have a match at this character position.  Otherwise
2602            // it can only be an approximate check.
2603            uint32_t one_zero = (common_bits | ~char_mask);
2604            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2605              pos->determines_perfectly = true;
2606            }
2607            pos->mask = common_bits;
2608            pos->value = bits;
2609          }
2610        } else {
2611          // Don't ignore case.  Nice simple case where the mask-compare will
2612          // determine definitely whether we have a match at this character
2613          // position.
2614          pos->mask = char_mask;
2615          pos->value = c;
2616          pos->determines_perfectly = true;
2617        }
2618        characters_filled_in++;
2619        ASSERT(characters_filled_in <= details->characters());
2620        if (characters_filled_in == details->characters()) {
2621          return;
2622        }
2623      }
2624    } else {
2625      QuickCheckDetails::Position* pos =
2626          details->positions(characters_filled_in);
2627      RegExpCharacterClass* tree = elm.char_class();
2628      ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2629      if (tree->is_negated()) {
2630        // A quick check uses multi-character mask and compare.  There is no
2631        // useful way to incorporate a negative char class into this scheme
2632        // so we just conservatively create a mask and value that will always
2633        // succeed.
2634        pos->mask = 0;
2635        pos->value = 0;
2636      } else {
2637        int first_range = 0;
2638        while (ranges->at(first_range).from() > char_mask) {
2639          first_range++;
2640          if (first_range == ranges->length()) {
2641            details->set_cannot_match();
2642            pos->determines_perfectly = false;
2643            return;
2644          }
2645        }
2646        CharacterRange range = ranges->at(first_range);
2647        uc16 from = range.from();
2648        uc16 to = range.to();
2649        if (to > char_mask) {
2650          to = char_mask;
2651        }
2652        uint32_t differing_bits = (from ^ to);
2653        // A mask and compare is only perfect if the differing bits form a
2654        // number like 00011111 with one single block of trailing 1s.
2655        if ((differing_bits & (differing_bits + 1)) == 0 &&
2656             from + differing_bits == to) {
2657          pos->determines_perfectly = true;
2658        }
2659        uint32_t common_bits = ~SmearBitsRight(differing_bits);
2660        uint32_t bits = (from & common_bits);
2661        for (int i = first_range + 1; i < ranges->length(); i++) {
2662          CharacterRange range = ranges->at(i);
2663          uc16 from = range.from();
2664          uc16 to = range.to();
2665          if (from > char_mask) continue;
2666          if (to > char_mask) to = char_mask;
2667          // Here we are combining more ranges into the mask and compare
2668          // value.  With each new range the mask becomes more sparse and
2669          // so the chances of a false positive rise.  A character class
2670          // with multiple ranges is assumed never to be equivalent to a
2671          // mask and compare operation.
2672          pos->determines_perfectly = false;
2673          uint32_t new_common_bits = (from ^ to);
2674          new_common_bits = ~SmearBitsRight(new_common_bits);
2675          common_bits &= new_common_bits;
2676          bits &= new_common_bits;
2677          uint32_t differing_bits = (from & common_bits) ^ bits;
2678          common_bits ^= differing_bits;
2679          bits &= common_bits;
2680        }
2681        pos->mask = common_bits;
2682        pos->value = bits;
2683      }
2684      characters_filled_in++;
2685      ASSERT(characters_filled_in <= details->characters());
2686      if (characters_filled_in == details->characters()) {
2687        return;
2688      }
2689    }
2690  }
2691  ASSERT(characters_filled_in != details->characters());
2692  if (!details->cannot_match()) {
2693    on_success()-> GetQuickCheckDetails(details,
2694                                        compiler,
2695                                        characters_filled_in,
2696                                        true);
2697  }
2698}
2699
2700
2701void QuickCheckDetails::Clear() {
2702  for (int i = 0; i < characters_; i++) {
2703    positions_[i].mask = 0;
2704    positions_[i].value = 0;
2705    positions_[i].determines_perfectly = false;
2706  }
2707  characters_ = 0;
2708}
2709
2710
2711void QuickCheckDetails::Advance(int by, bool ascii) {
2712  ASSERT(by >= 0);
2713  if (by >= characters_) {
2714    Clear();
2715    return;
2716  }
2717  for (int i = 0; i < characters_ - by; i++) {
2718    positions_[i] = positions_[by + i];
2719  }
2720  for (int i = characters_ - by; i < characters_; i++) {
2721    positions_[i].mask = 0;
2722    positions_[i].value = 0;
2723    positions_[i].determines_perfectly = false;
2724  }
2725  characters_ -= by;
2726  // We could change mask_ and value_ here but we would never advance unless
2727  // they had already been used in a check and they won't be used again because
2728  // it would gain us nothing.  So there's no point.
2729}
2730
2731
2732void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2733  ASSERT(characters_ == other->characters_);
2734  if (other->cannot_match_) {
2735    return;
2736  }
2737  if (cannot_match_) {
2738    *this = *other;
2739    return;
2740  }
2741  for (int i = from_index; i < characters_; i++) {
2742    QuickCheckDetails::Position* pos = positions(i);
2743    QuickCheckDetails::Position* other_pos = other->positions(i);
2744    if (pos->mask != other_pos->mask ||
2745        pos->value != other_pos->value ||
2746        !other_pos->determines_perfectly) {
2747      // Our mask-compare operation will be approximate unless we have the
2748      // exact same operation on both sides of the alternation.
2749      pos->determines_perfectly = false;
2750    }
2751    pos->mask &= other_pos->mask;
2752    pos->value &= pos->mask;
2753    other_pos->value &= pos->mask;
2754    uc16 differing_bits = (pos->value ^ other_pos->value);
2755    pos->mask &= ~differing_bits;
2756    pos->value &= pos->mask;
2757  }
2758}
2759
2760
2761class VisitMarker {
2762 public:
2763  explicit VisitMarker(NodeInfo* info) : info_(info) {
2764    ASSERT(!info->visited);
2765    info->visited = true;
2766  }
2767  ~VisitMarker() {
2768    info_->visited = false;
2769  }
2770 private:
2771  NodeInfo* info_;
2772};
2773
2774
2775RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
2776  if (info()->replacement_calculated) return replacement();
2777  if (depth < 0) return this;
2778  ASSERT(!info()->visited);
2779  VisitMarker marker(info());
2780  return FilterSuccessor(depth - 1, ignore_case);
2781}
2782
2783
2784RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2785  RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
2786  if (next == NULL) return set_replacement(NULL);
2787  on_success_ = next;
2788  return set_replacement(this);
2789}
2790
2791
2792// We need to check for the following characters: 0x39c 0x3bc 0x178.
2793static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2794  // TODO(dcarney): this could be a lot more efficient.
2795  return range.Contains(0x39c) ||
2796      range.Contains(0x3bc) || range.Contains(0x178);
2797}
2798
2799
2800static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2801  for (int i = 0; i < ranges->length(); i++) {
2802    // TODO(dcarney): this could be a lot more efficient.
2803    if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2804  }
2805  return false;
2806}
2807
2808
2809RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
2810  if (info()->replacement_calculated) return replacement();
2811  if (depth < 0) return this;
2812  ASSERT(!info()->visited);
2813  VisitMarker marker(info());
2814  int element_count = elms_->length();
2815  for (int i = 0; i < element_count; i++) {
2816    TextElement elm = elms_->at(i);
2817    if (elm.text_type() == TextElement::ATOM) {
2818      Vector<const uc16> quarks = elm.atom()->data();
2819      for (int j = 0; j < quarks.length(); j++) {
2820        uint16_t c = quarks[j];
2821        if (c <= String::kMaxOneByteCharCode) continue;
2822        if (!ignore_case) return set_replacement(NULL);
2823        // Here, we need to check for characters whose upper and lower cases
2824        // are outside the Latin-1 range.
2825        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2826        // Character is outside Latin-1 completely
2827        if (converted == 0) return set_replacement(NULL);
2828        // Convert quark to Latin-1 in place.
2829        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2830        copy[j] = converted;
2831      }
2832    } else {
2833      ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2834      RegExpCharacterClass* cc = elm.char_class();
2835      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2836      if (!CharacterRange::IsCanonical(ranges)) {
2837        CharacterRange::Canonicalize(ranges);
2838      }
2839      // Now they are in order so we only need to look at the first.
2840      int range_count = ranges->length();
2841      if (cc->is_negated()) {
2842        if (range_count != 0 &&
2843            ranges->at(0).from() == 0 &&
2844            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2845          // This will be handled in a later filter.
2846          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2847          return set_replacement(NULL);
2848        }
2849      } else {
2850        if (range_count == 0 ||
2851            ranges->at(0).from() > String::kMaxOneByteCharCode) {
2852          // This will be handled in a later filter.
2853          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2854          return set_replacement(NULL);
2855        }
2856      }
2857    }
2858  }
2859  return FilterSuccessor(depth - 1, ignore_case);
2860}
2861
2862
2863RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
2864  if (info()->replacement_calculated) return replacement();
2865  if (depth < 0) return this;
2866  if (info()->visited) return this;
2867  {
2868    VisitMarker marker(info());
2869
2870    RegExpNode* continue_replacement =
2871        continue_node_->FilterASCII(depth - 1, ignore_case);
2872    // If we can't continue after the loop then there is no sense in doing the
2873    // loop.
2874    if (continue_replacement == NULL) return set_replacement(NULL);
2875  }
2876
2877  return ChoiceNode::FilterASCII(depth - 1, ignore_case);
2878}
2879
2880
2881RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
2882  if (info()->replacement_calculated) return replacement();
2883  if (depth < 0) return this;
2884  if (info()->visited) return this;
2885  VisitMarker marker(info());
2886  int choice_count = alternatives_->length();
2887
2888  for (int i = 0; i < choice_count; i++) {
2889    GuardedAlternative alternative = alternatives_->at(i);
2890    if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2891      set_replacement(this);
2892      return this;
2893    }
2894  }
2895
2896  int surviving = 0;
2897  RegExpNode* survivor = NULL;
2898  for (int i = 0; i < choice_count; i++) {
2899    GuardedAlternative alternative = alternatives_->at(i);
2900    RegExpNode* replacement =
2901        alternative.node()->FilterASCII(depth - 1, ignore_case);
2902    ASSERT(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2903    if (replacement != NULL) {
2904      alternatives_->at(i).set_node(replacement);
2905      surviving++;
2906      survivor = replacement;
2907    }
2908  }
2909  if (surviving < 2) return set_replacement(survivor);
2910
2911  set_replacement(this);
2912  if (surviving == choice_count) {
2913    return this;
2914  }
2915  // Only some of the nodes survived the filtering.  We need to rebuild the
2916  // alternatives list.
2917  ZoneList<GuardedAlternative>* new_alternatives =
2918      new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2919  for (int i = 0; i < choice_count; i++) {
2920    RegExpNode* replacement =
2921        alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
2922    if (replacement != NULL) {
2923      alternatives_->at(i).set_node(replacement);
2924      new_alternatives->Add(alternatives_->at(i), zone());
2925    }
2926  }
2927  alternatives_ = new_alternatives;
2928  return this;
2929}
2930
2931
2932RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
2933                                                     bool ignore_case) {
2934  if (info()->replacement_calculated) return replacement();
2935  if (depth < 0) return this;
2936  if (info()->visited) return this;
2937  VisitMarker marker(info());
2938  // Alternative 0 is the negative lookahead, alternative 1 is what comes
2939  // afterwards.
2940  RegExpNode* node = alternatives_->at(1).node();
2941  RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
2942  if (replacement == NULL) return set_replacement(NULL);
2943  alternatives_->at(1).set_node(replacement);
2944
2945  RegExpNode* neg_node = alternatives_->at(0).node();
2946  RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
2947  // If the negative lookahead is always going to fail then
2948  // we don't need to check it.
2949  if (neg_replacement == NULL) return set_replacement(replacement);
2950  alternatives_->at(0).set_node(neg_replacement);
2951  return set_replacement(this);
2952}
2953
2954
2955void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2956                                          RegExpCompiler* compiler,
2957                                          int characters_filled_in,
2958                                          bool not_at_start) {
2959  if (body_can_be_zero_length_ || info()->visited) return;
2960  VisitMarker marker(info());
2961  return ChoiceNode::GetQuickCheckDetails(details,
2962                                          compiler,
2963                                          characters_filled_in,
2964                                          not_at_start);
2965}
2966
2967
2968void LoopChoiceNode::FillInBMInfo(int offset,
2969                                  int budget,
2970                                  BoyerMooreLookahead* bm,
2971                                  bool not_at_start) {
2972  if (body_can_be_zero_length_ || budget <= 0) {
2973    bm->SetRest(offset);
2974    SaveBMInfo(bm, not_at_start, offset);
2975    return;
2976  }
2977  ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2978  SaveBMInfo(bm, not_at_start, offset);
2979}
2980
2981
2982void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2983                                      RegExpCompiler* compiler,
2984                                      int characters_filled_in,
2985                                      bool not_at_start) {
2986  not_at_start = (not_at_start || not_at_start_);
2987  int choice_count = alternatives_->length();
2988  ASSERT(choice_count > 0);
2989  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2990                                                    compiler,
2991                                                    characters_filled_in,
2992                                                    not_at_start);
2993  for (int i = 1; i < choice_count; i++) {
2994    QuickCheckDetails new_details(details->characters());
2995    RegExpNode* node = alternatives_->at(i).node();
2996    node->GetQuickCheckDetails(&new_details, compiler,
2997                               characters_filled_in,
2998                               not_at_start);
2999    // Here we merge the quick match details of the two branches.
3000    details->Merge(&new_details, characters_filled_in);
3001  }
3002}
3003
3004
3005// Check for [0-9A-Z_a-z].
3006static void EmitWordCheck(RegExpMacroAssembler* assembler,
3007                          Label* word,
3008                          Label* non_word,
3009                          bool fall_through_on_word) {
3010  if (assembler->CheckSpecialCharacterClass(
3011          fall_through_on_word ? 'w' : 'W',
3012          fall_through_on_word ? non_word : word)) {
3013    // Optimized implementation available.
3014    return;
3015  }
3016  assembler->CheckCharacterGT('z', non_word);
3017  assembler->CheckCharacterLT('0', non_word);
3018  assembler->CheckCharacterGT('a' - 1, word);
3019  assembler->CheckCharacterLT('9' + 1, word);
3020  assembler->CheckCharacterLT('A', non_word);
3021  assembler->CheckCharacterLT('Z' + 1, word);
3022  if (fall_through_on_word) {
3023    assembler->CheckNotCharacter('_', non_word);
3024  } else {
3025    assembler->CheckCharacter('_', word);
3026  }
3027}
3028
3029
3030// Emit the code to check for a ^ in multiline mode (1-character lookbehind
3031// that matches newline or the start of input).
3032static void EmitHat(RegExpCompiler* compiler,
3033                    RegExpNode* on_success,
3034                    Trace* trace) {
3035  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3036  // We will be loading the previous character into the current character
3037  // register.
3038  Trace new_trace(*trace);
3039  new_trace.InvalidateCurrentCharacter();
3040
3041  Label ok;
3042  if (new_trace.cp_offset() == 0) {
3043    // The start of input counts as a newline in this context, so skip to
3044    // ok if we are at the start.
3045    assembler->CheckAtStart(&ok);
3046  }
3047  // We already checked that we are not at the start of input so it must be
3048  // OK to load the previous character.
3049  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3050                                  new_trace.backtrack(),
3051                                  false);
3052  if (!assembler->CheckSpecialCharacterClass('n',
3053                                             new_trace.backtrack())) {
3054    // Newline means \n, \r, 0x2028 or 0x2029.
3055    if (!compiler->ascii()) {
3056      assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3057    }
3058    assembler->CheckCharacter('\n', &ok);
3059    assembler->CheckNotCharacter('\r', new_trace.backtrack());
3060  }
3061  assembler->Bind(&ok);
3062  on_success->Emit(compiler, &new_trace);
3063}
3064
3065
3066// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3067void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3068  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3069  Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3070  bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3071  BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3072  if (lookahead == NULL) {
3073    int eats_at_least =
3074        Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3075                                                    kRecursionBudget,
3076                                                    not_at_start));
3077    if (eats_at_least >= 1) {
3078      BoyerMooreLookahead* bm =
3079          new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3080      FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3081      if (bm->at(0)->is_non_word())
3082        next_is_word_character = Trace::FALSE_VALUE;
3083      if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3084    }
3085  } else {
3086    if (lookahead->at(0)->is_non_word())
3087      next_is_word_character = Trace::FALSE_VALUE;
3088    if (lookahead->at(0)->is_word())
3089      next_is_word_character = Trace::TRUE_VALUE;
3090  }
3091  bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3092  if (next_is_word_character == Trace::UNKNOWN) {
3093    Label before_non_word;
3094    Label before_word;
3095    if (trace->characters_preloaded() != 1) {
3096      assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3097    }
3098    // Fall through on non-word.
3099    EmitWordCheck(assembler, &before_word, &before_non_word, false);
3100    // Next character is not a word character.
3101    assembler->Bind(&before_non_word);
3102    Label ok;
3103    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3104    assembler->GoTo(&ok);
3105
3106    assembler->Bind(&before_word);
3107    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3108    assembler->Bind(&ok);
3109  } else if (next_is_word_character == Trace::TRUE_VALUE) {
3110    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3111  } else {
3112    ASSERT(next_is_word_character == Trace::FALSE_VALUE);
3113    BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3114  }
3115}
3116
3117
3118void AssertionNode::BacktrackIfPrevious(
3119    RegExpCompiler* compiler,
3120    Trace* trace,
3121    AssertionNode::IfPrevious backtrack_if_previous) {
3122  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3123  Trace new_trace(*trace);
3124  new_trace.InvalidateCurrentCharacter();
3125
3126  Label fall_through, dummy;
3127
3128  Label* non_word = backtrack_if_previous == kIsNonWord ?
3129                    new_trace.backtrack() :
3130                    &fall_through;
3131  Label* word = backtrack_if_previous == kIsNonWord ?
3132                &fall_through :
3133                new_trace.backtrack();
3134
3135  if (new_trace.cp_offset() == 0) {
3136    // The start of input counts as a non-word character, so the question is
3137    // decided if we are at the start.
3138    assembler->CheckAtStart(non_word);
3139  }
3140  // We already checked that we are not at the start of input so it must be
3141  // OK to load the previous character.
3142  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3143  EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3144
3145  assembler->Bind(&fall_through);
3146  on_success()->Emit(compiler, &new_trace);
3147}
3148
3149
3150void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3151                                         RegExpCompiler* compiler,
3152                                         int filled_in,
3153                                         bool not_at_start) {
3154  if (assertion_type_ == AT_START && not_at_start) {
3155    details->set_cannot_match();
3156    return;
3157  }
3158  return on_success()->GetQuickCheckDetails(details,
3159                                            compiler,
3160                                            filled_in,
3161                                            not_at_start);
3162}
3163
3164
3165void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3166  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3167  switch (assertion_type_) {
3168    case AT_END: {
3169      Label ok;
3170      assembler->CheckPosition(trace->cp_offset(), &ok);
3171      assembler->GoTo(trace->backtrack());
3172      assembler->Bind(&ok);
3173      break;
3174    }
3175    case AT_START: {
3176      if (trace->at_start() == Trace::FALSE_VALUE) {
3177        assembler->GoTo(trace->backtrack());
3178        return;
3179      }
3180      if (trace->at_start() == Trace::UNKNOWN) {
3181        assembler->CheckNotAtStart(trace->backtrack());
3182        Trace at_start_trace = *trace;
3183        at_start_trace.set_at_start(true);
3184        on_success()->Emit(compiler, &at_start_trace);
3185        return;
3186      }
3187    }
3188    break;
3189    case AFTER_NEWLINE:
3190      EmitHat(compiler, on_success(), trace);
3191      return;
3192    case AT_BOUNDARY:
3193    case AT_NON_BOUNDARY: {
3194      EmitBoundaryCheck(compiler, trace);
3195      return;
3196    }
3197  }
3198  on_success()->Emit(compiler, trace);
3199}
3200
3201
3202static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3203  if (quick_check == NULL) return false;
3204  if (offset >= quick_check->characters()) return false;
3205  return quick_check->positions(offset)->determines_perfectly;
3206}
3207
3208
3209static void UpdateBoundsCheck(int index, int* checked_up_to) {
3210  if (index > *checked_up_to) {
3211    *checked_up_to = index;
3212  }
3213}
3214
3215
3216// We call this repeatedly to generate code for each pass over the text node.
3217// The passes are in increasing order of difficulty because we hope one
3218// of the first passes will fail in which case we are saved the work of the
3219// later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3220// we will check the '%' in the first pass, the case independent 'a' in the
3221// second pass and the character class in the last pass.
3222//
3223// The passes are done from right to left, so for example to test for /bar/
3224// we will first test for an 'r' with offset 2, then an 'a' with offset 1
3225// and then a 'b' with offset 0.  This means we can avoid the end-of-input
3226// bounds check most of the time.  In the example we only need to check for
3227// end-of-input when loading the putative 'r'.
3228//
3229// A slight complication involves the fact that the first character may already
3230// be fetched into a register by the previous node.  In this case we want to
3231// do the test for that character first.  We do this in separate passes.  The
3232// 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3233// pass has been performed then subsequent passes will have true in
3234// first_element_checked to indicate that that character does not need to be
3235// checked again.
3236//
3237// In addition to all this we are passed a Trace, which can
3238// contain an AlternativeGeneration object.  In this AlternativeGeneration
3239// object we can see details of any quick check that was already passed in
3240// order to get to the code we are now generating.  The quick check can involve
3241// loading characters, which means we do not need to recheck the bounds
3242// up to the limit the quick check already checked.  In addition the quick
3243// check can have involved a mask and compare operation which may simplify
3244// or obviate the need for further checks at some character positions.
3245void TextNode::TextEmitPass(RegExpCompiler* compiler,
3246                            TextEmitPassType pass,
3247                            bool preloaded,
3248                            Trace* trace,
3249                            bool first_element_checked,
3250                            int* checked_up_to) {
3251  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3252  Isolate* isolate = assembler->zone()->isolate();
3253  bool ascii = compiler->ascii();
3254  Label* backtrack = trace->backtrack();
3255  QuickCheckDetails* quick_check = trace->quick_check_performed();
3256  int element_count = elms_->length();
3257  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3258    TextElement elm = elms_->at(i);
3259    int cp_offset = trace->cp_offset() + elm.cp_offset();
3260    if (elm.text_type() == TextElement::ATOM) {
3261      Vector<const uc16> quarks = elm.atom()->data();
3262      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3263        if (first_element_checked && i == 0 && j == 0) continue;
3264        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3265        EmitCharacterFunction* emit_function = NULL;
3266        switch (pass) {
3267          case NON_ASCII_MATCH:
3268            ASSERT(ascii);
3269            if (quarks[j] > String::kMaxOneByteCharCode) {
3270              assembler->GoTo(backtrack);
3271              return;
3272            }
3273            break;
3274          case NON_LETTER_CHARACTER_MATCH:
3275            emit_function = &EmitAtomNonLetter;
3276            break;
3277          case SIMPLE_CHARACTER_MATCH:
3278            emit_function = &EmitSimpleCharacter;
3279            break;
3280          case CASE_CHARACTER_MATCH:
3281            emit_function = &EmitAtomLetter;
3282            break;
3283          default:
3284            break;
3285        }
3286        if (emit_function != NULL) {
3287          bool bound_checked = emit_function(isolate,
3288                                             compiler,
3289                                             quarks[j],
3290                                             backtrack,
3291                                             cp_offset + j,
3292                                             *checked_up_to < cp_offset + j,
3293                                             preloaded);
3294          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3295        }
3296      }
3297    } else {
3298      ASSERT_EQ(TextElement::CHAR_CLASS, elm.text_type());
3299      if (pass == CHARACTER_CLASS_MATCH) {
3300        if (first_element_checked && i == 0) continue;
3301        if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3302        RegExpCharacterClass* cc = elm.char_class();
3303        EmitCharClass(assembler,
3304                      cc,
3305                      ascii,
3306                      backtrack,
3307                      cp_offset,
3308                      *checked_up_to < cp_offset,
3309                      preloaded,
3310                      zone());
3311        UpdateBoundsCheck(cp_offset, checked_up_to);
3312      }
3313    }
3314  }
3315}
3316
3317
3318int TextNode::Length() {
3319  TextElement elm = elms_->last();
3320  ASSERT(elm.cp_offset() >= 0);
3321  return elm.cp_offset() + elm.length();
3322}
3323
3324
3325bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3326  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3327  if (ignore_case) {
3328    return pass == SIMPLE_CHARACTER_MATCH;
3329  } else {
3330    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3331  }
3332}
3333
3334
3335// This generates the code to match a text node.  A text node can contain
3336// straight character sequences (possibly to be matched in a case-independent
3337// way) and character classes.  For efficiency we do not do this in a single
3338// pass from left to right.  Instead we pass over the text node several times,
3339// emitting code for some character positions every time.  See the comment on
3340// TextEmitPass for details.
3341void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3342  LimitResult limit_result = LimitVersions(compiler, trace);
3343  if (limit_result == DONE) return;
3344  ASSERT(limit_result == CONTINUE);
3345
3346  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3347    compiler->SetRegExpTooBig();
3348    return;
3349  }
3350
3351  if (compiler->ascii()) {
3352    int dummy = 0;
3353    TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
3354  }
3355
3356  bool first_elt_done = false;
3357  int bound_checked_to = trace->cp_offset() - 1;
3358  bound_checked_to += trace->bound_checked_up_to();
3359
3360  // If a character is preloaded into the current character register then
3361  // check that now.
3362  if (trace->characters_preloaded() == 1) {
3363    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3364      if (!SkipPass(pass, compiler->ignore_case())) {
3365        TextEmitPass(compiler,
3366                     static_cast<TextEmitPassType>(pass),
3367                     true,
3368                     trace,
3369                     false,
3370                     &bound_checked_to);
3371      }
3372    }
3373    first_elt_done = true;
3374  }
3375
3376  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3377    if (!SkipPass(pass, compiler->ignore_case())) {
3378      TextEmitPass(compiler,
3379                   static_cast<TextEmitPassType>(pass),
3380                   false,
3381                   trace,
3382                   first_elt_done,
3383                   &bound_checked_to);
3384    }
3385  }
3386
3387  Trace successor_trace(*trace);
3388  successor_trace.set_at_start(false);
3389  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3390  RecursionCheck rc(compiler);
3391  on_success()->Emit(compiler, &successor_trace);
3392}
3393
3394
3395void Trace::InvalidateCurrentCharacter() {
3396  characters_preloaded_ = 0;
3397}
3398
3399
3400void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3401  ASSERT(by > 0);
3402  // We don't have an instruction for shifting the current character register
3403  // down or for using a shifted value for anything so lets just forget that
3404  // we preloaded any characters into it.
3405  characters_preloaded_ = 0;
3406  // Adjust the offsets of the quick check performed information.  This
3407  // information is used to find out what we already determined about the
3408  // characters by means of mask and compare.
3409  quick_check_performed_.Advance(by, compiler->ascii());
3410  cp_offset_ += by;
3411  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3412    compiler->SetRegExpTooBig();
3413    cp_offset_ = 0;
3414  }
3415  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3416}
3417
3418
3419void TextNode::MakeCaseIndependent(bool is_ascii) {
3420  int element_count = elms_->length();
3421  for (int i = 0; i < element_count; i++) {
3422    TextElement elm = elms_->at(i);
3423    if (elm.text_type() == TextElement::CHAR_CLASS) {
3424      RegExpCharacterClass* cc = elm.char_class();
3425      // None of the standard character classes is different in the case
3426      // independent case and it slows us down if we don't know that.
3427      if (cc->is_standard(zone())) continue;
3428      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3429      int range_count = ranges->length();
3430      for (int j = 0; j < range_count; j++) {
3431        ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone());
3432      }
3433    }
3434  }
3435}
3436
3437
3438int TextNode::GreedyLoopTextLength() {
3439  TextElement elm = elms_->at(elms_->length() - 1);
3440  return elm.cp_offset() + elm.length();
3441}
3442
3443
3444RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3445    RegExpCompiler* compiler) {
3446  if (elms_->length() != 1) return NULL;
3447  TextElement elm = elms_->at(0);
3448  if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3449  RegExpCharacterClass* node = elm.char_class();
3450  ZoneList<CharacterRange>* ranges = node->ranges(zone());
3451  if (!CharacterRange::IsCanonical(ranges)) {
3452    CharacterRange::Canonicalize(ranges);
3453  }
3454  if (node->is_negated()) {
3455    return ranges->length() == 0 ? on_success() : NULL;
3456  }
3457  if (ranges->length() != 1) return NULL;
3458  uint32_t max_char;
3459  if (compiler->ascii()) {
3460    max_char = String::kMaxOneByteCharCode;
3461  } else {
3462    max_char = String::kMaxUtf16CodeUnit;
3463  }
3464  return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3465}
3466
3467
3468// Finds the fixed match length of a sequence of nodes that goes from
3469// this alternative and back to this choice node.  If there are variable
3470// length nodes or other complications in the way then return a sentinel
3471// value indicating that a greedy loop cannot be constructed.
3472int ChoiceNode::GreedyLoopTextLengthForAlternative(
3473    GuardedAlternative* alternative) {
3474  int length = 0;
3475  RegExpNode* node = alternative->node();
3476  // Later we will generate code for all these text nodes using recursion
3477  // so we have to limit the max number.
3478  int recursion_depth = 0;
3479  while (node != this) {
3480    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3481      return kNodeIsTooComplexForGreedyLoops;
3482    }
3483    int node_length = node->GreedyLoopTextLength();
3484    if (node_length == kNodeIsTooComplexForGreedyLoops) {
3485      return kNodeIsTooComplexForGreedyLoops;
3486    }
3487    length += node_length;
3488    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3489    node = seq_node->on_success();
3490  }
3491  return length;
3492}
3493
3494
3495void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3496  ASSERT_EQ(loop_node_, NULL);
3497  AddAlternative(alt);
3498  loop_node_ = alt.node();
3499}
3500
3501
3502void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3503  ASSERT_EQ(continue_node_, NULL);
3504  AddAlternative(alt);
3505  continue_node_ = alt.node();
3506}
3507
3508
3509void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3510  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3511  if (trace->stop_node() == this) {
3512    int text_length =
3513        GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3514    ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
3515    // Update the counter-based backtracking info on the stack.  This is an
3516    // optimization for greedy loops (see below).
3517    ASSERT(trace->cp_offset() == text_length);
3518    macro_assembler->AdvanceCurrentPosition(text_length);
3519    macro_assembler->GoTo(trace->loop_label());
3520    return;
3521  }
3522  ASSERT(trace->stop_node() == NULL);
3523  if (!trace->is_trivial()) {
3524    trace->Flush(compiler, this);
3525    return;
3526  }
3527  ChoiceNode::Emit(compiler, trace);
3528}
3529
3530
3531int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3532                                           int eats_at_least) {
3533  int preload_characters = Min(4, eats_at_least);
3534  if (compiler->macro_assembler()->CanReadUnaligned()) {
3535    bool ascii = compiler->ascii();
3536    if (ascii) {
3537      if (preload_characters > 4) preload_characters = 4;
3538      // We can't preload 3 characters because there is no machine instruction
3539      // to do that.  We can't just load 4 because we could be reading
3540      // beyond the end of the string, which could cause a memory fault.
3541      if (preload_characters == 3) preload_characters = 2;
3542    } else {
3543      if (preload_characters > 2) preload_characters = 2;
3544    }
3545  } else {
3546    if (preload_characters > 1) preload_characters = 1;
3547  }
3548  return preload_characters;
3549}
3550
3551
3552// This class is used when generating the alternatives in a choice node.  It
3553// records the way the alternative is being code generated.
3554class AlternativeGeneration: public Malloced {
3555 public:
3556  AlternativeGeneration()
3557      : possible_success(),
3558        expects_preload(false),
3559        after(),
3560        quick_check_details() { }
3561  Label possible_success;
3562  bool expects_preload;
3563  Label after;
3564  QuickCheckDetails quick_check_details;
3565};
3566
3567
3568// Creates a list of AlternativeGenerations.  If the list has a reasonable
3569// size then it is on the stack, otherwise the excess is on the heap.
3570class AlternativeGenerationList {
3571 public:
3572  AlternativeGenerationList(int count, Zone* zone)
3573      : alt_gens_(count, zone) {
3574    for (int i = 0; i < count && i < kAFew; i++) {
3575      alt_gens_.Add(a_few_alt_gens_ + i, zone);
3576    }
3577    for (int i = kAFew; i < count; i++) {
3578      alt_gens_.Add(new AlternativeGeneration(), zone);
3579    }
3580  }
3581  ~AlternativeGenerationList() {
3582    for (int i = kAFew; i < alt_gens_.length(); i++) {
3583      delete alt_gens_[i];
3584      alt_gens_[i] = NULL;
3585    }
3586  }
3587
3588  AlternativeGeneration* at(int i) {
3589    return alt_gens_[i];
3590  }
3591
3592 private:
3593  static const int kAFew = 10;
3594  ZoneList<AlternativeGeneration*> alt_gens_;
3595  AlternativeGeneration a_few_alt_gens_[kAFew];
3596};
3597
3598
3599// The '2' variant is has inclusive from and exclusive to.
3600static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
3601    0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
3602    0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
3603static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3604
3605static const int kWordRanges[] = {
3606    '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3607static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3608static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3609static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3610static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3611static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3612static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3613    0x2028, 0x202A, 0x10000 };
3614static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3615
3616
3617void BoyerMoorePositionInfo::Set(int character) {
3618  SetInterval(Interval(character, character));
3619}
3620
3621
3622void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3623  s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3624  w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3625  d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3626  surrogate_ =
3627      AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3628  if (interval.to() - interval.from() >= kMapSize - 1) {
3629    if (map_count_ != kMapSize) {
3630      map_count_ = kMapSize;
3631      for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3632    }
3633    return;
3634  }
3635  for (int i = interval.from(); i <= interval.to(); i++) {
3636    int mod_character = (i & kMask);
3637    if (!map_->at(mod_character)) {
3638      map_count_++;
3639      map_->at(mod_character) = true;
3640    }
3641    if (map_count_ == kMapSize) return;
3642  }
3643}
3644
3645
3646void BoyerMoorePositionInfo::SetAll() {
3647  s_ = w_ = d_ = kLatticeUnknown;
3648  if (map_count_ != kMapSize) {
3649    map_count_ = kMapSize;
3650    for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3651  }
3652}
3653
3654
3655BoyerMooreLookahead::BoyerMooreLookahead(
3656    int length, RegExpCompiler* compiler, Zone* zone)
3657    : length_(length),
3658      compiler_(compiler) {
3659  if (compiler->ascii()) {
3660    max_char_ = String::kMaxOneByteCharCode;
3661  } else {
3662    max_char_ = String::kMaxUtf16CodeUnit;
3663  }
3664  bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3665  for (int i = 0; i < length; i++) {
3666    bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3667  }
3668}
3669
3670
3671// Find the longest range of lookahead that has the fewest number of different
3672// characters that can occur at a given position.  Since we are optimizing two
3673// different parameters at once this is a tradeoff.
3674bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3675  int biggest_points = 0;
3676  // If more than 32 characters out of 128 can occur it is unlikely that we can
3677  // be lucky enough to step forwards much of the time.
3678  const int kMaxMax = 32;
3679  for (int max_number_of_chars = 4;
3680       max_number_of_chars < kMaxMax;
3681       max_number_of_chars *= 2) {
3682    biggest_points =
3683        FindBestInterval(max_number_of_chars, biggest_points, from, to);
3684  }
3685  if (biggest_points == 0) return false;
3686  return true;
3687}
3688
3689
3690// Find the highest-points range between 0 and length_ where the character
3691// information is not too vague.  'Too vague' means that there are more than
3692// max_number_of_chars that can occur at this position.  Calculates the number
3693// of points as the product of width-of-the-range and
3694// probability-of-finding-one-of-the-characters, where the probability is
3695// calculated using the frequency distribution of the sample subject string.
3696int BoyerMooreLookahead::FindBestInterval(
3697    int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3698  int biggest_points = old_biggest_points;
3699  static const int kSize = RegExpMacroAssembler::kTableSize;
3700  for (int i = 0; i < length_; ) {
3701    while (i < length_ && Count(i) > max_number_of_chars) i++;
3702    if (i == length_) break;
3703    int remembered_from = i;
3704    bool union_map[kSize];
3705    for (int j = 0; j < kSize; j++) union_map[j] = false;
3706    while (i < length_ && Count(i) <= max_number_of_chars) {
3707      BoyerMoorePositionInfo* map = bitmaps_->at(i);
3708      for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3709      i++;
3710    }
3711    int frequency = 0;
3712    for (int j = 0; j < kSize; j++) {
3713      if (union_map[j]) {
3714        // Add 1 to the frequency to give a small per-character boost for
3715        // the cases where our sampling is not good enough and many
3716        // characters have a frequency of zero.  This means the frequency
3717        // can theoretically be up to 2*kSize though we treat it mostly as
3718        // a fraction of kSize.
3719        frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3720      }
3721    }
3722    // We use the probability of skipping times the distance we are skipping to
3723    // judge the effectiveness of this.  Actually we have a cut-off:  By
3724    // dividing by 2 we switch off the skipping if the probability of skipping
3725    // is less than 50%.  This is because the multibyte mask-and-compare
3726    // skipping in quickcheck is more likely to do well on this case.
3727    bool in_quickcheck_range = ((i - remembered_from < 4) ||
3728        (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3729    // Called 'probability' but it is only a rough estimate and can actually
3730    // be outside the 0-kSize range.
3731    int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3732    int points = (i - remembered_from) * probability;
3733    if (points > biggest_points) {
3734      *from = remembered_from;
3735      *to = i - 1;
3736      biggest_points = points;
3737    }
3738  }
3739  return biggest_points;
3740}
3741
3742
3743// Take all the characters that will not prevent a successful match if they
3744// occur in the subject string in the range between min_lookahead and
3745// max_lookahead (inclusive) measured from the current position.  If the
3746// character at max_lookahead offset is not one of these characters, then we
3747// can safely skip forwards by the number of characters in the range.
3748int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3749                                      int max_lookahead,
3750                                      Handle<ByteArray> boolean_skip_table) {
3751  const int kSize = RegExpMacroAssembler::kTableSize;
3752
3753  const int kSkipArrayEntry = 0;
3754  const int kDontSkipArrayEntry = 1;
3755
3756  for (int i = 0; i < kSize; i++) {
3757    boolean_skip_table->set(i, kSkipArrayEntry);
3758  }
3759  int skip = max_lookahead + 1 - min_lookahead;
3760
3761  for (int i = max_lookahead; i >= min_lookahead; i--) {
3762    BoyerMoorePositionInfo* map = bitmaps_->at(i);
3763    for (int j = 0; j < kSize; j++) {
3764      if (map->at(j)) {
3765        boolean_skip_table->set(j, kDontSkipArrayEntry);
3766      }
3767    }
3768  }
3769
3770  return skip;
3771}
3772
3773
3774// See comment above on the implementation of GetSkipTable.
3775bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3776  const int kSize = RegExpMacroAssembler::kTableSize;
3777
3778  int min_lookahead = 0;
3779  int max_lookahead = 0;
3780
3781  if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3782
3783  bool found_single_character = false;
3784  int single_character = 0;
3785  for (int i = max_lookahead; i >= min_lookahead; i--) {
3786    BoyerMoorePositionInfo* map = bitmaps_->at(i);
3787    if (map->map_count() > 1 ||
3788        (found_single_character && map->map_count() != 0)) {
3789      found_single_character = false;
3790      break;
3791    }
3792    for (int j = 0; j < kSize; j++) {
3793      if (map->at(j)) {
3794        found_single_character = true;
3795        single_character = j;
3796        break;
3797      }
3798    }
3799  }
3800
3801  int lookahead_width = max_lookahead + 1 - min_lookahead;
3802
3803  if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3804    // The mask-compare can probably handle this better.
3805    return false;
3806  }
3807
3808  if (found_single_character) {
3809    Label cont, again;
3810    masm->Bind(&again);
3811    masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3812    if (max_char_ > kSize) {
3813      masm->CheckCharacterAfterAnd(single_character,
3814                                   RegExpMacroAssembler::kTableMask,
3815                                   &cont);
3816    } else {
3817      masm->CheckCharacter(single_character, &cont);
3818    }
3819    masm->AdvanceCurrentPosition(lookahead_width);
3820    masm->GoTo(&again);
3821    masm->Bind(&cont);
3822    return true;
3823  }
3824
3825  Factory* factory = masm->zone()->isolate()->factory();
3826  Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3827  int skip_distance = GetSkipTable(
3828      min_lookahead, max_lookahead, boolean_skip_table);
3829  ASSERT(skip_distance != 0);
3830
3831  Label cont, again;
3832  masm->Bind(&again);
3833  masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3834  masm->CheckBitInTable(boolean_skip_table, &cont);
3835  masm->AdvanceCurrentPosition(skip_distance);
3836  masm->GoTo(&again);
3837  masm->Bind(&cont);
3838
3839  return true;
3840}
3841
3842
3843/* Code generation for choice nodes.
3844 *
3845 * We generate quick checks that do a mask and compare to eliminate a
3846 * choice.  If the quick check succeeds then it jumps to the continuation to
3847 * do slow checks and check subsequent nodes.  If it fails (the common case)
3848 * it falls through to the next choice.
3849 *
3850 * Here is the desired flow graph.  Nodes directly below each other imply
3851 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3852 * 3 doesn't have a quick check so we have to call the slow check.
3853 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3854 * regexp continuation is generated directly after the Sn node, up to the
3855 * next GoTo if we decide to reuse some already generated code.  Some
3856 * nodes expect preload_characters to be preloaded into the current
3857 * character register.  R nodes do this preloading.  Vertices are marked
3858 * F for failures and S for success (possible success in the case of quick
3859 * nodes).  L, V, < and > are used as arrow heads.
3860 *
3861 * ----------> R
3862 *             |
3863 *             V
3864 *            Q1 -----> S1
3865 *             |   S   /
3866 *            F|      /
3867 *             |    F/
3868 *             |    /
3869 *             |   R
3870 *             |  /
3871 *             V L
3872 *            Q2 -----> S2
3873 *             |   S   /
3874 *            F|      /
3875 *             |    F/
3876 *             |    /
3877 *             |   R
3878 *             |  /
3879 *             V L
3880 *            S3
3881 *             |
3882 *            F|
3883 *             |
3884 *             R
3885 *             |
3886 * backtrack   V
3887 * <----------Q4
3888 *   \    F    |
3889 *    \        |S
3890 *     \   F   V
3891 *      \-----S4
3892 *
3893 * For greedy loops we reverse our expectation and expect to match rather
3894 * than fail. Therefore we want the loop code to look like this (U is the
3895 * unwind code that steps back in the greedy loop).  The following alternatives
3896 * look the same as above.
3897 *              _____
3898 *             /     \
3899 *             V     |
3900 * ----------> S1    |
3901 *            /|     |
3902 *           / |S    |
3903 *         F/  \_____/
3904 *         /
3905 *        |<-----------
3906 *        |            \
3907 *        V             \
3908 *        Q2 ---> S2     \
3909 *        |  S   /       |
3910 *       F|     /        |
3911 *        |   F/         |
3912 *        |   /          |
3913 *        |  R           |
3914 *        | /            |
3915 *   F    VL             |
3916 * <------U              |
3917 * back   |S             |
3918 *        \______________/
3919 */
3920
3921void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3922  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3923  int choice_count = alternatives_->length();
3924#ifdef DEBUG
3925  for (int i = 0; i < choice_count - 1; i++) {
3926    GuardedAlternative alternative = alternatives_->at(i);
3927    ZoneList<Guard*>* guards = alternative.guards();
3928    int guard_count = (guards == NULL) ? 0 : guards->length();
3929    for (int j = 0; j < guard_count; j++) {
3930      ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
3931    }
3932  }
3933#endif
3934
3935  LimitResult limit_result = LimitVersions(compiler, trace);
3936  if (limit_result == DONE) return;
3937  ASSERT(limit_result == CONTINUE);
3938
3939  int new_flush_budget = trace->flush_budget() / choice_count;
3940  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3941    trace->Flush(compiler, this);
3942    return;
3943  }
3944
3945  RecursionCheck rc(compiler);
3946
3947  Trace* current_trace = trace;
3948
3949  int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3950  bool greedy_loop = false;
3951  Label greedy_loop_label;
3952  Trace counter_backtrack_trace;
3953  counter_backtrack_trace.set_backtrack(&greedy_loop_label);
3954  if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3955
3956  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3957    // Here we have special handling for greedy loops containing only text nodes
3958    // and other simple nodes.  These are handled by pushing the current
3959    // position on the stack and then incrementing the current position each
3960    // time around the switch.  On backtrack we decrement the current position
3961    // and check it against the pushed value.  This avoids pushing backtrack
3962    // information for each iteration of the loop, which could take up a lot of
3963    // space.
3964    greedy_loop = true;
3965    ASSERT(trace->stop_node() == NULL);
3966    macro_assembler->PushCurrentPosition();
3967    current_trace = &counter_backtrack_trace;
3968    Label greedy_match_failed;
3969    Trace greedy_match_trace;
3970    if (not_at_start()) greedy_match_trace.set_at_start(false);
3971    greedy_match_trace.set_backtrack(&greedy_match_failed);
3972    Label loop_label;
3973    macro_assembler->Bind(&loop_label);
3974    greedy_match_trace.set_stop_node(this);
3975    greedy_match_trace.set_loop_label(&loop_label);
3976    alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3977    macro_assembler->Bind(&greedy_match_failed);
3978  }
3979
3980  Label second_choice;  // For use in greedy matches.
3981  macro_assembler->Bind(&second_choice);
3982
3983  int first_normal_choice = greedy_loop ? 1 : 0;
3984
3985  bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
3986  const int kEatsAtLeastNotYetInitialized = -1;
3987  int eats_at_least = kEatsAtLeastNotYetInitialized;
3988
3989  bool skip_was_emitted = false;
3990
3991  if (!greedy_loop && choice_count == 2) {
3992    GuardedAlternative alt1 = alternatives_->at(1);
3993    if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
3994      RegExpNode* eats_anything_node = alt1.node();
3995      if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3996          this) {
3997        // At this point we know that we are at a non-greedy loop that will eat
3998        // any character one at a time.  Any non-anchored regexp has such a
3999        // loop prepended to it in order to find where it starts.  We look for
4000        // a pattern of the form ...abc... where we can look 6 characters ahead
4001        // and step forwards 3 if the character is not one of abc.  Abc need
4002        // not be atoms, they can be any reasonably limited character class or
4003        // small alternation.
4004        ASSERT(trace->is_trivial());  // This is the case on LoopChoiceNodes.
4005        BoyerMooreLookahead* lookahead = bm_info(not_at_start);
4006        if (lookahead == NULL) {
4007          eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4008                              EatsAtLeast(kMaxLookaheadForBoyerMoore,
4009                                          kRecursionBudget,
4010                                          not_at_start));
4011          if (eats_at_least >= 1) {
4012            BoyerMooreLookahead* bm =
4013                new(zone()) BoyerMooreLookahead(eats_at_least,
4014                                                compiler,
4015                                                zone());
4016            GuardedAlternative alt0 = alternatives_->at(0);
4017            alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
4018            skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4019          }
4020        } else {
4021          skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
4022        }
4023      }
4024    }
4025  }
4026
4027  if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4028    // Save some time by looking at most one machine word ahead.
4029    eats_at_least =
4030        EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
4031  }
4032  int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4033
4034  bool preload_is_current = !skip_was_emitted &&
4035      (current_trace->characters_preloaded() == preload_characters);
4036  bool preload_has_checked_bounds = preload_is_current;
4037
4038  AlternativeGenerationList alt_gens(choice_count, zone());
4039
4040  // For now we just call all choices one after the other.  The idea ultimately
4041  // is to use the Dispatch table to try only the relevant ones.
4042  for (int i = first_normal_choice; i < choice_count; i++) {
4043    GuardedAlternative alternative = alternatives_->at(i);
4044    AlternativeGeneration* alt_gen = alt_gens.at(i);
4045    alt_gen->quick_check_details.set_characters(preload_characters);
4046    ZoneList<Guard*>* guards = alternative.guards();
4047    int guard_count = (guards == NULL) ? 0 : guards->length();
4048    Trace new_trace(*current_trace);
4049    new_trace.set_characters_preloaded(preload_is_current ?
4050                                         preload_characters :
4051                                         0);
4052    if (preload_has_checked_bounds) {
4053      new_trace.set_bound_checked_up_to(preload_characters);
4054    }
4055    new_trace.quick_check_performed()->Clear();
4056    if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4057    alt_gen->expects_preload = preload_is_current;
4058    bool generate_full_check_inline = false;
4059    if (FLAG_regexp_optimization &&
4060        try_to_emit_quick_check_for_alternative(i) &&
4061        alternative.node()->EmitQuickCheck(compiler,
4062                                           &new_trace,
4063                                           preload_has_checked_bounds,
4064                                           &alt_gen->possible_success,
4065                                           &alt_gen->quick_check_details,
4066                                           i < choice_count - 1)) {
4067      // Quick check was generated for this choice.
4068      preload_is_current = true;
4069      preload_has_checked_bounds = true;
4070      // On the last choice in the ChoiceNode we generated the quick
4071      // check to fall through on possible success.  So now we need to
4072      // generate the full check inline.
4073      if (i == choice_count - 1) {
4074        macro_assembler->Bind(&alt_gen->possible_success);
4075        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4076        new_trace.set_characters_preloaded(preload_characters);
4077        new_trace.set_bound_checked_up_to(preload_characters);
4078        generate_full_check_inline = true;
4079      }
4080    } else if (alt_gen->quick_check_details.cannot_match()) {
4081      if (i == choice_count - 1 && !greedy_loop) {
4082        macro_assembler->GoTo(trace->backtrack());
4083      }
4084      continue;
4085    } else {
4086      // No quick check was generated.  Put the full code here.
4087      // If this is not the first choice then there could be slow checks from
4088      // previous cases that go here when they fail.  There's no reason to
4089      // insist that they preload characters since the slow check we are about
4090      // to generate probably can't use it.
4091      if (i != first_normal_choice) {
4092        alt_gen->expects_preload = false;
4093        new_trace.InvalidateCurrentCharacter();
4094      }
4095      if (i < choice_count - 1) {
4096        new_trace.set_backtrack(&alt_gen->after);
4097      }
4098      generate_full_check_inline = true;
4099    }
4100    if (generate_full_check_inline) {
4101      if (new_trace.actions() != NULL) {
4102        new_trace.set_flush_budget(new_flush_budget);
4103      }
4104      for (int j = 0; j < guard_count; j++) {
4105        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4106      }
4107      alternative.node()->Emit(compiler, &new_trace);
4108      preload_is_current = false;
4109    }
4110    macro_assembler->Bind(&alt_gen->after);
4111  }
4112  if (greedy_loop) {
4113    macro_assembler->Bind(&greedy_loop_label);
4114    // If we have unwound to the bottom then backtrack.
4115    macro_assembler->CheckGreedyLoop(trace->backtrack());
4116    // Otherwise try the second priority at an earlier position.
4117    macro_assembler->AdvanceCurrentPosition(-text_length);
4118    macro_assembler->GoTo(&second_choice);
4119  }
4120
4121  // At this point we need to generate slow checks for the alternatives where
4122  // the quick check was inlined.  We can recognize these because the associated
4123  // label was bound.
4124  for (int i = first_normal_choice; i < choice_count - 1; i++) {
4125    AlternativeGeneration* alt_gen = alt_gens.at(i);
4126    Trace new_trace(*current_trace);
4127    // If there are actions to be flushed we have to limit how many times
4128    // they are flushed.  Take the budget of the parent trace and distribute
4129    // it fairly amongst the children.
4130    if (new_trace.actions() != NULL) {
4131      new_trace.set_flush_budget(new_flush_budget);
4132    }
4133    EmitOutOfLineContinuation(compiler,
4134                              &new_trace,
4135                              alternatives_->at(i),
4136                              alt_gen,
4137                              preload_characters,
4138                              alt_gens.at(i + 1)->expects_preload);
4139  }
4140}
4141
4142
4143void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4144                                           Trace* trace,
4145                                           GuardedAlternative alternative,
4146                                           AlternativeGeneration* alt_gen,
4147                                           int preload_characters,
4148                                           bool next_expects_preload) {
4149  if (!alt_gen->possible_success.is_linked()) return;
4150
4151  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4152  macro_assembler->Bind(&alt_gen->possible_success);
4153  Trace out_of_line_trace(*trace);
4154  out_of_line_trace.set_characters_preloaded(preload_characters);
4155  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4156  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4157  ZoneList<Guard*>* guards = alternative.guards();
4158  int guard_count = (guards == NULL) ? 0 : guards->length();
4159  if (next_expects_preload) {
4160    Label reload_current_char;
4161    out_of_line_trace.set_backtrack(&reload_current_char);
4162    for (int j = 0; j < guard_count; j++) {
4163      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4164    }
4165    alternative.node()->Emit(compiler, &out_of_line_trace);
4166    macro_assembler->Bind(&reload_current_char);
4167    // Reload the current character, since the next quick check expects that.
4168    // We don't need to check bounds here because we only get into this
4169    // code through a quick check which already did the checked load.
4170    macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4171                                          NULL,
4172                                          false,
4173                                          preload_characters);
4174    macro_assembler->GoTo(&(alt_gen->after));
4175  } else {
4176    out_of_line_trace.set_backtrack(&(alt_gen->after));
4177    for (int j = 0; j < guard_count; j++) {
4178      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4179    }
4180    alternative.node()->Emit(compiler, &out_of_line_trace);
4181  }
4182}
4183
4184
4185void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4186  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4187  LimitResult limit_result = LimitVersions(compiler, trace);
4188  if (limit_result == DONE) return;
4189  ASSERT(limit_result == CONTINUE);
4190
4191  RecursionCheck rc(compiler);
4192
4193  switch (action_type_) {
4194    case STORE_POSITION: {
4195      Trace::DeferredCapture
4196          new_capture(data_.u_position_register.reg,
4197                      data_.u_position_register.is_capture,
4198                      trace);
4199      Trace new_trace = *trace;
4200      new_trace.add_action(&new_capture);
4201      on_success()->Emit(compiler, &new_trace);
4202      break;
4203    }
4204    case INCREMENT_REGISTER: {
4205      Trace::DeferredIncrementRegister
4206          new_increment(data_.u_increment_register.reg);
4207      Trace new_trace = *trace;
4208      new_trace.add_action(&new_increment);
4209      on_success()->Emit(compiler, &new_trace);
4210      break;
4211    }
4212    case SET_REGISTER: {
4213      Trace::DeferredSetRegister
4214          new_set(data_.u_store_register.reg, data_.u_store_register.value);
4215      Trace new_trace = *trace;
4216      new_trace.add_action(&new_set);
4217      on_success()->Emit(compiler, &new_trace);
4218      break;
4219    }
4220    case CLEAR_CAPTURES: {
4221      Trace::DeferredClearCaptures
4222        new_capture(Interval(data_.u_clear_captures.range_from,
4223                             data_.u_clear_captures.range_to));
4224      Trace new_trace = *trace;
4225      new_trace.add_action(&new_capture);
4226      on_success()->Emit(compiler, &new_trace);
4227      break;
4228    }
4229    case BEGIN_SUBMATCH:
4230      if (!trace->is_trivial()) {
4231        trace->Flush(compiler, this);
4232      } else {
4233        assembler->WriteCurrentPositionToRegister(
4234            data_.u_submatch.current_position_register, 0);
4235        assembler->WriteStackPointerToRegister(
4236            data_.u_submatch.stack_pointer_register);
4237        on_success()->Emit(compiler, trace);
4238      }
4239      break;
4240    case EMPTY_MATCH_CHECK: {
4241      int start_pos_reg = data_.u_empty_match_check.start_register;
4242      int stored_pos = 0;
4243      int rep_reg = data_.u_empty_match_check.repetition_register;
4244      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4245      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4246      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4247        // If we know we haven't advanced and there is no minimum we
4248        // can just backtrack immediately.
4249        assembler->GoTo(trace->backtrack());
4250      } else if (know_dist && stored_pos < trace->cp_offset()) {
4251        // If we know we've advanced we can generate the continuation
4252        // immediately.
4253        on_success()->Emit(compiler, trace);
4254      } else if (!trace->is_trivial()) {
4255        trace->Flush(compiler, this);
4256      } else {
4257        Label skip_empty_check;
4258        // If we have a minimum number of repetitions we check the current
4259        // number first and skip the empty check if it's not enough.
4260        if (has_minimum) {
4261          int limit = data_.u_empty_match_check.repetition_limit;
4262          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4263        }
4264        // If the match is empty we bail out, otherwise we fall through
4265        // to the on-success continuation.
4266        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4267                                   trace->backtrack());
4268        assembler->Bind(&skip_empty_check);
4269        on_success()->Emit(compiler, trace);
4270      }
4271      break;
4272    }
4273    case POSITIVE_SUBMATCH_SUCCESS: {
4274      if (!trace->is_trivial()) {
4275        trace->Flush(compiler, this);
4276        return;
4277      }
4278      assembler->ReadCurrentPositionFromRegister(
4279          data_.u_submatch.current_position_register);
4280      assembler->ReadStackPointerFromRegister(
4281          data_.u_submatch.stack_pointer_register);
4282      int clear_register_count = data_.u_submatch.clear_register_count;
4283      if (clear_register_count == 0) {
4284        on_success()->Emit(compiler, trace);
4285        return;
4286      }
4287      int clear_registers_from = data_.u_submatch.clear_register_from;
4288      Label clear_registers_backtrack;
4289      Trace new_trace = *trace;
4290      new_trace.set_backtrack(&clear_registers_backtrack);
4291      on_success()->Emit(compiler, &new_trace);
4292
4293      assembler->Bind(&clear_registers_backtrack);
4294      int clear_registers_to = clear_registers_from + clear_register_count - 1;
4295      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4296
4297      ASSERT(trace->backtrack() == NULL);
4298      assembler->Backtrack();
4299      return;
4300    }
4301    default:
4302      UNREACHABLE();
4303  }
4304}
4305
4306
4307void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4308  RegExpMacroAssembler* assembler = compiler->macro_assembler();
4309  if (!trace->is_trivial()) {
4310    trace->Flush(compiler, this);
4311    return;
4312  }
4313
4314  LimitResult limit_result = LimitVersions(compiler, trace);
4315  if (limit_result == DONE) return;
4316  ASSERT(limit_result == CONTINUE);
4317
4318  RecursionCheck rc(compiler);
4319
4320  ASSERT_EQ(start_reg_ + 1, end_reg_);
4321  if (compiler->ignore_case()) {
4322    assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4323                                               trace->backtrack());
4324  } else {
4325    assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4326  }
4327  on_success()->Emit(compiler, trace);
4328}
4329
4330
4331// -------------------------------------------------------------------
4332// Dot/dotty output
4333
4334
4335#ifdef DEBUG
4336
4337
4338class DotPrinter: public NodeVisitor {
4339 public:
4340  explicit DotPrinter(bool ignore_case)
4341      : ignore_case_(ignore_case),
4342        stream_(&alloc_) { }
4343  void PrintNode(const char* label, RegExpNode* node);
4344  void Visit(RegExpNode* node);
4345  void PrintAttributes(RegExpNode* from);
4346  StringStream* stream() { return &stream_; }
4347  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4348#define DECLARE_VISIT(Type)                                          \
4349  virtual void Visit##Type(Type##Node* that);
4350FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4351#undef DECLARE_VISIT
4352 private:
4353  bool ignore_case_;
4354  HeapStringAllocator alloc_;
4355  StringStream stream_;
4356};
4357
4358
4359void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4360  stream()->Add("digraph G {\n  graph [label=\"");
4361  for (int i = 0; label[i]; i++) {
4362    switch (label[i]) {
4363      case '\\':
4364        stream()->Add("\\\\");
4365        break;
4366      case '"':
4367        stream()->Add("\"");
4368        break;
4369      default:
4370        stream()->Put(label[i]);
4371        break;
4372    }
4373  }
4374  stream()->Add("\"];\n");
4375  Visit(node);
4376  stream()->Add("}\n");
4377  printf("%s", *(stream()->ToCString()));
4378}
4379
4380
4381void DotPrinter::Visit(RegExpNode* node) {
4382  if (node->info()->visited) return;
4383  node->info()->visited = true;
4384  node->Accept(this);
4385}
4386
4387
4388void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4389  stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
4390  Visit(on_failure);
4391}
4392
4393
4394class TableEntryBodyPrinter {
4395 public:
4396  TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
4397      : stream_(stream), choice_(choice) { }
4398  void Call(uc16 from, DispatchTable::Entry entry) {
4399    OutSet* out_set = entry.out_set();
4400    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4401      if (out_set->Get(i)) {
4402        stream()->Add("    n%p:s%io%i -> n%p;\n",
4403                      choice(),
4404                      from,
4405                      i,
4406                      choice()->alternatives()->at(i).node());
4407      }
4408    }
4409  }
4410 private:
4411  StringStream* stream() { return stream_; }
4412  ChoiceNode* choice() { return choice_; }
4413  StringStream* stream_;
4414  ChoiceNode* choice_;
4415};
4416
4417
4418class TableEntryHeaderPrinter {
4419 public:
4420  explicit TableEntryHeaderPrinter(StringStream* stream)
4421      : first_(true), stream_(stream) { }
4422  void Call(uc16 from, DispatchTable::Entry entry) {
4423    if (first_) {
4424      first_ = false;
4425    } else {
4426      stream()->Add("|");
4427    }
4428    stream()->Add("{\\%k-\\%k|{", from, entry.to());
4429    OutSet* out_set = entry.out_set();
4430    int priority = 0;
4431    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4432      if (out_set->Get(i)) {
4433        if (priority > 0) stream()->Add("|");
4434        stream()->Add("<s%io%i> %i", from, i, priority);
4435        priority++;
4436      }
4437    }
4438    stream()->Add("}}");
4439  }
4440
4441 private:
4442  bool first_;
4443  StringStream* stream() { return stream_; }
4444  StringStream* stream_;
4445};
4446
4447
4448class AttributePrinter {
4449 public:
4450  explicit AttributePrinter(DotPrinter* out)
4451      : out_(out), first_(true) { }
4452  void PrintSeparator() {
4453    if (first_) {
4454      first_ = false;
4455    } else {
4456      out_->stream()->Add("|");
4457    }
4458  }
4459  void PrintBit(const char* name, bool value) {
4460    if (!value) return;
4461    PrintSeparator();
4462    out_->stream()->Add("{%s}", name);
4463  }
4464  void PrintPositive(const char* name, int value) {
4465    if (value < 0) return;
4466    PrintSeparator();
4467    out_->stream()->Add("{%s|%x}", name, value);
4468  }
4469 private:
4470  DotPrinter* out_;
4471  bool first_;
4472};
4473
4474
4475void DotPrinter::PrintAttributes(RegExpNode* that) {
4476  stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4477                "margin=0.1, fontsize=10, label=\"{",
4478                that);
4479  AttributePrinter printer(this);
4480  NodeInfo* info = that->info();
4481  printer.PrintBit("NI", info->follows_newline_interest);
4482  printer.PrintBit("WI", info->follows_word_interest);
4483  printer.PrintBit("SI", info->follows_start_interest);
4484  Label* label = that->label();
4485  if (label->is_bound())
4486    printer.PrintPositive("@", label->pos());
4487  stream()->Add("}\"];\n");
4488  stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
4489                "arrowhead=none];\n", that, that);
4490}
4491
4492
4493static const bool kPrintDispatchTable = false;
4494void DotPrinter::VisitChoice(ChoiceNode* that) {
4495  if (kPrintDispatchTable) {
4496    stream()->Add("  n%p [shape=Mrecord, label=\"", that);
4497    TableEntryHeaderPrinter header_printer(stream());
4498    that->GetTable(ignore_case_)->ForEach(&header_printer);
4499    stream()->Add("\"]\n", that);
4500    PrintAttributes(that);
4501    TableEntryBodyPrinter body_printer(stream(), that);
4502    that->GetTable(ignore_case_)->ForEach(&body_printer);
4503  } else {
4504    stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
4505    for (int i = 0; i < that->alternatives()->length(); i++) {
4506      GuardedAlternative alt = that->alternatives()->at(i);
4507      stream()->Add("  n%p -> n%p;\n", that, alt.node());
4508    }
4509  }
4510  for (int i = 0; i < that->alternatives()->length(); i++) {
4511    GuardedAlternative alt = that->alternatives()->at(i);
4512    alt.node()->Accept(this);
4513  }
4514}
4515
4516
4517void DotPrinter::VisitText(TextNode* that) {
4518  Zone* zone = that->zone();
4519  stream()->Add("  n%p [label=\"", that);
4520  for (int i = 0; i < that->elements()->length(); i++) {
4521    if (i > 0) stream()->Add(" ");
4522    TextElement elm = that->elements()->at(i);
4523    switch (elm.text_type()) {
4524      case TextElement::ATOM: {
4525        stream()->Add("'%w'", elm.atom()->data());
4526        break;
4527      }
4528      case TextElement::CHAR_CLASS: {
4529        RegExpCharacterClass* node = elm.char_class();
4530        stream()->Add("[");
4531        if (node->is_negated())
4532          stream()->Add("^");
4533        for (int j = 0; j < node->ranges(zone)->length(); j++) {
4534          CharacterRange range = node->ranges(zone)->at(j);
4535          stream()->Add("%k-%k", range.from(), range.to());
4536        }
4537        stream()->Add("]");
4538        break;
4539      }
4540      default:
4541        UNREACHABLE();
4542    }
4543  }
4544  stream()->Add("\", shape=box, peripheries=2];\n");
4545  PrintAttributes(that);
4546  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
4547  Visit(that->on_success());
4548}
4549
4550
4551void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4552  stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
4553                that,
4554                that->start_register(),
4555                that->end_register());
4556  PrintAttributes(that);
4557  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
4558  Visit(that->on_success());
4559}
4560
4561
4562void DotPrinter::VisitEnd(EndNode* that) {
4563  stream()->Add("  n%p [style=bold, shape=point];\n", that);
4564  PrintAttributes(that);
4565}
4566
4567
4568void DotPrinter::VisitAssertion(AssertionNode* that) {
4569  stream()->Add("  n%p [", that);
4570  switch (that->assertion_type()) {
4571    case AssertionNode::AT_END:
4572      stream()->Add("label=\"$\", shape=septagon");
4573      break;
4574    case AssertionNode::AT_START:
4575      stream()->Add("label=\"^\", shape=septagon");
4576      break;
4577    case AssertionNode::AT_BOUNDARY:
4578      stream()->Add("label=\"\\b\", shape=septagon");
4579      break;
4580    case AssertionNode::AT_NON_BOUNDARY:
4581      stream()->Add("label=\"\\B\", shape=septagon");
4582      break;
4583    case AssertionNode::AFTER_NEWLINE:
4584      stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4585      break;
4586  }
4587  stream()->Add("];\n");
4588  PrintAttributes(that);
4589  RegExpNode* successor = that->on_success();
4590  stream()->Add("  n%p -> n%p;\n", that, successor);
4591  Visit(successor);
4592}
4593
4594
4595void DotPrinter::VisitAction(ActionNode* that) {
4596  stream()->Add("  n%p [", that);
4597  switch (that->action_type_) {
4598    case ActionNode::SET_REGISTER:
4599      stream()->Add("label=\"$%i:=%i\", shape=octagon",
4600                    that->data_.u_store_register.reg,
4601                    that->data_.u_store_register.value);
4602      break;
4603    case ActionNode::INCREMENT_REGISTER:
4604      stream()->Add("label=\"$%i++\", shape=octagon",
4605                    that->data_.u_increment_register.reg);
4606      break;
4607    case ActionNode::STORE_POSITION:
4608      stream()->Add("label=\"$%i:=$pos\", shape=octagon",
4609                    that->data_.u_position_register.reg);
4610      break;
4611    case ActionNode::BEGIN_SUBMATCH:
4612      stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
4613                    that->data_.u_submatch.current_position_register);
4614      break;
4615    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4616      stream()->Add("label=\"escape\", shape=septagon");
4617      break;
4618    case ActionNode::EMPTY_MATCH_CHECK:
4619      stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
4620                    that->data_.u_empty_match_check.start_register,
4621                    that->data_.u_empty_match_check.repetition_register,
4622                    that->data_.u_empty_match_check.repetition_limit);
4623      break;
4624    case ActionNode::CLEAR_CAPTURES: {
4625      stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
4626                    that->data_.u_clear_captures.range_from,
4627                    that->data_.u_clear_captures.range_to);
4628      break;
4629    }
4630  }
4631  stream()->Add("];\n");
4632  PrintAttributes(that);
4633  RegExpNode* successor = that->on_success();
4634  stream()->Add("  n%p -> n%p;\n", that, successor);
4635  Visit(successor);
4636}
4637
4638
4639class DispatchTableDumper {
4640 public:
4641  explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
4642  void Call(uc16 key, DispatchTable::Entry entry);
4643  StringStream* stream() { return stream_; }
4644 private:
4645  StringStream* stream_;
4646};
4647
4648
4649void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4650  stream()->Add("[%k-%k]: {", key, entry.to());
4651  OutSet* set = entry.out_set();
4652  bool first = true;
4653  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4654    if (set->Get(i)) {
4655      if (first) {
4656        first = false;
4657      } else {
4658        stream()->Add(", ");
4659      }
4660      stream()->Add("%i", i);
4661    }
4662  }
4663  stream()->Add("}\n");
4664}
4665
4666
4667void DispatchTable::Dump() {
4668  HeapStringAllocator alloc;
4669  StringStream stream(&alloc);
4670  DispatchTableDumper dumper(&stream);
4671  tree()->ForEach(&dumper);
4672  OS::PrintError("%s", *stream.ToCString());
4673}
4674
4675
4676void RegExpEngine::DotPrint(const char* label,
4677                            RegExpNode* node,
4678                            bool ignore_case) {
4679  DotPrinter printer(ignore_case);
4680  printer.PrintNode(label, node);
4681}
4682
4683
4684#endif  // DEBUG
4685
4686
4687// -------------------------------------------------------------------
4688// Tree to graph conversion
4689
4690RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4691                               RegExpNode* on_success) {
4692  ZoneList<TextElement>* elms =
4693      new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4694  elms->Add(TextElement::Atom(this), compiler->zone());
4695  return new(compiler->zone()) TextNode(elms, on_success);
4696}
4697
4698
4699RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4700                               RegExpNode* on_success) {
4701  return new(compiler->zone()) TextNode(elements(), on_success);
4702}
4703
4704
4705static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4706                                 const int* special_class,
4707                                 int length) {
4708  length--;  // Remove final 0x10000.
4709  ASSERT(special_class[length] == 0x10000);
4710  ASSERT(ranges->length() != 0);
4711  ASSERT(length != 0);
4712  ASSERT(special_class[0] != 0);
4713  if (ranges->length() != (length >> 1) + 1) {
4714    return false;
4715  }
4716  CharacterRange range = ranges->at(0);
4717  if (range.from() != 0) {
4718    return false;
4719  }
4720  for (int i = 0; i < length; i += 2) {
4721    if (special_class[i] != (range.to() + 1)) {
4722      return false;
4723    }
4724    range = ranges->at((i >> 1) + 1);
4725    if (special_class[i+1] != range.from()) {
4726      return false;
4727    }
4728  }
4729  if (range.to() != 0xffff) {
4730    return false;
4731  }
4732  return true;
4733}
4734
4735
4736static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4737                          const int* special_class,
4738                          int length) {
4739  length--;  // Remove final 0x10000.
4740  ASSERT(special_class[length] == 0x10000);
4741  if (ranges->length() * 2 != length) {
4742    return false;
4743  }
4744  for (int i = 0; i < length; i += 2) {
4745    CharacterRange range = ranges->at(i >> 1);
4746    if (range.from() != special_class[i] ||
4747        range.to() != special_class[i + 1] - 1) {
4748      return false;
4749    }
4750  }
4751  return true;
4752}
4753
4754
4755bool RegExpCharacterClass::is_standard(Zone* zone) {
4756  // TODO(lrn): Remove need for this function, by not throwing away information
4757  // along the way.
4758  if (is_negated_) {
4759    return false;
4760  }
4761  if (set_.is_standard()) {
4762    return true;
4763  }
4764  if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4765    set_.set_standard_set_type('s');
4766    return true;
4767  }
4768  if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4769    set_.set_standard_set_type('S');
4770    return true;
4771  }
4772  if (CompareInverseRanges(set_.ranges(zone),
4773                           kLineTerminatorRanges,
4774                           kLineTerminatorRangeCount)) {
4775    set_.set_standard_set_type('.');
4776    return true;
4777  }
4778  if (CompareRanges(set_.ranges(zone),
4779                    kLineTerminatorRanges,
4780                    kLineTerminatorRangeCount)) {
4781    set_.set_standard_set_type('n');
4782    return true;
4783  }
4784  if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4785    set_.set_standard_set_type('w');
4786    return true;
4787  }
4788  if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4789    set_.set_standard_set_type('W');
4790    return true;
4791  }
4792  return false;
4793}
4794
4795
4796RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4797                                         RegExpNode* on_success) {
4798  return new(compiler->zone()) TextNode(this, on_success);
4799}
4800
4801
4802RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4803                                      RegExpNode* on_success) {
4804  ZoneList<RegExpTree*>* alternatives = this->alternatives();
4805  int length = alternatives->length();
4806  ChoiceNode* result =
4807      new(compiler->zone()) ChoiceNode(length, compiler->zone());
4808  for (int i = 0; i < length; i++) {
4809    GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4810                                                               on_success));
4811    result->AddAlternative(alternative);
4812  }
4813  return result;
4814}
4815
4816
4817RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4818                                     RegExpNode* on_success) {
4819  return ToNode(min(),
4820                max(),
4821                is_greedy(),
4822                body(),
4823                compiler,
4824                on_success);
4825}
4826
4827
4828// Scoped object to keep track of how much we unroll quantifier loops in the
4829// regexp graph generator.
4830class RegExpExpansionLimiter {
4831 public:
4832  static const int kMaxExpansionFactor = 6;
4833  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4834      : compiler_(compiler),
4835        saved_expansion_factor_(compiler->current_expansion_factor()),
4836        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4837    ASSERT(factor > 0);
4838    if (ok_to_expand_) {
4839      if (factor > kMaxExpansionFactor) {
4840        // Avoid integer overflow of the current expansion factor.
4841        ok_to_expand_ = false;
4842        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4843      } else {
4844        int new_factor = saved_expansion_factor_ * factor;
4845        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4846        compiler->set_current_expansion_factor(new_factor);
4847      }
4848    }
4849  }
4850
4851  ~RegExpExpansionLimiter() {
4852    compiler_->set_current_expansion_factor(saved_expansion_factor_);
4853  }
4854
4855  bool ok_to_expand() { return ok_to_expand_; }
4856
4857 private:
4858  RegExpCompiler* compiler_;
4859  int saved_expansion_factor_;
4860  bool ok_to_expand_;
4861
4862  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4863};
4864
4865
4866RegExpNode* RegExpQuantifier::ToNode(int min,
4867                                     int max,
4868                                     bool is_greedy,
4869                                     RegExpTree* body,
4870                                     RegExpCompiler* compiler,
4871                                     RegExpNode* on_success,
4872                                     bool not_at_start) {
4873  // x{f, t} becomes this:
4874  //
4875  //             (r++)<-.
4876  //               |     `
4877  //               |     (x)
4878  //               v     ^
4879  //      (r=0)-->(?)---/ [if r < t]
4880  //               |
4881  //   [if r >= f] \----> ...
4882  //
4883
4884  // 15.10.2.5 RepeatMatcher algorithm.
4885  // The parser has already eliminated the case where max is 0.  In the case
4886  // where max_match is zero the parser has removed the quantifier if min was
4887  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
4888
4889  // If we know that we cannot match zero length then things are a little
4890  // simpler since we don't need to make the special zero length match check
4891  // from step 2.1.  If the min and max are small we can unroll a little in
4892  // this case.
4893  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
4894  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
4895  if (max == 0) return on_success;  // This can happen due to recursion.
4896  bool body_can_be_empty = (body->min_match() == 0);
4897  int body_start_reg = RegExpCompiler::kNoRegister;
4898  Interval capture_registers = body->CaptureRegisters();
4899  bool needs_capture_clearing = !capture_registers.is_empty();
4900  Zone* zone = compiler->zone();
4901
4902  if (body_can_be_empty) {
4903    body_start_reg = compiler->AllocateRegister();
4904  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4905    // Only unroll if there are no captures and the body can't be
4906    // empty.
4907    {
4908      RegExpExpansionLimiter limiter(
4909          compiler, min + ((max != min) ? 1 : 0));
4910      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4911        int new_max = (max == kInfinity) ? max : max - min;
4912        // Recurse once to get the loop or optional matches after the fixed
4913        // ones.
4914        RegExpNode* answer = ToNode(
4915            0, new_max, is_greedy, body, compiler, on_success, true);
4916        // Unroll the forced matches from 0 to min.  This can cause chains of
4917        // TextNodes (which the parser does not generate).  These should be
4918        // combined if it turns out they hinder good code generation.
4919        for (int i = 0; i < min; i++) {
4920          answer = body->ToNode(compiler, answer);
4921        }
4922        return answer;
4923      }
4924    }
4925    if (max <= kMaxUnrolledMaxMatches && min == 0) {
4926      ASSERT(max > 0);  // Due to the 'if' above.
4927      RegExpExpansionLimiter limiter(compiler, max);
4928      if (limiter.ok_to_expand()) {
4929        // Unroll the optional matches up to max.
4930        RegExpNode* answer = on_success;
4931        for (int i = 0; i < max; i++) {
4932          ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4933          if (is_greedy) {
4934            alternation->AddAlternative(
4935                GuardedAlternative(body->ToNode(compiler, answer)));
4936            alternation->AddAlternative(GuardedAlternative(on_success));
4937          } else {
4938            alternation->AddAlternative(GuardedAlternative(on_success));
4939            alternation->AddAlternative(
4940                GuardedAlternative(body->ToNode(compiler, answer)));
4941          }
4942          answer = alternation;
4943          if (not_at_start) alternation->set_not_at_start();
4944        }
4945        return answer;
4946      }
4947    }
4948  }
4949  bool has_min = min > 0;
4950  bool has_max = max < RegExpTree::kInfinity;
4951  bool needs_counter = has_min || has_max;
4952  int reg_ctr = needs_counter
4953      ? compiler->AllocateRegister()
4954      : RegExpCompiler::kNoRegister;
4955  LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4956                                                    zone);
4957  if (not_at_start) center->set_not_at_start();
4958  RegExpNode* loop_return = needs_counter
4959      ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4960      : static_cast<RegExpNode*>(center);
4961  if (body_can_be_empty) {
4962    // If the body can be empty we need to check if it was and then
4963    // backtrack.
4964    loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4965                                              reg_ctr,
4966                                              min,
4967                                              loop_return);
4968  }
4969  RegExpNode* body_node = body->ToNode(compiler, loop_return);
4970  if (body_can_be_empty) {
4971    // If the body can be empty we need to store the start position
4972    // so we can bail out if it was empty.
4973    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4974  }
4975  if (needs_capture_clearing) {
4976    // Before entering the body of this loop we need to clear captures.
4977    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4978  }
4979  GuardedAlternative body_alt(body_node);
4980  if (has_max) {
4981    Guard* body_guard =
4982        new(zone) Guard(reg_ctr, Guard::LT, max);
4983    body_alt.AddGuard(body_guard, zone);
4984  }
4985  GuardedAlternative rest_alt(on_success);
4986  if (has_min) {
4987    Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
4988    rest_alt.AddGuard(rest_guard, zone);
4989  }
4990  if (is_greedy) {
4991    center->AddLoopAlternative(body_alt);
4992    center->AddContinueAlternative(rest_alt);
4993  } else {
4994    center->AddContinueAlternative(rest_alt);
4995    center->AddLoopAlternative(body_alt);
4996  }
4997  if (needs_counter) {
4998    return ActionNode::SetRegister(reg_ctr, 0, center);
4999  } else {
5000    return center;
5001  }
5002}
5003
5004
5005RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5006                                    RegExpNode* on_success) {
5007  NodeInfo info;
5008  Zone* zone = compiler->zone();
5009
5010  switch (assertion_type()) {
5011    case START_OF_LINE:
5012      return AssertionNode::AfterNewline(on_success);
5013    case START_OF_INPUT:
5014      return AssertionNode::AtStart(on_success);
5015    case BOUNDARY:
5016      return AssertionNode::AtBoundary(on_success);
5017    case NON_BOUNDARY:
5018      return AssertionNode::AtNonBoundary(on_success);
5019    case END_OF_INPUT:
5020      return AssertionNode::AtEnd(on_success);
5021    case END_OF_LINE: {
5022      // Compile $ in multiline regexps as an alternation with a positive
5023      // lookahead in one side and an end-of-input on the other side.
5024      // We need two registers for the lookahead.
5025      int stack_pointer_register = compiler->AllocateRegister();
5026      int position_register = compiler->AllocateRegister();
5027      // The ChoiceNode to distinguish between a newline and end-of-input.
5028      ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5029      // Create a newline atom.
5030      ZoneList<CharacterRange>* newline_ranges =
5031          new(zone) ZoneList<CharacterRange>(3, zone);
5032      CharacterRange::AddClassEscape('n', newline_ranges, zone);
5033      RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5034      TextNode* newline_matcher = new(zone) TextNode(
5035         newline_atom,
5036         ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5037                                             position_register,
5038                                             0,  // No captures inside.
5039                                             -1,  // Ignored if no captures.
5040                                             on_success));
5041      // Create an end-of-input matcher.
5042      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5043          stack_pointer_register,
5044          position_register,
5045          newline_matcher);
5046      // Add the two alternatives to the ChoiceNode.
5047      GuardedAlternative eol_alternative(end_of_line);
5048      result->AddAlternative(eol_alternative);
5049      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5050      result->AddAlternative(end_alternative);
5051      return result;
5052    }
5053    default:
5054      UNREACHABLE();
5055  }
5056  return on_success;
5057}
5058
5059
5060RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5061                                        RegExpNode* on_success) {
5062  return new(compiler->zone())
5063      BackReferenceNode(RegExpCapture::StartRegister(index()),
5064                        RegExpCapture::EndRegister(index()),
5065                        on_success);
5066}
5067
5068
5069RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5070                                RegExpNode* on_success) {
5071  return on_success;
5072}
5073
5074
5075RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5076                                    RegExpNode* on_success) {
5077  int stack_pointer_register = compiler->AllocateRegister();
5078  int position_register = compiler->AllocateRegister();
5079
5080  const int registers_per_capture = 2;
5081  const int register_of_first_capture = 2;
5082  int register_count = capture_count_ * registers_per_capture;
5083  int register_start =
5084    register_of_first_capture + capture_from_ * registers_per_capture;
5085
5086  RegExpNode* success;
5087  if (is_positive()) {
5088    RegExpNode* node = ActionNode::BeginSubmatch(
5089        stack_pointer_register,
5090        position_register,
5091        body()->ToNode(
5092            compiler,
5093            ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5094                                                position_register,
5095                                                register_count,
5096                                                register_start,
5097                                                on_success)));
5098    return node;
5099  } else {
5100    // We use a ChoiceNode for a negative lookahead because it has most of
5101    // the characteristics we need.  It has the body of the lookahead as its
5102    // first alternative and the expression after the lookahead of the second
5103    // alternative.  If the first alternative succeeds then the
5104    // NegativeSubmatchSuccess will unwind the stack including everything the
5105    // choice node set up and backtrack.  If the first alternative fails then
5106    // the second alternative is tried, which is exactly the desired result
5107    // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
5108    // ChoiceNode that knows to ignore the first exit when calculating quick
5109    // checks.
5110    Zone* zone = compiler->zone();
5111
5112    GuardedAlternative body_alt(
5113        body()->ToNode(
5114            compiler,
5115            success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5116                                                        position_register,
5117                                                        register_count,
5118                                                        register_start,
5119                                                        zone)));
5120    ChoiceNode* choice_node =
5121        new(zone) NegativeLookaheadChoiceNode(body_alt,
5122                                              GuardedAlternative(on_success),
5123                                              zone);
5124    return ActionNode::BeginSubmatch(stack_pointer_register,
5125                                     position_register,
5126                                     choice_node);
5127  }
5128}
5129
5130
5131RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5132                                  RegExpNode* on_success) {
5133  return ToNode(body(), index(), compiler, on_success);
5134}
5135
5136
5137RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5138                                  int index,
5139                                  RegExpCompiler* compiler,
5140                                  RegExpNode* on_success) {
5141  int start_reg = RegExpCapture::StartRegister(index);
5142  int end_reg = RegExpCapture::EndRegister(index);
5143  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5144  RegExpNode* body_node = body->ToNode(compiler, store_end);
5145  return ActionNode::StorePosition(start_reg, true, body_node);
5146}
5147
5148
5149RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5150                                      RegExpNode* on_success) {
5151  ZoneList<RegExpTree*>* children = nodes();
5152  RegExpNode* current = on_success;
5153  for (int i = children->length() - 1; i >= 0; i--) {
5154    current = children->at(i)->ToNode(compiler, current);
5155  }
5156  return current;
5157}
5158
5159
5160static void AddClass(const int* elmv,
5161                     int elmc,
5162                     ZoneList<CharacterRange>* ranges,
5163                     Zone* zone) {
5164  elmc--;
5165  ASSERT(elmv[elmc] == 0x10000);
5166  for (int i = 0; i < elmc; i += 2) {
5167    ASSERT(elmv[i] < elmv[i + 1]);
5168    ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5169  }
5170}
5171
5172
5173static void AddClassNegated(const int *elmv,
5174                            int elmc,
5175                            ZoneList<CharacterRange>* ranges,
5176                            Zone* zone) {
5177  elmc--;
5178  ASSERT(elmv[elmc] == 0x10000);
5179  ASSERT(elmv[0] != 0x0000);
5180  ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5181  uc16 last = 0x0000;
5182  for (int i = 0; i < elmc; i += 2) {
5183    ASSERT(last <= elmv[i] - 1);
5184    ASSERT(elmv[i] < elmv[i + 1]);
5185    ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5186    last = elmv[i + 1];
5187  }
5188  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5189}
5190
5191
5192void CharacterRange::AddClassEscape(uc16 type,
5193                                    ZoneList<CharacterRange>* ranges,
5194                                    Zone* zone) {
5195  switch (type) {
5196    case 's':
5197      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5198      break;
5199    case 'S':
5200      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5201      break;
5202    case 'w':
5203      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5204      break;
5205    case 'W':
5206      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5207      break;
5208    case 'd':
5209      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5210      break;
5211    case 'D':
5212      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5213      break;
5214    case '.':
5215      AddClassNegated(kLineTerminatorRanges,
5216                      kLineTerminatorRangeCount,
5217                      ranges,
5218                      zone);
5219      break;
5220    // This is not a character range as defined by the spec but a
5221    // convenient shorthand for a character class that matches any
5222    // character.
5223    case '*':
5224      ranges->Add(CharacterRange::Everything(), zone);
5225      break;
5226    // This is the set of characters matched by the $ and ^ symbols
5227    // in multiline mode.
5228    case 'n':
5229      AddClass(kLineTerminatorRanges,
5230               kLineTerminatorRangeCount,
5231               ranges,
5232               zone);
5233      break;
5234    default:
5235      UNREACHABLE();
5236  }
5237}
5238
5239
5240Vector<const int> CharacterRange::GetWordBounds() {
5241  return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5242}
5243
5244
5245class CharacterRangeSplitter {
5246 public:
5247  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5248                         ZoneList<CharacterRange>** excluded,
5249                         Zone* zone)
5250      : included_(included),
5251        excluded_(excluded),
5252        zone_(zone) { }
5253  void Call(uc16 from, DispatchTable::Entry entry);
5254
5255  static const int kInBase = 0;
5256  static const int kInOverlay = 1;
5257
5258 private:
5259  ZoneList<CharacterRange>** included_;
5260  ZoneList<CharacterRange>** excluded_;
5261  Zone* zone_;
5262};
5263
5264
5265void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5266  if (!entry.out_set()->Get(kInBase)) return;
5267  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5268    ? included_
5269    : excluded_;
5270  if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5271  (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5272}
5273
5274
5275void CharacterRange::Split(ZoneList<CharacterRange>* base,
5276                           Vector<const int> overlay,
5277                           ZoneList<CharacterRange>** included,
5278                           ZoneList<CharacterRange>** excluded,
5279                           Zone* zone) {
5280  ASSERT_EQ(NULL, *included);
5281  ASSERT_EQ(NULL, *excluded);
5282  DispatchTable table(zone);
5283  for (int i = 0; i < base->length(); i++)
5284    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5285  for (int i = 0; i < overlay.length(); i += 2) {
5286    table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5287                   CharacterRangeSplitter::kInOverlay, zone);
5288  }
5289  CharacterRangeSplitter callback(included, excluded, zone);
5290  table.ForEach(&callback);
5291}
5292
5293
5294void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
5295                                        bool is_ascii,
5296                                        Zone* zone) {
5297  Isolate* isolate = zone->isolate();
5298  uc16 bottom = from();
5299  uc16 top = to();
5300  if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
5301    if (bottom > String::kMaxOneByteCharCode) return;
5302    if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5303  }
5304  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5305  if (top == bottom) {
5306    // If this is a singleton we just expand the one character.
5307    int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5308    for (int i = 0; i < length; i++) {
5309      uc32 chr = chars[i];
5310      if (chr != bottom) {
5311        ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5312      }
5313    }
5314  } else {
5315    // If this is a range we expand the characters block by block,
5316    // expanding contiguous subranges (blocks) one at a time.
5317    // The approach is as follows.  For a given start character we
5318    // look up the remainder of the block that contains it (represented
5319    // by the end point), for instance we find 'z' if the character
5320    // is 'c'.  A block is characterized by the property
5321    // that all characters uncanonicalize in the same way, except that
5322    // each entry in the result is incremented by the distance from the first
5323    // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5324    // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5325    // Once we've found the end point we look up its uncanonicalization
5326    // and produce a range for each element.  For instance for [c-f]
5327    // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
5328    // add a range if it is not already contained in the input, so [c-f]
5329    // will be skipped but [C-F] will be added.  If this range is not
5330    // completely contained in a block we do this for all the blocks
5331    // covered by the range (handling characters that is not in a block
5332    // as a "singleton block").
5333    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5334    int pos = bottom;
5335    while (pos <= top) {
5336      int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5337      uc16 block_end;
5338      if (length == 0) {
5339        block_end = pos;
5340      } else {
5341        ASSERT_EQ(1, length);
5342        block_end = range[0];
5343      }
5344      int end = (block_end > top) ? top : block_end;
5345      length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5346      for (int i = 0; i < length; i++) {
5347        uc32 c = range[i];
5348        uc16 range_from = c - (block_end - pos);
5349        uc16 range_to = c - (block_end - end);
5350        if (!(bottom <= range_from && range_to <= top)) {
5351          ranges->Add(CharacterRange(range_from, range_to), zone);
5352        }
5353      }
5354      pos = end + 1;
5355    }
5356  }
5357}
5358
5359
5360bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5361  ASSERT_NOT_NULL(ranges);
5362  int n = ranges->length();
5363  if (n <= 1) return true;
5364  int max = ranges->at(0).to();
5365  for (int i = 1; i < n; i++) {
5366    CharacterRange next_range = ranges->at(i);
5367    if (next_range.from() <= max + 1) return false;
5368    max = next_range.to();
5369  }
5370  return true;
5371}
5372
5373
5374ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5375  if (ranges_ == NULL) {
5376    ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5377    CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5378  }
5379  return ranges_;
5380}
5381
5382
5383// Move a number of elements in a zonelist to another position
5384// in the same list. Handles overlapping source and target areas.
5385static void MoveRanges(ZoneList<CharacterRange>* list,
5386                       int from,
5387                       int to,
5388                       int count) {
5389  // Ranges are potentially overlapping.
5390  if (from < to) {
5391    for (int i = count - 1; i >= 0; i--) {
5392      list->at(to + i) = list->at(from + i);
5393    }
5394  } else {
5395    for (int i = 0; i < count; i++) {
5396      list->at(to + i) = list->at(from + i);
5397    }
5398  }
5399}
5400
5401
5402static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5403                                      int count,
5404                                      CharacterRange insert) {
5405  // Inserts a range into list[0..count[, which must be sorted
5406  // by from value and non-overlapping and non-adjacent, using at most
5407  // list[0..count] for the result. Returns the number of resulting
5408  // canonicalized ranges. Inserting a range may collapse existing ranges into
5409  // fewer ranges, so the return value can be anything in the range 1..count+1.
5410  uc16 from = insert.from();
5411  uc16 to = insert.to();
5412  int start_pos = 0;
5413  int end_pos = count;
5414  for (int i = count - 1; i >= 0; i--) {
5415    CharacterRange current = list->at(i);
5416    if (current.from() > to + 1) {
5417      end_pos = i;
5418    } else if (current.to() + 1 < from) {
5419      start_pos = i + 1;
5420      break;
5421    }
5422  }
5423
5424  // Inserted range overlaps, or is adjacent to, ranges at positions
5425  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5426  // not affected by the insertion.
5427  // If start_pos == end_pos, the range must be inserted before start_pos.
5428  // if start_pos < end_pos, the entire range from start_pos to end_pos
5429  // must be merged with the insert range.
5430
5431  if (start_pos == end_pos) {
5432    // Insert between existing ranges at position start_pos.
5433    if (start_pos < count) {
5434      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5435    }
5436    list->at(start_pos) = insert;
5437    return count + 1;
5438  }
5439  if (start_pos + 1 == end_pos) {
5440    // Replace single existing range at position start_pos.
5441    CharacterRange to_replace = list->at(start_pos);
5442    int new_from = Min(to_replace.from(), from);
5443    int new_to = Max(to_replace.to(), to);
5444    list->at(start_pos) = CharacterRange(new_from, new_to);
5445    return count;
5446  }
5447  // Replace a number of existing ranges from start_pos to end_pos - 1.
5448  // Move the remaining ranges down.
5449
5450  int new_from = Min(list->at(start_pos).from(), from);
5451  int new_to = Max(list->at(end_pos - 1).to(), to);
5452  if (end_pos < count) {
5453    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5454  }
5455  list->at(start_pos) = CharacterRange(new_from, new_to);
5456  return count - (end_pos - start_pos) + 1;
5457}
5458
5459
5460void CharacterSet::Canonicalize() {
5461  // Special/default classes are always considered canonical. The result
5462  // of calling ranges() will be sorted.
5463  if (ranges_ == NULL) return;
5464  CharacterRange::Canonicalize(ranges_);
5465}
5466
5467
5468void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5469  if (character_ranges->length() <= 1) return;
5470  // Check whether ranges are already canonical (increasing, non-overlapping,
5471  // non-adjacent).
5472  int n = character_ranges->length();
5473  int max = character_ranges->at(0).to();
5474  int i = 1;
5475  while (i < n) {
5476    CharacterRange current = character_ranges->at(i);
5477    if (current.from() <= max + 1) {
5478      break;
5479    }
5480    max = current.to();
5481    i++;
5482  }
5483  // Canonical until the i'th range. If that's all of them, we are done.
5484  if (i == n) return;
5485
5486  // The ranges at index i and forward are not canonicalized. Make them so by
5487  // doing the equivalent of insertion sort (inserting each into the previous
5488  // list, in order).
5489  // Notice that inserting a range can reduce the number of ranges in the
5490  // result due to combining of adjacent and overlapping ranges.
5491  int read = i;  // Range to insert.
5492  int num_canonical = i;  // Length of canonicalized part of list.
5493  do {
5494    num_canonical = InsertRangeInCanonicalList(character_ranges,
5495                                               num_canonical,
5496                                               character_ranges->at(read));
5497    read++;
5498  } while (read < n);
5499  character_ranges->Rewind(num_canonical);
5500
5501  ASSERT(CharacterRange::IsCanonical(character_ranges));
5502}
5503
5504
5505void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5506                            ZoneList<CharacterRange>* negated_ranges,
5507                            Zone* zone) {
5508  ASSERT(CharacterRange::IsCanonical(ranges));
5509  ASSERT_EQ(0, negated_ranges->length());
5510  int range_count = ranges->length();
5511  uc16 from = 0;
5512  int i = 0;
5513  if (range_count > 0 && ranges->at(0).from() == 0) {
5514    from = ranges->at(0).to();
5515    i = 1;
5516  }
5517  while (i < range_count) {
5518    CharacterRange range = ranges->at(i);
5519    negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5520    from = range.to();
5521    i++;
5522  }
5523  if (from < String::kMaxUtf16CodeUnit) {
5524    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5525                        zone);
5526  }
5527}
5528
5529
5530// -------------------------------------------------------------------
5531// Splay tree
5532
5533
5534OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5535  if (Get(value))
5536    return this;
5537  if (successors(zone) != NULL) {
5538    for (int i = 0; i < successors(zone)->length(); i++) {
5539      OutSet* successor = successors(zone)->at(i);
5540      if (successor->Get(value))
5541        return successor;
5542    }
5543  } else {
5544    successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5545  }
5546  OutSet* result = new(zone) OutSet(first_, remaining_);
5547  result->Set(value, zone);
5548  successors(zone)->Add(result, zone);
5549  return result;
5550}
5551
5552
5553void OutSet::Set(unsigned value, Zone *zone) {
5554  if (value < kFirstLimit) {
5555    first_ |= (1 << value);
5556  } else {
5557    if (remaining_ == NULL)
5558      remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5559    if (remaining_->is_empty() || !remaining_->Contains(value))
5560      remaining_->Add(value, zone);
5561  }
5562}
5563
5564
5565bool OutSet::Get(unsigned value) {
5566  if (value < kFirstLimit) {
5567    return (first_ & (1 << value)) != 0;
5568  } else if (remaining_ == NULL) {
5569    return false;
5570  } else {
5571    return remaining_->Contains(value);
5572  }
5573}
5574
5575
5576const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5577
5578
5579void DispatchTable::AddRange(CharacterRange full_range, int value,
5580                             Zone* zone) {
5581  CharacterRange current = full_range;
5582  if (tree()->is_empty()) {
5583    // If this is the first range we just insert into the table.
5584    ZoneSplayTree<Config>::Locator loc;
5585    ASSERT_RESULT(tree()->Insert(current.from(), &loc));
5586    loc.set_value(Entry(current.from(), current.to(),
5587                        empty()->Extend(value, zone)));
5588    return;
5589  }
5590  // First see if there is a range to the left of this one that
5591  // overlaps.
5592  ZoneSplayTree<Config>::Locator loc;
5593  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5594    Entry* entry = &loc.value();
5595    // If we've found a range that overlaps with this one, and it
5596    // starts strictly to the left of this one, we have to fix it
5597    // because the following code only handles ranges that start on
5598    // or after the start point of the range we're adding.
5599    if (entry->from() < current.from() && entry->to() >= current.from()) {
5600      // Snap the overlapping range in half around the start point of
5601      // the range we're adding.
5602      CharacterRange left(entry->from(), current.from() - 1);
5603      CharacterRange right(current.from(), entry->to());
5604      // The left part of the overlapping range doesn't overlap.
5605      // Truncate the whole entry to be just the left part.
5606      entry->set_to(left.to());
5607      // The right part is the one that overlaps.  We add this part
5608      // to the map and let the next step deal with merging it with
5609      // the range we're adding.
5610      ZoneSplayTree<Config>::Locator loc;
5611      ASSERT_RESULT(tree()->Insert(right.from(), &loc));
5612      loc.set_value(Entry(right.from(),
5613                          right.to(),
5614                          entry->out_set()));
5615    }
5616  }
5617  while (current.is_valid()) {
5618    if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5619        (loc.value().from() <= current.to()) &&
5620        (loc.value().to() >= current.from())) {
5621      Entry* entry = &loc.value();
5622      // We have overlap.  If there is space between the start point of
5623      // the range we're adding and where the overlapping range starts
5624      // then we have to add a range covering just that space.
5625      if (current.from() < entry->from()) {
5626        ZoneSplayTree<Config>::Locator ins;
5627        ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5628        ins.set_value(Entry(current.from(),
5629                            entry->from() - 1,
5630                            empty()->Extend(value, zone)));
5631        current.set_from(entry->from());
5632      }
5633      ASSERT_EQ(current.from(), entry->from());
5634      // If the overlapping range extends beyond the one we want to add
5635      // we have to snap the right part off and add it separately.
5636      if (entry->to() > current.to()) {
5637        ZoneSplayTree<Config>::Locator ins;
5638        ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
5639        ins.set_value(Entry(current.to() + 1,
5640                            entry->to(),
5641                            entry->out_set()));
5642        entry->set_to(current.to());
5643      }
5644      ASSERT(entry->to() <= current.to());
5645      // The overlapping range is now completely contained by the range
5646      // we're adding so we can just update it and move the start point
5647      // of the range we're adding just past it.
5648      entry->AddValue(value, zone);
5649      // Bail out if the last interval ended at 0xFFFF since otherwise
5650      // adding 1 will wrap around to 0.
5651      if (entry->to() == String::kMaxUtf16CodeUnit)
5652        break;
5653      ASSERT(entry->to() + 1 > current.from());
5654      current.set_from(entry->to() + 1);
5655    } else {
5656      // There is no overlap so we can just add the range
5657      ZoneSplayTree<Config>::Locator ins;
5658      ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5659      ins.set_value(Entry(current.from(),
5660                          current.to(),
5661                          empty()->Extend(value, zone)));
5662      break;
5663    }
5664  }
5665}
5666
5667
5668OutSet* DispatchTable::Get(uc16 value) {
5669  ZoneSplayTree<Config>::Locator loc;
5670  if (!tree()->FindGreatestLessThan(value, &loc))
5671    return empty();
5672  Entry* entry = &loc.value();
5673  if (value <= entry->to())
5674    return entry->out_set();
5675  else
5676    return empty();
5677}
5678
5679
5680// -------------------------------------------------------------------
5681// Analysis
5682
5683
5684void Analysis::EnsureAnalyzed(RegExpNode* that) {
5685  StackLimitCheck check(that->zone()->isolate());
5686  if (check.HasOverflowed()) {
5687    fail("Stack overflow");
5688    return;
5689  }
5690  if (that->info()->been_analyzed || that->info()->being_analyzed)
5691    return;
5692  that->info()->being_analyzed = true;
5693  that->Accept(this);
5694  that->info()->being_analyzed = false;
5695  that->info()->been_analyzed = true;
5696}
5697
5698
5699void Analysis::VisitEnd(EndNode* that) {
5700  // nothing to do
5701}
5702
5703
5704void TextNode::CalculateOffsets() {
5705  int element_count = elements()->length();
5706  // Set up the offsets of the elements relative to the start.  This is a fixed
5707  // quantity since a TextNode can only contain fixed-width things.
5708  int cp_offset = 0;
5709  for (int i = 0; i < element_count; i++) {
5710    TextElement& elm = elements()->at(i);
5711    elm.set_cp_offset(cp_offset);
5712    cp_offset += elm.length();
5713  }
5714}
5715
5716
5717void Analysis::VisitText(TextNode* that) {
5718  if (ignore_case_) {
5719    that->MakeCaseIndependent(is_ascii_);
5720  }
5721  EnsureAnalyzed(that->on_success());
5722  if (!has_failed()) {
5723    that->CalculateOffsets();
5724  }
5725}
5726
5727
5728void Analysis::VisitAction(ActionNode* that) {
5729  RegExpNode* target = that->on_success();
5730  EnsureAnalyzed(target);
5731  if (!has_failed()) {
5732    // If the next node is interested in what it follows then this node
5733    // has to be interested too so it can pass the information on.
5734    that->info()->AddFromFollowing(target->info());
5735  }
5736}
5737
5738
5739void Analysis::VisitChoice(ChoiceNode* that) {
5740  NodeInfo* info = that->info();
5741  for (int i = 0; i < that->alternatives()->length(); i++) {
5742    RegExpNode* node = that->alternatives()->at(i).node();
5743    EnsureAnalyzed(node);
5744    if (has_failed()) return;
5745    // Anything the following nodes need to know has to be known by
5746    // this node also, so it can pass it on.
5747    info->AddFromFollowing(node->info());
5748  }
5749}
5750
5751
5752void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5753  NodeInfo* info = that->info();
5754  for (int i = 0; i < that->alternatives()->length(); i++) {
5755    RegExpNode* node = that->alternatives()->at(i).node();
5756    if (node != that->loop_node()) {
5757      EnsureAnalyzed(node);
5758      if (has_failed()) return;
5759      info->AddFromFollowing(node->info());
5760    }
5761  }
5762  // Check the loop last since it may need the value of this node
5763  // to get a correct result.
5764  EnsureAnalyzed(that->loop_node());
5765  if (!has_failed()) {
5766    info->AddFromFollowing(that->loop_node()->info());
5767  }
5768}
5769
5770
5771void Analysis::VisitBackReference(BackReferenceNode* that) {
5772  EnsureAnalyzed(that->on_success());
5773}
5774
5775
5776void Analysis::VisitAssertion(AssertionNode* that) {
5777  EnsureAnalyzed(that->on_success());
5778}
5779
5780
5781void BackReferenceNode::FillInBMInfo(int offset,
5782                                     int budget,
5783                                     BoyerMooreLookahead* bm,
5784                                     bool not_at_start) {
5785  // Working out the set of characters that a backreference can match is too
5786  // hard, so we just say that any character can match.
5787  bm->SetRest(offset);
5788  SaveBMInfo(bm, not_at_start, offset);
5789}
5790
5791
5792STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5793              RegExpMacroAssembler::kTableSize);
5794
5795
5796void ChoiceNode::FillInBMInfo(int offset,
5797                              int budget,
5798                              BoyerMooreLookahead* bm,
5799                              bool not_at_start) {
5800  ZoneList<GuardedAlternative>* alts = alternatives();
5801  budget = (budget - 1) / alts->length();
5802  for (int i = 0; i < alts->length(); i++) {
5803    GuardedAlternative& alt = alts->at(i);
5804    if (alt.guards() != NULL && alt.guards()->length() != 0) {
5805      bm->SetRest(offset);  // Give up trying to fill in info.
5806      SaveBMInfo(bm, not_at_start, offset);
5807      return;
5808    }
5809    alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5810  }
5811  SaveBMInfo(bm, not_at_start, offset);
5812}
5813
5814
5815void TextNode::FillInBMInfo(int initial_offset,
5816                            int budget,
5817                            BoyerMooreLookahead* bm,
5818                            bool not_at_start) {
5819  if (initial_offset >= bm->length()) return;
5820  int offset = initial_offset;
5821  int max_char = bm->max_char();
5822  for (int i = 0; i < elements()->length(); i++) {
5823    if (offset >= bm->length()) {
5824      if (initial_offset == 0) set_bm_info(not_at_start, bm);
5825      return;
5826    }
5827    TextElement text = elements()->at(i);
5828    if (text.text_type() == TextElement::ATOM) {
5829      RegExpAtom* atom = text.atom();
5830      for (int j = 0; j < atom->length(); j++, offset++) {
5831        if (offset >= bm->length()) {
5832          if (initial_offset == 0) set_bm_info(not_at_start, bm);
5833          return;
5834        }
5835        uc16 character = atom->data()[j];
5836        if (bm->compiler()->ignore_case()) {
5837          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5838          int length = GetCaseIndependentLetters(
5839              Isolate::Current(),
5840              character,
5841              bm->max_char() == String::kMaxOneByteCharCode,
5842              chars);
5843          for (int j = 0; j < length; j++) {
5844            bm->Set(offset, chars[j]);
5845          }
5846        } else {
5847          if (character <= max_char) bm->Set(offset, character);
5848        }
5849      }
5850    } else {
5851      ASSERT_EQ(TextElement::CHAR_CLASS, text.text_type());
5852      RegExpCharacterClass* char_class = text.char_class();
5853      ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5854      if (char_class->is_negated()) {
5855        bm->SetAll(offset);
5856      } else {
5857        for (int k = 0; k < ranges->length(); k++) {
5858          CharacterRange& range = ranges->at(k);
5859          if (range.from() > max_char) continue;
5860          int to = Min(max_char, static_cast<int>(range.to()));
5861          bm->SetInterval(offset, Interval(range.from(), to));
5862        }
5863      }
5864      offset++;
5865    }
5866  }
5867  if (offset >= bm->length()) {
5868    if (initial_offset == 0) set_bm_info(not_at_start, bm);
5869    return;
5870  }
5871  on_success()->FillInBMInfo(offset,
5872                             budget - 1,
5873                             bm,
5874                             true);  // Not at start after a text node.
5875  if (initial_offset == 0) set_bm_info(not_at_start, bm);
5876}
5877
5878
5879// -------------------------------------------------------------------
5880// Dispatch table construction
5881
5882
5883void DispatchTableConstructor::VisitEnd(EndNode* that) {
5884  AddRange(CharacterRange::Everything());
5885}
5886
5887
5888void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5889  node->set_being_calculated(true);
5890  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5891  for (int i = 0; i < alternatives->length(); i++) {
5892    set_choice_index(i);
5893    alternatives->at(i).node()->Accept(this);
5894  }
5895  node->set_being_calculated(false);
5896}
5897
5898
5899class AddDispatchRange {
5900 public:
5901  explicit AddDispatchRange(DispatchTableConstructor* constructor)
5902    : constructor_(constructor) { }
5903  void Call(uc32 from, DispatchTable::Entry entry);
5904 private:
5905  DispatchTableConstructor* constructor_;
5906};
5907
5908
5909void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5910  CharacterRange range(from, entry.to());
5911  constructor_->AddRange(range);
5912}
5913
5914
5915void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5916  if (node->being_calculated())
5917    return;
5918  DispatchTable* table = node->GetTable(ignore_case_);
5919  AddDispatchRange adder(this);
5920  table->ForEach(&adder);
5921}
5922
5923
5924void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5925  // TODO(160): Find the node that we refer back to and propagate its start
5926  // set back to here.  For now we just accept anything.
5927  AddRange(CharacterRange::Everything());
5928}
5929
5930
5931void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5932  RegExpNode* target = that->on_success();
5933  target->Accept(this);
5934}
5935
5936
5937static int CompareRangeByFrom(const CharacterRange* a,
5938                              const CharacterRange* b) {
5939  return Compare<uc16>(a->from(), b->from());
5940}
5941
5942
5943void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5944  ranges->Sort(CompareRangeByFrom);
5945  uc16 last = 0;
5946  for (int i = 0; i < ranges->length(); i++) {
5947    CharacterRange range = ranges->at(i);
5948    if (last < range.from())
5949      AddRange(CharacterRange(last, range.from() - 1));
5950    if (range.to() >= last) {
5951      if (range.to() == String::kMaxUtf16CodeUnit) {
5952        return;
5953      } else {
5954        last = range.to() + 1;
5955      }
5956    }
5957  }
5958  AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
5959}
5960
5961
5962void DispatchTableConstructor::VisitText(TextNode* that) {
5963  TextElement elm = that->elements()->at(0);
5964  switch (elm.text_type()) {
5965    case TextElement::ATOM: {
5966      uc16 c = elm.atom()->data()[0];
5967      AddRange(CharacterRange(c, c));
5968      break;
5969    }
5970    case TextElement::CHAR_CLASS: {
5971      RegExpCharacterClass* tree = elm.char_class();
5972      ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
5973      if (tree->is_negated()) {
5974        AddInverse(ranges);
5975      } else {
5976        for (int i = 0; i < ranges->length(); i++)
5977          AddRange(ranges->at(i));
5978      }
5979      break;
5980    }
5981    default: {
5982      UNIMPLEMENTED();
5983    }
5984  }
5985}
5986
5987
5988void DispatchTableConstructor::VisitAction(ActionNode* that) {
5989  RegExpNode* target = that->on_success();
5990  target->Accept(this);
5991}
5992
5993
5994RegExpEngine::CompilationResult RegExpEngine::Compile(
5995    RegExpCompileData* data,
5996    bool ignore_case,
5997    bool is_global,
5998    bool is_multiline,
5999    Handle<String> pattern,
6000    Handle<String> sample_subject,
6001    bool is_ascii,
6002    Zone* zone) {
6003  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6004    return IrregexpRegExpTooBig(zone->isolate());
6005  }
6006  RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone);
6007
6008  // Sample some characters from the middle of the string.
6009  static const int kSampleSize = 128;
6010
6011  FlattenString(sample_subject);
6012  int chars_sampled = 0;
6013  int half_way = (sample_subject->length() - kSampleSize) / 2;
6014  for (int i = Max(0, half_way);
6015       i < sample_subject->length() && chars_sampled < kSampleSize;
6016       i++, chars_sampled++) {
6017    compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6018  }
6019
6020  // Wrap the body of the regexp in capture #0.
6021  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6022                                                    0,
6023                                                    &compiler,
6024                                                    compiler.accept());
6025  RegExpNode* node = captured_body;
6026  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6027  bool is_start_anchored = data->tree->IsAnchoredAtStart();
6028  int max_length = data->tree->max_match();
6029  if (!is_start_anchored) {
6030    // Add a .*? at the beginning, outside the body capture, unless
6031    // this expression is anchored at the beginning.
6032    RegExpNode* loop_node =
6033        RegExpQuantifier::ToNode(0,
6034                                 RegExpTree::kInfinity,
6035                                 false,
6036                                 new(zone) RegExpCharacterClass('*'),
6037                                 &compiler,
6038                                 captured_body,
6039                                 data->contains_anchor);
6040
6041    if (data->contains_anchor) {
6042      // Unroll loop once, to take care of the case that might start
6043      // at the start of input.
6044      ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6045      first_step_node->AddAlternative(GuardedAlternative(captured_body));
6046      first_step_node->AddAlternative(GuardedAlternative(
6047          new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6048      node = first_step_node;
6049    } else {
6050      node = loop_node;
6051    }
6052  }
6053  if (is_ascii) {
6054    node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6055    // Do it again to propagate the new nodes to places where they were not
6056    // put because they had not been calculated yet.
6057    if (node != NULL) {
6058      node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6059    }
6060  }
6061
6062  if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6063  data->node = node;
6064  Analysis analysis(ignore_case, is_ascii);
6065  analysis.EnsureAnalyzed(node);
6066  if (analysis.has_failed()) {
6067    const char* error_message = analysis.error_message();
6068    return CompilationResult(zone->isolate(), error_message);
6069  }
6070
6071  // Create the correct assembler for the architecture.
6072#ifndef V8_INTERPRETED_REGEXP
6073  // Native regexp implementation.
6074
6075  NativeRegExpMacroAssembler::Mode mode =
6076      is_ascii ? NativeRegExpMacroAssembler::ASCII
6077               : NativeRegExpMacroAssembler::UC16;
6078
6079#if V8_TARGET_ARCH_IA32
6080  RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6081                                           zone);
6082#elif V8_TARGET_ARCH_X64
6083  RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6084                                          zone);
6085#elif V8_TARGET_ARCH_ARM
6086  RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6087                                          zone);
6088#elif V8_TARGET_ARCH_MIPS
6089  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6090                                           zone);
6091#endif
6092
6093#else  // V8_INTERPRETED_REGEXP
6094  // Interpreted regexp implementation.
6095  EmbeddedVector<byte, 1024> codes;
6096  RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
6097#endif  // V8_INTERPRETED_REGEXP
6098
6099  // Inserted here, instead of in Assembler, because it depends on information
6100  // in the AST that isn't replicated in the Node structure.
6101  static const int kMaxBacksearchLimit = 1024;
6102  if (is_end_anchored &&
6103      !is_start_anchored &&
6104      max_length < kMaxBacksearchLimit) {
6105    macro_assembler.SetCurrentPositionFromEnd(max_length);
6106  }
6107
6108  if (is_global) {
6109    macro_assembler.set_global_mode(
6110        (data->tree->min_match() > 0)
6111            ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6112            : RegExpMacroAssembler::GLOBAL);
6113  }
6114
6115  return compiler.Assemble(&macro_assembler,
6116                           node,
6117                           data->capture_count,
6118                           pattern);
6119}
6120
6121
6122}}  // namespace v8::internal
6123