1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "compiler.h"
32#include "execution.h"
33#include "factory.h"
34#include "jsregexp.h"
35#include "platform.h"
36#include "string-search.h"
37#include "runtime.h"
38#include "compilation-cache.h"
39#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
44#include "regexp-stack.h"
45
46#ifndef V8_INTERPRETED_REGEXP
47#if V8_TARGET_ARCH_IA32
48#include "ia32/regexp-macro-assembler-ia32.h"
49#elif V8_TARGET_ARCH_X64
50#include "x64/regexp-macro-assembler-x64.h"
51#elif V8_TARGET_ARCH_ARM
52#include "arm/regexp-macro-assembler-arm.h"
53#elif V8_TARGET_ARCH_MIPS
54#include "mips/regexp-macro-assembler-mips.h"
55#else
56#error Unsupported target architecture.
57#endif
58#endif
59
60#include "interpreter-irregexp.h"
61
62
63namespace v8 {
64namespace internal {
65
66Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
67                                               Handle<String> pattern,
68                                               Handle<String> flags,
69                                               bool* has_pending_exception) {
70  // Call the construct code with 2 arguments.
71  Handle<Object> argv[] = { pattern, flags };
72  return Execution::New(constructor, ARRAY_SIZE(argv), argv,
73                        has_pending_exception);
74}
75
76
77static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
78  int flags = JSRegExp::NONE;
79  for (int i = 0; i < str->length(); i++) {
80    switch (str->Get(i)) {
81      case 'i':
82        flags |= JSRegExp::IGNORE_CASE;
83        break;
84      case 'g':
85        flags |= JSRegExp::GLOBAL;
86        break;
87      case 'm':
88        flags |= JSRegExp::MULTILINE;
89        break;
90    }
91  }
92  return JSRegExp::Flags(flags);
93}
94
95
96static inline void ThrowRegExpException(Handle<JSRegExp> re,
97                                        Handle<String> pattern,
98                                        Handle<String> error_text,
99                                        const char* message) {
100  Isolate* isolate = re->GetIsolate();
101  Factory* factory = isolate->factory();
102  Handle<FixedArray> elements = factory->NewFixedArray(2);
103  elements->set(0, *pattern);
104  elements->set(1, *error_text);
105  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
106  Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
107  isolate->Throw(*regexp_err);
108}
109
110
111// Generic RegExp methods. Dispatches to implementation specific methods.
112
113
114Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
115                                   Handle<String> pattern,
116                                   Handle<String> flag_str) {
117  Isolate* isolate = re->GetIsolate();
118  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
119  CompilationCache* compilation_cache = isolate->compilation_cache();
120  Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
121  bool in_cache = !cached.is_null();
122  LOG(isolate, RegExpCompileEvent(re, in_cache));
123
124  Handle<Object> result;
125  if (in_cache) {
126    re->set_data(*cached);
127    return re;
128  }
129  pattern = FlattenGetString(pattern);
130  ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
131  PostponeInterruptsScope postpone(isolate);
132  RegExpCompileData parse_result;
133  FlatStringReader reader(isolate, pattern);
134  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
135                                 &parse_result)) {
136    // Throw an exception if we fail to parse the pattern.
137    ThrowRegExpException(re,
138                         pattern,
139                         parse_result.error,
140                         "malformed_regexp");
141    return Handle<Object>::null();
142  }
143
144  if (parse_result.simple && !flags.is_ignore_case()) {
145    // Parse-tree is a single atom that is equal to the pattern.
146    AtomCompile(re, pattern, flags, pattern);
147  } else if (parse_result.tree->IsAtom() &&
148      !flags.is_ignore_case() &&
149      parse_result.capture_count == 0) {
150    RegExpAtom* atom = parse_result.tree->AsAtom();
151    Vector<const uc16> atom_pattern = atom->data();
152    Handle<String> atom_string =
153        isolate->factory()->NewStringFromTwoByte(atom_pattern);
154    AtomCompile(re, pattern, flags, atom_string);
155  } else {
156    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
157  }
158  ASSERT(re->data()->IsFixedArray());
159  // Compilation succeeded so the data is set on the regexp
160  // and we can store it in the cache.
161  Handle<FixedArray> data(FixedArray::cast(re->data()));
162  compilation_cache->PutRegExp(pattern, flags, data);
163
164  return re;
165}
166
167
168Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
169                                Handle<String> subject,
170                                int index,
171                                Handle<JSArray> last_match_info) {
172  switch (regexp->TypeTag()) {
173    case JSRegExp::ATOM:
174      return AtomExec(regexp, subject, index, last_match_info);
175    case JSRegExp::IRREGEXP: {
176      Handle<Object> result =
177          IrregexpExec(regexp, subject, index, last_match_info);
178      ASSERT(!result.is_null() ||
179             regexp->GetIsolate()->has_pending_exception());
180      return result;
181    }
182    default:
183      UNREACHABLE();
184      return Handle<Object>::null();
185  }
186}
187
188
189// RegExp Atom implementation: Simple string search using indexOf.
190
191
192void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
193                             Handle<String> pattern,
194                             JSRegExp::Flags flags,
195                             Handle<String> match_pattern) {
196  re->GetIsolate()->factory()->SetRegExpAtomData(re,
197                                                 JSRegExp::ATOM,
198                                                 pattern,
199                                                 flags,
200                                                 match_pattern);
201}
202
203
204static void SetAtomLastCapture(FixedArray* array,
205                               String* subject,
206                               int from,
207                               int to) {
208  NoHandleAllocation no_handles;
209  RegExpImpl::SetLastCaptureCount(array, 2);
210  RegExpImpl::SetLastSubject(array, subject);
211  RegExpImpl::SetLastInput(array, subject);
212  RegExpImpl::SetCapture(array, 0, from);
213  RegExpImpl::SetCapture(array, 1, to);
214}
215
216
217Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
218                                    Handle<String> subject,
219                                    int index,
220                                    Handle<JSArray> last_match_info) {
221  Isolate* isolate = re->GetIsolate();
222
223  ASSERT(0 <= index);
224  ASSERT(index <= subject->length());
225
226  if (!subject->IsFlat()) FlattenString(subject);
227  AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
228
229  String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
230  int needle_len = needle->length();
231  ASSERT(needle->IsFlat());
232
233  if (needle_len != 0) {
234    if (index + needle_len > subject->length()) {
235      return isolate->factory()->null_value();
236    }
237
238    String::FlatContent needle_content = needle->GetFlatContent();
239    String::FlatContent subject_content = subject->GetFlatContent();
240    ASSERT(needle_content.IsFlat());
241    ASSERT(subject_content.IsFlat());
242    // dispatch on type of strings
243    index = (needle_content.IsAscii()
244             ? (subject_content.IsAscii()
245                ? SearchString(isolate,
246                               subject_content.ToAsciiVector(),
247                               needle_content.ToAsciiVector(),
248                               index)
249                : SearchString(isolate,
250                               subject_content.ToUC16Vector(),
251                               needle_content.ToAsciiVector(),
252                               index))
253             : (subject_content.IsAscii()
254                ? SearchString(isolate,
255                               subject_content.ToAsciiVector(),
256                               needle_content.ToUC16Vector(),
257                               index)
258                : SearchString(isolate,
259                               subject_content.ToUC16Vector(),
260                               needle_content.ToUC16Vector(),
261                               index)));
262    if (index == -1) return isolate->factory()->null_value();
263  }
264  ASSERT(last_match_info->HasFastElements());
265
266  {
267    NoHandleAllocation no_handles;
268    FixedArray* array = FixedArray::cast(last_match_info->elements());
269    SetAtomLastCapture(array, *subject, index, index + needle_len);
270  }
271  return last_match_info;
272}
273
274
275// Irregexp implementation.
276
277// Ensures that the regexp object contains a compiled version of the
278// source for either ASCII or non-ASCII strings.
279// If the compiled version doesn't already exist, it is compiled
280// from the source pattern.
281// If compilation fails, an exception is thrown and this function
282// returns false.
283bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
284  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
285#ifdef V8_INTERPRETED_REGEXP
286  if (compiled_code->IsByteArray()) return true;
287#else  // V8_INTERPRETED_REGEXP (RegExp native code)
288  if (compiled_code->IsCode()) return true;
289#endif
290  // We could potentially have marked this as flushable, but have kept
291  // a saved version if we did not flush it yet.
292  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
293  if (saved_code->IsCode()) {
294    // Reinstate the code in the original place.
295    re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
296    ASSERT(compiled_code->IsSmi());
297    return true;
298  }
299  return CompileIrregexp(re, is_ascii);
300}
301
302
303static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
304                                            bool is_ascii,
305                                            Handle<String> error_message,
306                                            Isolate* isolate) {
307  Factory* factory = isolate->factory();
308  Handle<FixedArray> elements = factory->NewFixedArray(2);
309  elements->set(0, re->Pattern());
310  elements->set(1, *error_message);
311  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
312  Handle<Object> regexp_err =
313      factory->NewSyntaxError("malformed_regexp", array);
314  isolate->Throw(*regexp_err);
315  return false;
316}
317
318
319bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
320  // Compile the RegExp.
321  Isolate* isolate = re->GetIsolate();
322  ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
323  PostponeInterruptsScope postpone(isolate);
324  // If we had a compilation error the last time this is saved at the
325  // saved code index.
326  Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
327  // When arriving here entry can only be a smi, either representing an
328  // uncompiled regexp, a previous compilation error, or code that has
329  // been flushed.
330  ASSERT(entry->IsSmi());
331  int entry_value = Smi::cast(entry)->value();
332  ASSERT(entry_value == JSRegExp::kUninitializedValue ||
333         entry_value == JSRegExp::kCompilationErrorValue ||
334         (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
335
336  if (entry_value == JSRegExp::kCompilationErrorValue) {
337    // A previous compilation failed and threw an error which we store in
338    // the saved code index (we store the error message, not the actual
339    // error). Recreate the error object and throw it.
340    Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
341    ASSERT(error_string->IsString());
342    Handle<String> error_message(String::cast(error_string));
343    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
344    return false;
345  }
346
347  JSRegExp::Flags flags = re->GetFlags();
348
349  Handle<String> pattern(re->Pattern());
350  if (!pattern->IsFlat()) FlattenString(pattern);
351  RegExpCompileData compile_data;
352  FlatStringReader reader(isolate, pattern);
353  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
354                                 &compile_data)) {
355    // Throw an exception if we fail to parse the pattern.
356    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
357    ThrowRegExpException(re,
358                         pattern,
359                         compile_data.error,
360                         "malformed_regexp");
361    return false;
362  }
363  RegExpEngine::CompilationResult result =
364      RegExpEngine::Compile(&compile_data,
365                            flags.is_ignore_case(),
366                            flags.is_multiline(),
367                            pattern,
368                            is_ascii);
369  if (result.error_message != NULL) {
370    // Unable to compile regexp.
371    Handle<String> error_message =
372        isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
373    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
374    return false;
375  }
376
377  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
378  data->set(JSRegExp::code_index(is_ascii), result.code);
379  int register_max = IrregexpMaxRegisterCount(*data);
380  if (result.num_registers > register_max) {
381    SetIrregexpMaxRegisterCount(*data, result.num_registers);
382  }
383
384  return true;
385}
386
387
388int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
389  return Smi::cast(
390      re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
391}
392
393
394void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
395  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
396}
397
398
399int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
400  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
401}
402
403
404int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
405  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
406}
407
408
409ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
410  return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
411}
412
413
414Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
415  return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
416}
417
418
419void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
420                                    Handle<String> pattern,
421                                    JSRegExp::Flags flags,
422                                    int capture_count) {
423  // Initialize compiled code entries to null.
424  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
425                                                     JSRegExp::IRREGEXP,
426                                                     pattern,
427                                                     flags,
428                                                     capture_count);
429}
430
431
432int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
433                                Handle<String> subject) {
434  if (!subject->IsFlat()) FlattenString(subject);
435
436  // Check the asciiness of the underlying storage.
437  bool is_ascii = subject->IsAsciiRepresentationUnderneath();
438  if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
439
440#ifdef V8_INTERPRETED_REGEXP
441  // Byte-code regexp needs space allocated for all its registers.
442  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
443#else  // V8_INTERPRETED_REGEXP
444  // Native regexp only needs room to output captures. Registers are handled
445  // internally.
446  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
447#endif  // V8_INTERPRETED_REGEXP
448}
449
450
451RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
452    Handle<JSRegExp> regexp,
453    Handle<String> subject,
454    int index,
455    Vector<int> output) {
456  Isolate* isolate = regexp->GetIsolate();
457
458  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
459
460  ASSERT(index >= 0);
461  ASSERT(index <= subject->length());
462  ASSERT(subject->IsFlat());
463
464  bool is_ascii = subject->IsAsciiRepresentationUnderneath();
465
466#ifndef V8_INTERPRETED_REGEXP
467  ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
468  do {
469    EnsureCompiledIrregexp(regexp, is_ascii);
470    Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
471    NativeRegExpMacroAssembler::Result res =
472        NativeRegExpMacroAssembler::Match(code,
473                                          subject,
474                                          output.start(),
475                                          output.length(),
476                                          index,
477                                          isolate);
478    if (res != NativeRegExpMacroAssembler::RETRY) {
479      ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
480             isolate->has_pending_exception());
481      STATIC_ASSERT(
482          static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
483      STATIC_ASSERT(
484          static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
485      STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
486                    == RE_EXCEPTION);
487      return static_cast<IrregexpResult>(res);
488    }
489    // If result is RETRY, the string has changed representation, and we
490    // must restart from scratch.
491    // In this case, it means we must make sure we are prepared to handle
492    // the, potentially, different subject (the string can switch between
493    // being internal and external, and even between being ASCII and UC16,
494    // but the characters are always the same).
495    IrregexpPrepare(regexp, subject);
496    is_ascii = subject->IsAsciiRepresentationUnderneath();
497  } while (true);
498  UNREACHABLE();
499  return RE_EXCEPTION;
500#else  // V8_INTERPRETED_REGEXP
501
502  ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
503  // We must have done EnsureCompiledIrregexp, so we can get the number of
504  // registers.
505  int* register_vector = output.start();
506  int number_of_capture_registers =
507      (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
508  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
509    register_vector[i] = -1;
510  }
511  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
512
513  IrregexpResult result = IrregexpInterpreter::Match(isolate,
514                                                     byte_codes,
515                                                     subject,
516                                                     register_vector,
517                                                     index);
518  if (result == RE_EXCEPTION) {
519    ASSERT(!isolate->has_pending_exception());
520    isolate->StackOverflow();
521  }
522  return result;
523#endif  // V8_INTERPRETED_REGEXP
524}
525
526
527Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
528                                        Handle<String> subject,
529                                        int previous_index,
530                                        Handle<JSArray> last_match_info) {
531  Isolate* isolate = jsregexp->GetIsolate();
532  ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
533
534  // Prepare space for the return values.
535#ifdef V8_INTERPRETED_REGEXP
536#ifdef DEBUG
537  if (FLAG_trace_regexp_bytecodes) {
538    String* pattern = jsregexp->Pattern();
539    PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
540    PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
541  }
542#endif
543#endif
544  int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
545  if (required_registers < 0) {
546    // Compiling failed with an exception.
547    ASSERT(isolate->has_pending_exception());
548    return Handle<Object>::null();
549  }
550
551  OffsetsVector registers(required_registers, isolate);
552
553  IrregexpResult res = RegExpImpl::IrregexpExecOnce(
554      jsregexp, subject, previous_index, Vector<int>(registers.vector(),
555                                                     registers.length()));
556  if (res == RE_SUCCESS) {
557    int capture_register_count =
558        (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
559    last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
560    AssertNoAllocation no_gc;
561    int* register_vector = registers.vector();
562    FixedArray* array = FixedArray::cast(last_match_info->elements());
563    for (int i = 0; i < capture_register_count; i += 2) {
564      SetCapture(array, i, register_vector[i]);
565      SetCapture(array, i + 1, register_vector[i + 1]);
566    }
567    SetLastCaptureCount(array, capture_register_count);
568    SetLastSubject(array, *subject);
569    SetLastInput(array, *subject);
570    return last_match_info;
571  }
572  if (res == RE_EXCEPTION) {
573    ASSERT(isolate->has_pending_exception());
574    return Handle<Object>::null();
575  }
576  ASSERT(res == RE_FAILURE);
577  return isolate->factory()->null_value();
578}
579
580
581// -------------------------------------------------------------------
582// Implementation of the Irregexp regular expression engine.
583//
584// The Irregexp regular expression engine is intended to be a complete
585// implementation of ECMAScript regular expressions.  It generates either
586// bytecodes or native code.
587
588//   The Irregexp regexp engine is structured in three steps.
589//   1) The parser generates an abstract syntax tree.  See ast.cc.
590//   2) From the AST a node network is created.  The nodes are all
591//      subclasses of RegExpNode.  The nodes represent states when
592//      executing a regular expression.  Several optimizations are
593//      performed on the node network.
594//   3) From the nodes we generate either byte codes or native code
595//      that can actually execute the regular expression (perform
596//      the search).  The code generation step is described in more
597//      detail below.
598
599// Code generation.
600//
601//   The nodes are divided into four main categories.
602//   * Choice nodes
603//        These represent places where the regular expression can
604//        match in more than one way.  For example on entry to an
605//        alternation (foo|bar) or a repetition (*, +, ? or {}).
606//   * Action nodes
607//        These represent places where some action should be
608//        performed.  Examples include recording the current position
609//        in the input string to a register (in order to implement
610//        captures) or other actions on register for example in order
611//        to implement the counters needed for {} repetitions.
612//   * Matching nodes
613//        These attempt to match some element part of the input string.
614//        Examples of elements include character classes, plain strings
615//        or back references.
616//   * End nodes
617//        These are used to implement the actions required on finding
618//        a successful match or failing to find a match.
619//
620//   The code generated (whether as byte codes or native code) maintains
621//   some state as it runs.  This consists of the following elements:
622//
623//   * The capture registers.  Used for string captures.
624//   * Other registers.  Used for counters etc.
625//   * The current position.
626//   * The stack of backtracking information.  Used when a matching node
627//     fails to find a match and needs to try an alternative.
628//
629// Conceptual regular expression execution model:
630//
631//   There is a simple conceptual model of regular expression execution
632//   which will be presented first.  The actual code generated is a more
633//   efficient simulation of the simple conceptual model:
634//
635//   * Choice nodes are implemented as follows:
636//     For each choice except the last {
637//       push current position
638//       push backtrack code location
639//       <generate code to test for choice>
640//       backtrack code location:
641//       pop current position
642//     }
643//     <generate code to test for last choice>
644//
645//   * Actions nodes are generated as follows
646//     <push affected registers on backtrack stack>
647//     <generate code to perform action>
648//     push backtrack code location
649//     <generate code to test for following nodes>
650//     backtrack code location:
651//     <pop affected registers to restore their state>
652//     <pop backtrack location from stack and go to it>
653//
654//   * Matching nodes are generated as follows:
655//     if input string matches at current position
656//       update current position
657//       <generate code to test for following nodes>
658//     else
659//       <pop backtrack location from stack and go to it>
660//
661//   Thus it can be seen that the current position is saved and restored
662//   by the choice nodes, whereas the registers are saved and restored by
663//   by the action nodes that manipulate them.
664//
665//   The other interesting aspect of this model is that nodes are generated
666//   at the point where they are needed by a recursive call to Emit().  If
667//   the node has already been code generated then the Emit() call will
668//   generate a jump to the previously generated code instead.  In order to
669//   limit recursion it is possible for the Emit() function to put the node
670//   on a work list for later generation and instead generate a jump.  The
671//   destination of the jump is resolved later when the code is generated.
672//
673// Actual regular expression code generation.
674//
675//   Code generation is actually more complicated than the above.  In order
676//   to improve the efficiency of the generated code some optimizations are
677//   performed
678//
679//   * Choice nodes have 1-character lookahead.
680//     A choice node looks at the following character and eliminates some of
681//     the choices immediately based on that character.  This is not yet
682//     implemented.
683//   * Simple greedy loops store reduced backtracking information.
684//     A quantifier like /.*foo/m will greedily match the whole input.  It will
685//     then need to backtrack to a point where it can match "foo".  The naive
686//     implementation of this would push each character position onto the
687//     backtracking stack, then pop them off one by one.  This would use space
688//     proportional to the length of the input string.  However since the "."
689//     can only match in one way and always has a constant length (in this case
690//     of 1) it suffices to store the current position on the top of the stack
691//     once.  Matching now becomes merely incrementing the current position and
692//     backtracking becomes decrementing the current position and checking the
693//     result against the stored current position.  This is faster and saves
694//     space.
695//   * The current state is virtualized.
696//     This is used to defer expensive operations until it is clear that they
697//     are needed and to generate code for a node more than once, allowing
698//     specialized an efficient versions of the code to be created. This is
699//     explained in the section below.
700//
701// Execution state virtualization.
702//
703//   Instead of emitting code, nodes that manipulate the state can record their
704//   manipulation in an object called the Trace.  The Trace object can record a
705//   current position offset, an optional backtrack code location on the top of
706//   the virtualized backtrack stack and some register changes.  When a node is
707//   to be emitted it can flush the Trace or update it.  Flushing the Trace
708//   will emit code to bring the actual state into line with the virtual state.
709//   Avoiding flushing the state can postpone some work (e.g. updates of capture
710//   registers).  Postponing work can save time when executing the regular
711//   expression since it may be found that the work never has to be done as a
712//   failure to match can occur.  In addition it is much faster to jump to a
713//   known backtrack code location than it is to pop an unknown backtrack
714//   location from the stack and jump there.
715//
716//   The virtual state found in the Trace affects code generation.  For example
717//   the virtual state contains the difference between the actual current
718//   position and the virtual current position, and matching code needs to use
719//   this offset to attempt a match in the correct location of the input
720//   string.  Therefore code generated for a non-trivial trace is specialized
721//   to that trace.  The code generator therefore has the ability to generate
722//   code for each node several times.  In order to limit the size of the
723//   generated code there is an arbitrary limit on how many specialized sets of
724//   code may be generated for a given node.  If the limit is reached, the
725//   trace is flushed and a generic version of the code for a node is emitted.
726//   This is subsequently used for that node.  The code emitted for non-generic
727//   trace is not recorded in the node and so it cannot currently be reused in
728//   the event that code generation is requested for an identical trace.
729
730
731void RegExpTree::AppendToText(RegExpText* text) {
732  UNREACHABLE();
733}
734
735
736void RegExpAtom::AppendToText(RegExpText* text) {
737  text->AddElement(TextElement::Atom(this));
738}
739
740
741void RegExpCharacterClass::AppendToText(RegExpText* text) {
742  text->AddElement(TextElement::CharClass(this));
743}
744
745
746void RegExpText::AppendToText(RegExpText* text) {
747  for (int i = 0; i < elements()->length(); i++)
748    text->AddElement(elements()->at(i));
749}
750
751
752TextElement TextElement::Atom(RegExpAtom* atom) {
753  TextElement result = TextElement(ATOM);
754  result.data.u_atom = atom;
755  return result;
756}
757
758
759TextElement TextElement::CharClass(
760      RegExpCharacterClass* char_class) {
761  TextElement result = TextElement(CHAR_CLASS);
762  result.data.u_char_class = char_class;
763  return result;
764}
765
766
767int TextElement::length() {
768  if (type == ATOM) {
769    return data.u_atom->length();
770  } else {
771    ASSERT(type == CHAR_CLASS);
772    return 1;
773  }
774}
775
776
777DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
778  if (table_ == NULL) {
779    table_ = new DispatchTable();
780    DispatchTableConstructor cons(table_, ignore_case);
781    cons.BuildTable(this);
782  }
783  return table_;
784}
785
786
787class RegExpCompiler {
788 public:
789  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
790
791  int AllocateRegister() {
792    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
793      reg_exp_too_big_ = true;
794      return next_register_;
795    }
796    return next_register_++;
797  }
798
799  RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
800                                           RegExpNode* start,
801                                           int capture_count,
802                                           Handle<String> pattern);
803
804  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
805
806  static const int kImplementationOffset = 0;
807  static const int kNumberOfRegistersOffset = 0;
808  static const int kCodeOffset = 1;
809
810  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
811  EndNode* accept() { return accept_; }
812
813  static const int kMaxRecursion = 100;
814  inline int recursion_depth() { return recursion_depth_; }
815  inline void IncrementRecursionDepth() { recursion_depth_++; }
816  inline void DecrementRecursionDepth() { recursion_depth_--; }
817
818  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
819
820  inline bool ignore_case() { return ignore_case_; }
821  inline bool ascii() { return ascii_; }
822
823  int current_expansion_factor() { return current_expansion_factor_; }
824  void set_current_expansion_factor(int value) {
825    current_expansion_factor_ = value;
826  }
827
828  static const int kNoRegister = -1;
829
830 private:
831  EndNode* accept_;
832  int next_register_;
833  List<RegExpNode*>* work_list_;
834  int recursion_depth_;
835  RegExpMacroAssembler* macro_assembler_;
836  bool ignore_case_;
837  bool ascii_;
838  bool reg_exp_too_big_;
839  int current_expansion_factor_;
840};
841
842
843class RecursionCheck {
844 public:
845  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
846    compiler->IncrementRecursionDepth();
847  }
848  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
849 private:
850  RegExpCompiler* compiler_;
851};
852
853
854static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
855  return RegExpEngine::CompilationResult("RegExp too big");
856}
857
858
859// Attempts to compile the regexp using an Irregexp code generator.  Returns
860// a fixed array or a null handle depending on whether it succeeded.
861RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
862    : next_register_(2 * (capture_count + 1)),
863      work_list_(NULL),
864      recursion_depth_(0),
865      ignore_case_(ignore_case),
866      ascii_(ascii),
867      reg_exp_too_big_(false),
868      current_expansion_factor_(1) {
869  accept_ = new EndNode(EndNode::ACCEPT);
870  ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
871}
872
873
874RegExpEngine::CompilationResult RegExpCompiler::Assemble(
875    RegExpMacroAssembler* macro_assembler,
876    RegExpNode* start,
877    int capture_count,
878    Handle<String> pattern) {
879  Heap* heap = pattern->GetHeap();
880
881  bool use_slow_safe_regexp_compiler = false;
882  if (heap->total_regexp_code_generated() >
883          RegExpImpl::kRegWxpCompiledLimit &&
884      heap->isolate()->memory_allocator()->SizeExecutable() >
885          RegExpImpl::kRegExpExecutableMemoryLimit) {
886    use_slow_safe_regexp_compiler = true;
887  }
888
889  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
890
891#ifdef DEBUG
892  if (FLAG_trace_regexp_assembler)
893    macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
894  else
895#endif
896    macro_assembler_ = macro_assembler;
897
898  List <RegExpNode*> work_list(0);
899  work_list_ = &work_list;
900  Label fail;
901  macro_assembler_->PushBacktrack(&fail);
902  Trace new_trace;
903  start->Emit(this, &new_trace);
904  macro_assembler_->Bind(&fail);
905  macro_assembler_->Fail();
906  while (!work_list.is_empty()) {
907    work_list.RemoveLast()->Emit(this, &new_trace);
908  }
909  if (reg_exp_too_big_) return IrregexpRegExpTooBig();
910
911  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
912  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
913  work_list_ = NULL;
914#ifdef DEBUG
915  if (FLAG_print_code) {
916    Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
917  }
918  if (FLAG_trace_regexp_assembler) {
919    delete macro_assembler_;
920  }
921#endif
922  return RegExpEngine::CompilationResult(*code, next_register_);
923}
924
925
926bool Trace::DeferredAction::Mentions(int that) {
927  if (type() == ActionNode::CLEAR_CAPTURES) {
928    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
929    return range.Contains(that);
930  } else {
931    return reg() == that;
932  }
933}
934
935
936bool Trace::mentions_reg(int reg) {
937  for (DeferredAction* action = actions_;
938       action != NULL;
939       action = action->next()) {
940    if (action->Mentions(reg))
941      return true;
942  }
943  return false;
944}
945
946
947bool Trace::GetStoredPosition(int reg, int* cp_offset) {
948  ASSERT_EQ(0, *cp_offset);
949  for (DeferredAction* action = actions_;
950       action != NULL;
951       action = action->next()) {
952    if (action->Mentions(reg)) {
953      if (action->type() == ActionNode::STORE_POSITION) {
954        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
955        return true;
956      } else {
957        return false;
958      }
959    }
960  }
961  return false;
962}
963
964
965int Trace::FindAffectedRegisters(OutSet* affected_registers) {
966  int max_register = RegExpCompiler::kNoRegister;
967  for (DeferredAction* action = actions_;
968       action != NULL;
969       action = action->next()) {
970    if (action->type() == ActionNode::CLEAR_CAPTURES) {
971      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
972      for (int i = range.from(); i <= range.to(); i++)
973        affected_registers->Set(i);
974      if (range.to() > max_register) max_register = range.to();
975    } else {
976      affected_registers->Set(action->reg());
977      if (action->reg() > max_register) max_register = action->reg();
978    }
979  }
980  return max_register;
981}
982
983
984void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
985                                     int max_register,
986                                     OutSet& registers_to_pop,
987                                     OutSet& registers_to_clear) {
988  for (int reg = max_register; reg >= 0; reg--) {
989    if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
990    else if (registers_to_clear.Get(reg)) {
991      int clear_to = reg;
992      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
993        reg--;
994      }
995      assembler->ClearRegisters(reg, clear_to);
996    }
997  }
998}
999
1000
1001void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1002                                   int max_register,
1003                                   OutSet& affected_registers,
1004                                   OutSet* registers_to_pop,
1005                                   OutSet* registers_to_clear) {
1006  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1007  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1008
1009  // Count pushes performed to force a stack limit check occasionally.
1010  int pushes = 0;
1011
1012  for (int reg = 0; reg <= max_register; reg++) {
1013    if (!affected_registers.Get(reg)) {
1014      continue;
1015    }
1016
1017    // The chronologically first deferred action in the trace
1018    // is used to infer the action needed to restore a register
1019    // to its previous state (or not, if it's safe to ignore it).
1020    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1021    DeferredActionUndoType undo_action = IGNORE;
1022
1023    int value = 0;
1024    bool absolute = false;
1025    bool clear = false;
1026    int store_position = -1;
1027    // This is a little tricky because we are scanning the actions in reverse
1028    // historical order (newest first).
1029    for (DeferredAction* action = actions_;
1030         action != NULL;
1031         action = action->next()) {
1032      if (action->Mentions(reg)) {
1033        switch (action->type()) {
1034          case ActionNode::SET_REGISTER: {
1035            Trace::DeferredSetRegister* psr =
1036                static_cast<Trace::DeferredSetRegister*>(action);
1037            if (!absolute) {
1038              value += psr->value();
1039              absolute = true;
1040            }
1041            // SET_REGISTER is currently only used for newly introduced loop
1042            // counters. They can have a significant previous value if they
1043            // occour in a loop. TODO(lrn): Propagate this information, so
1044            // we can set undo_action to IGNORE if we know there is no value to
1045            // restore.
1046            undo_action = RESTORE;
1047            ASSERT_EQ(store_position, -1);
1048            ASSERT(!clear);
1049            break;
1050          }
1051          case ActionNode::INCREMENT_REGISTER:
1052            if (!absolute) {
1053              value++;
1054            }
1055            ASSERT_EQ(store_position, -1);
1056            ASSERT(!clear);
1057            undo_action = RESTORE;
1058            break;
1059          case ActionNode::STORE_POSITION: {
1060            Trace::DeferredCapture* pc =
1061                static_cast<Trace::DeferredCapture*>(action);
1062            if (!clear && store_position == -1) {
1063              store_position = pc->cp_offset();
1064            }
1065
1066            // For captures we know that stores and clears alternate.
1067            // Other register, are never cleared, and if the occur
1068            // inside a loop, they might be assigned more than once.
1069            if (reg <= 1) {
1070              // Registers zero and one, aka "capture zero", is
1071              // always set correctly if we succeed. There is no
1072              // need to undo a setting on backtrack, because we
1073              // will set it again or fail.
1074              undo_action = IGNORE;
1075            } else {
1076              undo_action = pc->is_capture() ? CLEAR : RESTORE;
1077            }
1078            ASSERT(!absolute);
1079            ASSERT_EQ(value, 0);
1080            break;
1081          }
1082          case ActionNode::CLEAR_CAPTURES: {
1083            // Since we're scanning in reverse order, if we've already
1084            // set the position we have to ignore historically earlier
1085            // clearing operations.
1086            if (store_position == -1) {
1087              clear = true;
1088            }
1089            undo_action = RESTORE;
1090            ASSERT(!absolute);
1091            ASSERT_EQ(value, 0);
1092            break;
1093          }
1094          default:
1095            UNREACHABLE();
1096            break;
1097        }
1098      }
1099    }
1100    // Prepare for the undo-action (e.g., push if it's going to be popped).
1101    if (undo_action == RESTORE) {
1102      pushes++;
1103      RegExpMacroAssembler::StackCheckFlag stack_check =
1104          RegExpMacroAssembler::kNoStackLimitCheck;
1105      if (pushes == push_limit) {
1106        stack_check = RegExpMacroAssembler::kCheckStackLimit;
1107        pushes = 0;
1108      }
1109
1110      assembler->PushRegister(reg, stack_check);
1111      registers_to_pop->Set(reg);
1112    } else if (undo_action == CLEAR) {
1113      registers_to_clear->Set(reg);
1114    }
1115    // Perform the chronologically last action (or accumulated increment)
1116    // for the register.
1117    if (store_position != -1) {
1118      assembler->WriteCurrentPositionToRegister(reg, store_position);
1119    } else if (clear) {
1120      assembler->ClearRegisters(reg, reg);
1121    } else if (absolute) {
1122      assembler->SetRegister(reg, value);
1123    } else if (value != 0) {
1124      assembler->AdvanceRegister(reg, value);
1125    }
1126  }
1127}
1128
1129
1130// This is called as we come into a loop choice node and some other tricky
1131// nodes.  It normalizes the state of the code generator to ensure we can
1132// generate generic code.
1133void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1134  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1135
1136  ASSERT(!is_trivial());
1137
1138  if (actions_ == NULL && backtrack() == NULL) {
1139    // Here we just have some deferred cp advances to fix and we are back to
1140    // a normal situation.  We may also have to forget some information gained
1141    // through a quick check that was already performed.
1142    if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1143    // Create a new trivial state and generate the node with that.
1144    Trace new_state;
1145    successor->Emit(compiler, &new_state);
1146    return;
1147  }
1148
1149  // Generate deferred actions here along with code to undo them again.
1150  OutSet affected_registers;
1151
1152  if (backtrack() != NULL) {
1153    // Here we have a concrete backtrack location.  These are set up by choice
1154    // nodes and so they indicate that we have a deferred save of the current
1155    // position which we may need to emit here.
1156    assembler->PushCurrentPosition();
1157  }
1158
1159  int max_register = FindAffectedRegisters(&affected_registers);
1160  OutSet registers_to_pop;
1161  OutSet registers_to_clear;
1162  PerformDeferredActions(assembler,
1163                         max_register,
1164                         affected_registers,
1165                         &registers_to_pop,
1166                         &registers_to_clear);
1167  if (cp_offset_ != 0) {
1168    assembler->AdvanceCurrentPosition(cp_offset_);
1169  }
1170
1171  // Create a new trivial state and generate the node with that.
1172  Label undo;
1173  assembler->PushBacktrack(&undo);
1174  Trace new_state;
1175  successor->Emit(compiler, &new_state);
1176
1177  // On backtrack we need to restore state.
1178  assembler->Bind(&undo);
1179  RestoreAffectedRegisters(assembler,
1180                           max_register,
1181                           registers_to_pop,
1182                           registers_to_clear);
1183  if (backtrack() == NULL) {
1184    assembler->Backtrack();
1185  } else {
1186    assembler->PopCurrentPosition();
1187    assembler->GoTo(backtrack());
1188  }
1189}
1190
1191
1192void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1193  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1194
1195  // Omit flushing the trace. We discard the entire stack frame anyway.
1196
1197  if (!label()->is_bound()) {
1198    // We are completely independent of the trace, since we ignore it,
1199    // so this code can be used as the generic version.
1200    assembler->Bind(label());
1201  }
1202
1203  // Throw away everything on the backtrack stack since the start
1204  // of the negative submatch and restore the character position.
1205  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1206  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1207  if (clear_capture_count_ > 0) {
1208    // Clear any captures that might have been performed during the success
1209    // of the body of the negative look-ahead.
1210    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1211    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1212  }
1213  // Now that we have unwound the stack we find at the top of the stack the
1214  // backtrack that the BeginSubmatch node got.
1215  assembler->Backtrack();
1216}
1217
1218
1219void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1220  if (!trace->is_trivial()) {
1221    trace->Flush(compiler, this);
1222    return;
1223  }
1224  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1225  if (!label()->is_bound()) {
1226    assembler->Bind(label());
1227  }
1228  switch (action_) {
1229    case ACCEPT:
1230      assembler->Succeed();
1231      return;
1232    case BACKTRACK:
1233      assembler->GoTo(trace->backtrack());
1234      return;
1235    case NEGATIVE_SUBMATCH_SUCCESS:
1236      // This case is handled in a different virtual method.
1237      UNREACHABLE();
1238  }
1239  UNIMPLEMENTED();
1240}
1241
1242
1243void GuardedAlternative::AddGuard(Guard* guard) {
1244  if (guards_ == NULL)
1245    guards_ = new ZoneList<Guard*>(1);
1246  guards_->Add(guard);
1247}
1248
1249
1250ActionNode* ActionNode::SetRegister(int reg,
1251                                    int val,
1252                                    RegExpNode* on_success) {
1253  ActionNode* result = new ActionNode(SET_REGISTER, on_success);
1254  result->data_.u_store_register.reg = reg;
1255  result->data_.u_store_register.value = val;
1256  return result;
1257}
1258
1259
1260ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1261  ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1262  result->data_.u_increment_register.reg = reg;
1263  return result;
1264}
1265
1266
1267ActionNode* ActionNode::StorePosition(int reg,
1268                                      bool is_capture,
1269                                      RegExpNode* on_success) {
1270  ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1271  result->data_.u_position_register.reg = reg;
1272  result->data_.u_position_register.is_capture = is_capture;
1273  return result;
1274}
1275
1276
1277ActionNode* ActionNode::ClearCaptures(Interval range,
1278                                      RegExpNode* on_success) {
1279  ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1280  result->data_.u_clear_captures.range_from = range.from();
1281  result->data_.u_clear_captures.range_to = range.to();
1282  return result;
1283}
1284
1285
1286ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1287                                      int position_reg,
1288                                      RegExpNode* on_success) {
1289  ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1290  result->data_.u_submatch.stack_pointer_register = stack_reg;
1291  result->data_.u_submatch.current_position_register = position_reg;
1292  return result;
1293}
1294
1295
1296ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1297                                                int position_reg,
1298                                                int clear_register_count,
1299                                                int clear_register_from,
1300                                                RegExpNode* on_success) {
1301  ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1302  result->data_.u_submatch.stack_pointer_register = stack_reg;
1303  result->data_.u_submatch.current_position_register = position_reg;
1304  result->data_.u_submatch.clear_register_count = clear_register_count;
1305  result->data_.u_submatch.clear_register_from = clear_register_from;
1306  return result;
1307}
1308
1309
1310ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1311                                        int repetition_register,
1312                                        int repetition_limit,
1313                                        RegExpNode* on_success) {
1314  ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1315  result->data_.u_empty_match_check.start_register = start_register;
1316  result->data_.u_empty_match_check.repetition_register = repetition_register;
1317  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1318  return result;
1319}
1320
1321
1322#define DEFINE_ACCEPT(Type)                                          \
1323  void Type##Node::Accept(NodeVisitor* visitor) {                    \
1324    visitor->Visit##Type(this);                                      \
1325  }
1326FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1327#undef DEFINE_ACCEPT
1328
1329
1330void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1331  visitor->VisitLoopChoice(this);
1332}
1333
1334
1335// -------------------------------------------------------------------
1336// Emit code.
1337
1338
1339void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1340                               Guard* guard,
1341                               Trace* trace) {
1342  switch (guard->op()) {
1343    case Guard::LT:
1344      ASSERT(!trace->mentions_reg(guard->reg()));
1345      macro_assembler->IfRegisterGE(guard->reg(),
1346                                    guard->value(),
1347                                    trace->backtrack());
1348      break;
1349    case Guard::GEQ:
1350      ASSERT(!trace->mentions_reg(guard->reg()));
1351      macro_assembler->IfRegisterLT(guard->reg(),
1352                                    guard->value(),
1353                                    trace->backtrack());
1354      break;
1355  }
1356}
1357
1358
1359// Returns the number of characters in the equivalence class, omitting those
1360// that cannot occur in the source string because it is ASCII.
1361static int GetCaseIndependentLetters(Isolate* isolate,
1362                                     uc16 character,
1363                                     bool ascii_subject,
1364                                     unibrow::uchar* letters) {
1365  int length =
1366      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1367  // Unibrow returns 0 or 1 for characters where case independence is
1368  // trivial.
1369  if (length == 0) {
1370    letters[0] = character;
1371    length = 1;
1372  }
1373  if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1374    return length;
1375  }
1376  // The standard requires that non-ASCII characters cannot have ASCII
1377  // character codes in their equivalence class.
1378  return 0;
1379}
1380
1381
1382static inline bool EmitSimpleCharacter(Isolate* isolate,
1383                                       RegExpCompiler* compiler,
1384                                       uc16 c,
1385                                       Label* on_failure,
1386                                       int cp_offset,
1387                                       bool check,
1388                                       bool preloaded) {
1389  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1390  bool bound_checked = false;
1391  if (!preloaded) {
1392    assembler->LoadCurrentCharacter(
1393        cp_offset,
1394        on_failure,
1395        check);
1396    bound_checked = true;
1397  }
1398  assembler->CheckNotCharacter(c, on_failure);
1399  return bound_checked;
1400}
1401
1402
1403// Only emits non-letters (things that don't have case).  Only used for case
1404// independent matches.
1405static inline bool EmitAtomNonLetter(Isolate* isolate,
1406                                     RegExpCompiler* compiler,
1407                                     uc16 c,
1408                                     Label* on_failure,
1409                                     int cp_offset,
1410                                     bool check,
1411                                     bool preloaded) {
1412  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1413  bool ascii = compiler->ascii();
1414  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1415  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1416  if (length < 1) {
1417    // This can't match.  Must be an ASCII subject and a non-ASCII character.
1418    // We do not need to do anything since the ASCII pass already handled this.
1419    return false;  // Bounds not checked.
1420  }
1421  bool checked = false;
1422  // We handle the length > 1 case in a later pass.
1423  if (length == 1) {
1424    if (ascii && c > String::kMaxAsciiCharCodeU) {
1425      // Can't match - see above.
1426      return false;  // Bounds not checked.
1427    }
1428    if (!preloaded) {
1429      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1430      checked = check;
1431    }
1432    macro_assembler->CheckNotCharacter(c, on_failure);
1433  }
1434  return checked;
1435}
1436
1437
1438static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1439                                      bool ascii,
1440                                      uc16 c1,
1441                                      uc16 c2,
1442                                      Label* on_failure) {
1443  uc16 char_mask;
1444  if (ascii) {
1445    char_mask = String::kMaxAsciiCharCode;
1446  } else {
1447    char_mask = String::kMaxUtf16CodeUnit;
1448  }
1449  uc16 exor = c1 ^ c2;
1450  // Check whether exor has only one bit set.
1451  if (((exor - 1) & exor) == 0) {
1452    // If c1 and c2 differ only by one bit.
1453    // Ecma262UnCanonicalize always gives the highest number last.
1454    ASSERT(c2 > c1);
1455    uc16 mask = char_mask ^ exor;
1456    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1457    return true;
1458  }
1459  ASSERT(c2 > c1);
1460  uc16 diff = c2 - c1;
1461  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1462    // If the characters differ by 2^n but don't differ by one bit then
1463    // subtract the difference from the found character, then do the or
1464    // trick.  We avoid the theoretical case where negative numbers are
1465    // involved in order to simplify code generation.
1466    uc16 mask = char_mask ^ diff;
1467    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1468                                                    diff,
1469                                                    mask,
1470                                                    on_failure);
1471    return true;
1472  }
1473  return false;
1474}
1475
1476
1477typedef bool EmitCharacterFunction(Isolate* isolate,
1478                                   RegExpCompiler* compiler,
1479                                   uc16 c,
1480                                   Label* on_failure,
1481                                   int cp_offset,
1482                                   bool check,
1483                                   bool preloaded);
1484
1485// Only emits letters (things that have case).  Only used for case independent
1486// matches.
1487static inline bool EmitAtomLetter(Isolate* isolate,
1488                                  RegExpCompiler* compiler,
1489                                  uc16 c,
1490                                  Label* on_failure,
1491                                  int cp_offset,
1492                                  bool check,
1493                                  bool preloaded) {
1494  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1495  bool ascii = compiler->ascii();
1496  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1497  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1498  if (length <= 1) return false;
1499  // We may not need to check against the end of the input string
1500  // if this character lies before a character that matched.
1501  if (!preloaded) {
1502    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1503  }
1504  Label ok;
1505  ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1506  switch (length) {
1507    case 2: {
1508      if (ShortCutEmitCharacterPair(macro_assembler,
1509                                    ascii,
1510                                    chars[0],
1511                                    chars[1],
1512                                    on_failure)) {
1513      } else {
1514        macro_assembler->CheckCharacter(chars[0], &ok);
1515        macro_assembler->CheckNotCharacter(chars[1], on_failure);
1516        macro_assembler->Bind(&ok);
1517      }
1518      break;
1519    }
1520    case 4:
1521      macro_assembler->CheckCharacter(chars[3], &ok);
1522      // Fall through!
1523    case 3:
1524      macro_assembler->CheckCharacter(chars[0], &ok);
1525      macro_assembler->CheckCharacter(chars[1], &ok);
1526      macro_assembler->CheckNotCharacter(chars[2], on_failure);
1527      macro_assembler->Bind(&ok);
1528      break;
1529    default:
1530      UNREACHABLE();
1531      break;
1532  }
1533  return true;
1534}
1535
1536
1537static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1538                          RegExpCharacterClass* cc,
1539                          bool ascii,
1540                          Label* on_failure,
1541                          int cp_offset,
1542                          bool check_offset,
1543                          bool preloaded) {
1544  ZoneList<CharacterRange>* ranges = cc->ranges();
1545  int max_char;
1546  if (ascii) {
1547    max_char = String::kMaxAsciiCharCode;
1548  } else {
1549    max_char = String::kMaxUtf16CodeUnit;
1550  }
1551
1552  Label success;
1553
1554  Label* char_is_in_class =
1555      cc->is_negated() ? on_failure : &success;
1556
1557  int range_count = ranges->length();
1558
1559  int last_valid_range = range_count - 1;
1560  while (last_valid_range >= 0) {
1561    CharacterRange& range = ranges->at(last_valid_range);
1562    if (range.from() <= max_char) {
1563      break;
1564    }
1565    last_valid_range--;
1566  }
1567
1568  if (last_valid_range < 0) {
1569    if (!cc->is_negated()) {
1570      // TODO(plesner): We can remove this when the node level does our
1571      // ASCII optimizations for us.
1572      macro_assembler->GoTo(on_failure);
1573    }
1574    if (check_offset) {
1575      macro_assembler->CheckPosition(cp_offset, on_failure);
1576    }
1577    return;
1578  }
1579
1580  if (last_valid_range == 0 &&
1581      !cc->is_negated() &&
1582      ranges->at(0).IsEverything(max_char)) {
1583    // This is a common case hit by non-anchored expressions.
1584    if (check_offset) {
1585      macro_assembler->CheckPosition(cp_offset, on_failure);
1586    }
1587    return;
1588  }
1589
1590  if (!preloaded) {
1591    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1592  }
1593
1594  if (cc->is_standard() &&
1595        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1596                                                    on_failure)) {
1597      return;
1598  }
1599
1600  for (int i = 0; i < last_valid_range; i++) {
1601    CharacterRange& range = ranges->at(i);
1602    Label next_range;
1603    uc16 from = range.from();
1604    uc16 to = range.to();
1605    if (from > max_char) {
1606      continue;
1607    }
1608    if (to > max_char) to = max_char;
1609    if (to == from) {
1610      macro_assembler->CheckCharacter(to, char_is_in_class);
1611    } else {
1612      if (from != 0) {
1613        macro_assembler->CheckCharacterLT(from, &next_range);
1614      }
1615      if (to != max_char) {
1616        macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1617      } else {
1618        macro_assembler->GoTo(char_is_in_class);
1619      }
1620    }
1621    macro_assembler->Bind(&next_range);
1622  }
1623
1624  CharacterRange& range = ranges->at(last_valid_range);
1625  uc16 from = range.from();
1626  uc16 to = range.to();
1627
1628  if (to > max_char) to = max_char;
1629  ASSERT(to >= from);
1630
1631  if (to == from) {
1632    if (cc->is_negated()) {
1633      macro_assembler->CheckCharacter(to, on_failure);
1634    } else {
1635      macro_assembler->CheckNotCharacter(to, on_failure);
1636    }
1637  } else {
1638    if (from != 0) {
1639      if (cc->is_negated()) {
1640        macro_assembler->CheckCharacterLT(from, &success);
1641      } else {
1642        macro_assembler->CheckCharacterLT(from, on_failure);
1643      }
1644    }
1645    if (to != String::kMaxUtf16CodeUnit) {
1646      if (cc->is_negated()) {
1647        macro_assembler->CheckCharacterLT(to + 1, on_failure);
1648      } else {
1649        macro_assembler->CheckCharacterGT(to, on_failure);
1650      }
1651    } else {
1652      if (cc->is_negated()) {
1653        macro_assembler->GoTo(on_failure);
1654      }
1655    }
1656  }
1657  macro_assembler->Bind(&success);
1658}
1659
1660
1661RegExpNode::~RegExpNode() {
1662}
1663
1664
1665RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1666                                                  Trace* trace) {
1667  // If we are generating a greedy loop then don't stop and don't reuse code.
1668  if (trace->stop_node() != NULL) {
1669    return CONTINUE;
1670  }
1671
1672  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1673  if (trace->is_trivial()) {
1674    if (label_.is_bound()) {
1675      // We are being asked to generate a generic version, but that's already
1676      // been done so just go to it.
1677      macro_assembler->GoTo(&label_);
1678      return DONE;
1679    }
1680    if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1681      // To avoid too deep recursion we push the node to the work queue and just
1682      // generate a goto here.
1683      compiler->AddWork(this);
1684      macro_assembler->GoTo(&label_);
1685      return DONE;
1686    }
1687    // Generate generic version of the node and bind the label for later use.
1688    macro_assembler->Bind(&label_);
1689    return CONTINUE;
1690  }
1691
1692  // We are being asked to make a non-generic version.  Keep track of how many
1693  // non-generic versions we generate so as not to overdo it.
1694  trace_count_++;
1695  if (FLAG_regexp_optimization &&
1696      trace_count_ < kMaxCopiesCodeGenerated &&
1697      compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1698    return CONTINUE;
1699  }
1700
1701  // If we get here code has been generated for this node too many times or
1702  // recursion is too deep.  Time to switch to a generic version.  The code for
1703  // generic versions above can handle deep recursion properly.
1704  trace->Flush(compiler, this);
1705  return DONE;
1706}
1707
1708
1709int ActionNode::EatsAtLeast(int still_to_find,
1710                            int recursion_depth,
1711                            bool not_at_start) {
1712  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1713  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
1714  return on_success()->EatsAtLeast(still_to_find,
1715                                   recursion_depth + 1,
1716                                   not_at_start);
1717}
1718
1719
1720int AssertionNode::EatsAtLeast(int still_to_find,
1721                               int recursion_depth,
1722                               bool not_at_start) {
1723  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1724  // If we know we are not at the start and we are asked "how many characters
1725  // will you match if you succeed?" then we can answer anything since false
1726  // implies false.  So lets just return the max answer (still_to_find) since
1727  // that won't prevent us from preloading a lot of characters for the other
1728  // branches in the node graph.
1729  if (type() == AT_START && not_at_start) return still_to_find;
1730  return on_success()->EatsAtLeast(still_to_find,
1731                                   recursion_depth + 1,
1732                                   not_at_start);
1733}
1734
1735
1736int BackReferenceNode::EatsAtLeast(int still_to_find,
1737                                   int recursion_depth,
1738                                   bool not_at_start) {
1739  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1740  return on_success()->EatsAtLeast(still_to_find,
1741                                   recursion_depth + 1,
1742                                   not_at_start);
1743}
1744
1745
1746int TextNode::EatsAtLeast(int still_to_find,
1747                          int recursion_depth,
1748                          bool not_at_start) {
1749  int answer = Length();
1750  if (answer >= still_to_find) return answer;
1751  if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1752  // We are not at start after this node so we set the last argument to 'true'.
1753  return answer + on_success()->EatsAtLeast(still_to_find - answer,
1754                                            recursion_depth + 1,
1755                                            true);
1756}
1757
1758
1759int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
1760                                             int recursion_depth,
1761                                             bool not_at_start) {
1762  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1763  // Alternative 0 is the negative lookahead, alternative 1 is what comes
1764  // afterwards.
1765  RegExpNode* node = alternatives_->at(1).node();
1766  return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
1767}
1768
1769
1770void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1771    QuickCheckDetails* details,
1772    RegExpCompiler* compiler,
1773    int filled_in,
1774    bool not_at_start) {
1775  // Alternative 0 is the negative lookahead, alternative 1 is what comes
1776  // afterwards.
1777  RegExpNode* node = alternatives_->at(1).node();
1778  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1779}
1780
1781
1782int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1783                                  int recursion_depth,
1784                                  RegExpNode* ignore_this_node,
1785                                  bool not_at_start) {
1786  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1787  int min = 100;
1788  int choice_count = alternatives_->length();
1789  for (int i = 0; i < choice_count; i++) {
1790    RegExpNode* node = alternatives_->at(i).node();
1791    if (node == ignore_this_node) continue;
1792    int node_eats_at_least = node->EatsAtLeast(still_to_find,
1793                                               recursion_depth + 1,
1794                                               not_at_start);
1795    if (node_eats_at_least < min) min = node_eats_at_least;
1796  }
1797  return min;
1798}
1799
1800
1801int LoopChoiceNode::EatsAtLeast(int still_to_find,
1802                                int recursion_depth,
1803                                bool not_at_start) {
1804  return EatsAtLeastHelper(still_to_find,
1805                           recursion_depth,
1806                           loop_node_,
1807                           not_at_start);
1808}
1809
1810
1811int ChoiceNode::EatsAtLeast(int still_to_find,
1812                            int recursion_depth,
1813                            bool not_at_start) {
1814  return EatsAtLeastHelper(still_to_find,
1815                           recursion_depth,
1816                           NULL,
1817                           not_at_start);
1818}
1819
1820
1821// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1822static inline uint32_t SmearBitsRight(uint32_t v) {
1823  v |= v >> 1;
1824  v |= v >> 2;
1825  v |= v >> 4;
1826  v |= v >> 8;
1827  v |= v >> 16;
1828  return v;
1829}
1830
1831
1832bool QuickCheckDetails::Rationalize(bool asc) {
1833  bool found_useful_op = false;
1834  uint32_t char_mask;
1835  if (asc) {
1836    char_mask = String::kMaxAsciiCharCode;
1837  } else {
1838    char_mask = String::kMaxUtf16CodeUnit;
1839  }
1840  mask_ = 0;
1841  value_ = 0;
1842  int char_shift = 0;
1843  for (int i = 0; i < characters_; i++) {
1844    Position* pos = &positions_[i];
1845    if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1846      found_useful_op = true;
1847    }
1848    mask_ |= (pos->mask & char_mask) << char_shift;
1849    value_ |= (pos->value & char_mask) << char_shift;
1850    char_shift += asc ? 8 : 16;
1851  }
1852  return found_useful_op;
1853}
1854
1855
1856bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1857                                Trace* trace,
1858                                bool preload_has_checked_bounds,
1859                                Label* on_possible_success,
1860                                QuickCheckDetails* details,
1861                                bool fall_through_on_failure) {
1862  if (details->characters() == 0) return false;
1863  GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1864  if (details->cannot_match()) return false;
1865  if (!details->Rationalize(compiler->ascii())) return false;
1866  ASSERT(details->characters() == 1 ||
1867         compiler->macro_assembler()->CanReadUnaligned());
1868  uint32_t mask = details->mask();
1869  uint32_t value = details->value();
1870
1871  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1872
1873  if (trace->characters_preloaded() != details->characters()) {
1874    assembler->LoadCurrentCharacter(trace->cp_offset(),
1875                                    trace->backtrack(),
1876                                    !preload_has_checked_bounds,
1877                                    details->characters());
1878  }
1879
1880
1881  bool need_mask = true;
1882
1883  if (details->characters() == 1) {
1884    // If number of characters preloaded is 1 then we used a byte or 16 bit
1885    // load so the value is already masked down.
1886    uint32_t char_mask;
1887    if (compiler->ascii()) {
1888      char_mask = String::kMaxAsciiCharCode;
1889    } else {
1890      char_mask = String::kMaxUtf16CodeUnit;
1891    }
1892    if ((mask & char_mask) == char_mask) need_mask = false;
1893    mask &= char_mask;
1894  } else {
1895    // For 2-character preloads in ASCII mode or 1-character preloads in
1896    // TWO_BYTE mode we also use a 16 bit load with zero extend.
1897    if (details->characters() == 2 && compiler->ascii()) {
1898      if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
1899    } else if (details->characters() == 1 && !compiler->ascii()) {
1900      if ((mask & 0xffff) == 0xffff) need_mask = false;
1901    } else {
1902      if (mask == 0xffffffff) need_mask = false;
1903    }
1904  }
1905
1906  if (fall_through_on_failure) {
1907    if (need_mask) {
1908      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1909    } else {
1910      assembler->CheckCharacter(value, on_possible_success);
1911    }
1912  } else {
1913    if (need_mask) {
1914      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1915    } else {
1916      assembler->CheckNotCharacter(value, trace->backtrack());
1917    }
1918  }
1919  return true;
1920}
1921
1922
1923// Here is the meat of GetQuickCheckDetails (see also the comment on the
1924// super-class in the .h file).
1925//
1926// We iterate along the text object, building up for each character a
1927// mask and value that can be used to test for a quick failure to match.
1928// The masks and values for the positions will be combined into a single
1929// machine word for the current character width in order to be used in
1930// generating a quick check.
1931void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1932                                    RegExpCompiler* compiler,
1933                                    int characters_filled_in,
1934                                    bool not_at_start) {
1935  Isolate* isolate = Isolate::Current();
1936  ASSERT(characters_filled_in < details->characters());
1937  int characters = details->characters();
1938  int char_mask;
1939  if (compiler->ascii()) {
1940    char_mask = String::kMaxAsciiCharCode;
1941  } else {
1942    char_mask = String::kMaxUtf16CodeUnit;
1943  }
1944  for (int k = 0; k < elms_->length(); k++) {
1945    TextElement elm = elms_->at(k);
1946    if (elm.type == TextElement::ATOM) {
1947      Vector<const uc16> quarks = elm.data.u_atom->data();
1948      for (int i = 0; i < characters && i < quarks.length(); i++) {
1949        QuickCheckDetails::Position* pos =
1950            details->positions(characters_filled_in);
1951        uc16 c = quarks[i];
1952        if (c > char_mask) {
1953          // If we expect a non-ASCII character from an ASCII string,
1954          // there is no way we can match. Not even case independent
1955          // matching can turn an ASCII character into non-ASCII or
1956          // vice versa.
1957          details->set_cannot_match();
1958          pos->determines_perfectly = false;
1959          return;
1960        }
1961        if (compiler->ignore_case()) {
1962          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1963          int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
1964                                                 chars);
1965          ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
1966          if (length == 1) {
1967            // This letter has no case equivalents, so it's nice and simple
1968            // and the mask-compare will determine definitely whether we have
1969            // a match at this character position.
1970            pos->mask = char_mask;
1971            pos->value = c;
1972            pos->determines_perfectly = true;
1973          } else {
1974            uint32_t common_bits = char_mask;
1975            uint32_t bits = chars[0];
1976            for (int j = 1; j < length; j++) {
1977              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1978              common_bits ^= differing_bits;
1979              bits &= common_bits;
1980            }
1981            // If length is 2 and common bits has only one zero in it then
1982            // our mask and compare instruction will determine definitely
1983            // whether we have a match at this character position.  Otherwise
1984            // it can only be an approximate check.
1985            uint32_t one_zero = (common_bits | ~char_mask);
1986            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1987              pos->determines_perfectly = true;
1988            }
1989            pos->mask = common_bits;
1990            pos->value = bits;
1991          }
1992        } else {
1993          // Don't ignore case.  Nice simple case where the mask-compare will
1994          // determine definitely whether we have a match at this character
1995          // position.
1996          pos->mask = char_mask;
1997          pos->value = c;
1998          pos->determines_perfectly = true;
1999        }
2000        characters_filled_in++;
2001        ASSERT(characters_filled_in <= details->characters());
2002        if (characters_filled_in == details->characters()) {
2003          return;
2004        }
2005      }
2006    } else {
2007      QuickCheckDetails::Position* pos =
2008          details->positions(characters_filled_in);
2009      RegExpCharacterClass* tree = elm.data.u_char_class;
2010      ZoneList<CharacterRange>* ranges = tree->ranges();
2011      if (tree->is_negated()) {
2012        // A quick check uses multi-character mask and compare.  There is no
2013        // useful way to incorporate a negative char class into this scheme
2014        // so we just conservatively create a mask and value that will always
2015        // succeed.
2016        pos->mask = 0;
2017        pos->value = 0;
2018      } else {
2019        int first_range = 0;
2020        while (ranges->at(first_range).from() > char_mask) {
2021          first_range++;
2022          if (first_range == ranges->length()) {
2023            details->set_cannot_match();
2024            pos->determines_perfectly = false;
2025            return;
2026          }
2027        }
2028        CharacterRange range = ranges->at(first_range);
2029        uc16 from = range.from();
2030        uc16 to = range.to();
2031        if (to > char_mask) {
2032          to = char_mask;
2033        }
2034        uint32_t differing_bits = (from ^ to);
2035        // A mask and compare is only perfect if the differing bits form a
2036        // number like 00011111 with one single block of trailing 1s.
2037        if ((differing_bits & (differing_bits + 1)) == 0 &&
2038             from + differing_bits == to) {
2039          pos->determines_perfectly = true;
2040        }
2041        uint32_t common_bits = ~SmearBitsRight(differing_bits);
2042        uint32_t bits = (from & common_bits);
2043        for (int i = first_range + 1; i < ranges->length(); i++) {
2044          CharacterRange range = ranges->at(i);
2045          uc16 from = range.from();
2046          uc16 to = range.to();
2047          if (from > char_mask) continue;
2048          if (to > char_mask) to = char_mask;
2049          // Here we are combining more ranges into the mask and compare
2050          // value.  With each new range the mask becomes more sparse and
2051          // so the chances of a false positive rise.  A character class
2052          // with multiple ranges is assumed never to be equivalent to a
2053          // mask and compare operation.
2054          pos->determines_perfectly = false;
2055          uint32_t new_common_bits = (from ^ to);
2056          new_common_bits = ~SmearBitsRight(new_common_bits);
2057          common_bits &= new_common_bits;
2058          bits &= new_common_bits;
2059          uint32_t differing_bits = (from & common_bits) ^ bits;
2060          common_bits ^= differing_bits;
2061          bits &= common_bits;
2062        }
2063        pos->mask = common_bits;
2064        pos->value = bits;
2065      }
2066      characters_filled_in++;
2067      ASSERT(characters_filled_in <= details->characters());
2068      if (characters_filled_in == details->characters()) {
2069        return;
2070      }
2071    }
2072  }
2073  ASSERT(characters_filled_in != details->characters());
2074  on_success()-> GetQuickCheckDetails(details,
2075                                      compiler,
2076                                      characters_filled_in,
2077                                      true);
2078}
2079
2080
2081void QuickCheckDetails::Clear() {
2082  for (int i = 0; i < characters_; i++) {
2083    positions_[i].mask = 0;
2084    positions_[i].value = 0;
2085    positions_[i].determines_perfectly = false;
2086  }
2087  characters_ = 0;
2088}
2089
2090
2091void QuickCheckDetails::Advance(int by, bool ascii) {
2092  ASSERT(by >= 0);
2093  if (by >= characters_) {
2094    Clear();
2095    return;
2096  }
2097  for (int i = 0; i < characters_ - by; i++) {
2098    positions_[i] = positions_[by + i];
2099  }
2100  for (int i = characters_ - by; i < characters_; i++) {
2101    positions_[i].mask = 0;
2102    positions_[i].value = 0;
2103    positions_[i].determines_perfectly = false;
2104  }
2105  characters_ -= by;
2106  // We could change mask_ and value_ here but we would never advance unless
2107  // they had already been used in a check and they won't be used again because
2108  // it would gain us nothing.  So there's no point.
2109}
2110
2111
2112void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2113  ASSERT(characters_ == other->characters_);
2114  if (other->cannot_match_) {
2115    return;
2116  }
2117  if (cannot_match_) {
2118    *this = *other;
2119    return;
2120  }
2121  for (int i = from_index; i < characters_; i++) {
2122    QuickCheckDetails::Position* pos = positions(i);
2123    QuickCheckDetails::Position* other_pos = other->positions(i);
2124    if (pos->mask != other_pos->mask ||
2125        pos->value != other_pos->value ||
2126        !other_pos->determines_perfectly) {
2127      // Our mask-compare operation will be approximate unless we have the
2128      // exact same operation on both sides of the alternation.
2129      pos->determines_perfectly = false;
2130    }
2131    pos->mask &= other_pos->mask;
2132    pos->value &= pos->mask;
2133    other_pos->value &= pos->mask;
2134    uc16 differing_bits = (pos->value ^ other_pos->value);
2135    pos->mask &= ~differing_bits;
2136    pos->value &= pos->mask;
2137  }
2138}
2139
2140
2141class VisitMarker {
2142 public:
2143  explicit VisitMarker(NodeInfo* info) : info_(info) {
2144    ASSERT(!info->visited);
2145    info->visited = true;
2146  }
2147  ~VisitMarker() {
2148    info_->visited = false;
2149  }
2150 private:
2151  NodeInfo* info_;
2152};
2153
2154
2155void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2156                                          RegExpCompiler* compiler,
2157                                          int characters_filled_in,
2158                                          bool not_at_start) {
2159  if (body_can_be_zero_length_ || info()->visited) return;
2160  VisitMarker marker(info());
2161  return ChoiceNode::GetQuickCheckDetails(details,
2162                                          compiler,
2163                                          characters_filled_in,
2164                                          not_at_start);
2165}
2166
2167
2168void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2169                                      RegExpCompiler* compiler,
2170                                      int characters_filled_in,
2171                                      bool not_at_start) {
2172  not_at_start = (not_at_start || not_at_start_);
2173  int choice_count = alternatives_->length();
2174  ASSERT(choice_count > 0);
2175  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2176                                                    compiler,
2177                                                    characters_filled_in,
2178                                                    not_at_start);
2179  for (int i = 1; i < choice_count; i++) {
2180    QuickCheckDetails new_details(details->characters());
2181    RegExpNode* node = alternatives_->at(i).node();
2182    node->GetQuickCheckDetails(&new_details, compiler,
2183                               characters_filled_in,
2184                               not_at_start);
2185    // Here we merge the quick match details of the two branches.
2186    details->Merge(&new_details, characters_filled_in);
2187  }
2188}
2189
2190
2191// Check for [0-9A-Z_a-z].
2192static void EmitWordCheck(RegExpMacroAssembler* assembler,
2193                          Label* word,
2194                          Label* non_word,
2195                          bool fall_through_on_word) {
2196  if (assembler->CheckSpecialCharacterClass(
2197          fall_through_on_word ? 'w' : 'W',
2198          fall_through_on_word ? non_word : word)) {
2199    // Optimized implementation available.
2200    return;
2201  }
2202  assembler->CheckCharacterGT('z', non_word);
2203  assembler->CheckCharacterLT('0', non_word);
2204  assembler->CheckCharacterGT('a' - 1, word);
2205  assembler->CheckCharacterLT('9' + 1, word);
2206  assembler->CheckCharacterLT('A', non_word);
2207  assembler->CheckCharacterLT('Z' + 1, word);
2208  if (fall_through_on_word) {
2209    assembler->CheckNotCharacter('_', non_word);
2210  } else {
2211    assembler->CheckCharacter('_', word);
2212  }
2213}
2214
2215
2216// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2217// that matches newline or the start of input).
2218static void EmitHat(RegExpCompiler* compiler,
2219                    RegExpNode* on_success,
2220                    Trace* trace) {
2221  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2222  // We will be loading the previous character into the current character
2223  // register.
2224  Trace new_trace(*trace);
2225  new_trace.InvalidateCurrentCharacter();
2226
2227  Label ok;
2228  if (new_trace.cp_offset() == 0) {
2229    // The start of input counts as a newline in this context, so skip to
2230    // ok if we are at the start.
2231    assembler->CheckAtStart(&ok);
2232  }
2233  // We already checked that we are not at the start of input so it must be
2234  // OK to load the previous character.
2235  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2236                                  new_trace.backtrack(),
2237                                  false);
2238  if (!assembler->CheckSpecialCharacterClass('n',
2239                                             new_trace.backtrack())) {
2240    // Newline means \n, \r, 0x2028 or 0x2029.
2241    if (!compiler->ascii()) {
2242      assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2243    }
2244    assembler->CheckCharacter('\n', &ok);
2245    assembler->CheckNotCharacter('\r', new_trace.backtrack());
2246  }
2247  assembler->Bind(&ok);
2248  on_success->Emit(compiler, &new_trace);
2249}
2250
2251
2252// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2253// when we know whether the next character must be a word character or not.
2254static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2255                                  RegExpCompiler* compiler,
2256                                  RegExpNode* on_success,
2257                                  Trace* trace) {
2258  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2259  Label done;
2260
2261  Trace new_trace(*trace);
2262
2263  bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2264  Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2265  Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2266
2267  // Check whether previous character was a word character.
2268  switch (trace->at_start()) {
2269    case Trace::TRUE:
2270      if (expect_word_character) {
2271        assembler->GoTo(on_non_word);
2272      }
2273      break;
2274    case Trace::UNKNOWN:
2275      ASSERT_EQ(0, trace->cp_offset());
2276      assembler->CheckAtStart(on_non_word);
2277      // Fall through.
2278    case Trace::FALSE:
2279      int prev_char_offset = trace->cp_offset() - 1;
2280      assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2281      EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2282      // We may or may not have loaded the previous character.
2283      new_trace.InvalidateCurrentCharacter();
2284  }
2285
2286  assembler->Bind(&done);
2287
2288  on_success->Emit(compiler, &new_trace);
2289}
2290
2291
2292// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2293static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2294                              RegExpCompiler* compiler,
2295                              RegExpNode* on_success,
2296                              Trace* trace) {
2297  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2298  Label before_non_word;
2299  Label before_word;
2300  if (trace->characters_preloaded() != 1) {
2301    assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2302  }
2303  // Fall through on non-word.
2304  EmitWordCheck(assembler, &before_word, &before_non_word, false);
2305
2306  // We will be loading the previous character into the current character
2307  // register.
2308  Trace new_trace(*trace);
2309  new_trace.InvalidateCurrentCharacter();
2310
2311  Label ok;
2312  Label* boundary;
2313  Label* not_boundary;
2314  if (type == AssertionNode::AT_BOUNDARY) {
2315    boundary = &ok;
2316    not_boundary = new_trace.backtrack();
2317  } else {
2318    not_boundary = &ok;
2319    boundary = new_trace.backtrack();
2320  }
2321
2322  // Next character is not a word character.
2323  assembler->Bind(&before_non_word);
2324  if (new_trace.cp_offset() == 0) {
2325    // The start of input counts as a non-word character, so the question is
2326    // decided if we are at the start.
2327    assembler->CheckAtStart(not_boundary);
2328  }
2329  // We already checked that we are not at the start of input so it must be
2330  // OK to load the previous character.
2331  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2332                                  &ok,  // Unused dummy label in this call.
2333                                  false);
2334  // Fall through on non-word.
2335  EmitWordCheck(assembler, boundary, not_boundary, false);
2336  assembler->GoTo(not_boundary);
2337
2338  // Next character is a word character.
2339  assembler->Bind(&before_word);
2340  if (new_trace.cp_offset() == 0) {
2341    // The start of input counts as a non-word character, so the question is
2342    // decided if we are at the start.
2343    assembler->CheckAtStart(boundary);
2344  }
2345  // We already checked that we are not at the start of input so it must be
2346  // OK to load the previous character.
2347  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2348                                  &ok,  // Unused dummy label in this call.
2349                                  false);
2350  bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2351  EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2352
2353  assembler->Bind(&ok);
2354
2355  on_success->Emit(compiler, &new_trace);
2356}
2357
2358
2359void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2360                                         RegExpCompiler* compiler,
2361                                         int filled_in,
2362                                         bool not_at_start) {
2363  if (type_ == AT_START && not_at_start) {
2364    details->set_cannot_match();
2365    return;
2366  }
2367  return on_success()->GetQuickCheckDetails(details,
2368                                            compiler,
2369                                            filled_in,
2370                                            not_at_start);
2371}
2372
2373
2374void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2375  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2376  switch (type_) {
2377    case AT_END: {
2378      Label ok;
2379      assembler->CheckPosition(trace->cp_offset(), &ok);
2380      assembler->GoTo(trace->backtrack());
2381      assembler->Bind(&ok);
2382      break;
2383    }
2384    case AT_START: {
2385      if (trace->at_start() == Trace::FALSE) {
2386        assembler->GoTo(trace->backtrack());
2387        return;
2388      }
2389      if (trace->at_start() == Trace::UNKNOWN) {
2390        assembler->CheckNotAtStart(trace->backtrack());
2391        Trace at_start_trace = *trace;
2392        at_start_trace.set_at_start(true);
2393        on_success()->Emit(compiler, &at_start_trace);
2394        return;
2395      }
2396    }
2397    break;
2398    case AFTER_NEWLINE:
2399      EmitHat(compiler, on_success(), trace);
2400      return;
2401    case AT_BOUNDARY:
2402    case AT_NON_BOUNDARY: {
2403      EmitBoundaryCheck(type_, compiler, on_success(), trace);
2404      return;
2405    }
2406    case AFTER_WORD_CHARACTER:
2407    case AFTER_NONWORD_CHARACTER: {
2408      EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2409    }
2410  }
2411  on_success()->Emit(compiler, trace);
2412}
2413
2414
2415static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2416  if (quick_check == NULL) return false;
2417  if (offset >= quick_check->characters()) return false;
2418  return quick_check->positions(offset)->determines_perfectly;
2419}
2420
2421
2422static void UpdateBoundsCheck(int index, int* checked_up_to) {
2423  if (index > *checked_up_to) {
2424    *checked_up_to = index;
2425  }
2426}
2427
2428
2429// We call this repeatedly to generate code for each pass over the text node.
2430// The passes are in increasing order of difficulty because we hope one
2431// of the first passes will fail in which case we are saved the work of the
2432// later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2433// we will check the '%' in the first pass, the case independent 'a' in the
2434// second pass and the character class in the last pass.
2435//
2436// The passes are done from right to left, so for example to test for /bar/
2437// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2438// and then a 'b' with offset 0.  This means we can avoid the end-of-input
2439// bounds check most of the time.  In the example we only need to check for
2440// end-of-input when loading the putative 'r'.
2441//
2442// A slight complication involves the fact that the first character may already
2443// be fetched into a register by the previous node.  In this case we want to
2444// do the test for that character first.  We do this in separate passes.  The
2445// 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2446// pass has been performed then subsequent passes will have true in
2447// first_element_checked to indicate that that character does not need to be
2448// checked again.
2449//
2450// In addition to all this we are passed a Trace, which can
2451// contain an AlternativeGeneration object.  In this AlternativeGeneration
2452// object we can see details of any quick check that was already passed in
2453// order to get to the code we are now generating.  The quick check can involve
2454// loading characters, which means we do not need to recheck the bounds
2455// up to the limit the quick check already checked.  In addition the quick
2456// check can have involved a mask and compare operation which may simplify
2457// or obviate the need for further checks at some character positions.
2458void TextNode::TextEmitPass(RegExpCompiler* compiler,
2459                            TextEmitPassType pass,
2460                            bool preloaded,
2461                            Trace* trace,
2462                            bool first_element_checked,
2463                            int* checked_up_to) {
2464  Isolate* isolate = Isolate::Current();
2465  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2466  bool ascii = compiler->ascii();
2467  Label* backtrack = trace->backtrack();
2468  QuickCheckDetails* quick_check = trace->quick_check_performed();
2469  int element_count = elms_->length();
2470  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2471    TextElement elm = elms_->at(i);
2472    int cp_offset = trace->cp_offset() + elm.cp_offset;
2473    if (elm.type == TextElement::ATOM) {
2474      Vector<const uc16> quarks = elm.data.u_atom->data();
2475      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2476        if (first_element_checked && i == 0 && j == 0) continue;
2477        if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2478        EmitCharacterFunction* emit_function = NULL;
2479        switch (pass) {
2480          case NON_ASCII_MATCH:
2481            ASSERT(ascii);
2482            if (quarks[j] > String::kMaxAsciiCharCode) {
2483              assembler->GoTo(backtrack);
2484              return;
2485            }
2486            break;
2487          case NON_LETTER_CHARACTER_MATCH:
2488            emit_function = &EmitAtomNonLetter;
2489            break;
2490          case SIMPLE_CHARACTER_MATCH:
2491            emit_function = &EmitSimpleCharacter;
2492            break;
2493          case CASE_CHARACTER_MATCH:
2494            emit_function = &EmitAtomLetter;
2495            break;
2496          default:
2497            break;
2498        }
2499        if (emit_function != NULL) {
2500          bool bound_checked = emit_function(isolate,
2501                                             compiler,
2502                                             quarks[j],
2503                                             backtrack,
2504                                             cp_offset + j,
2505                                             *checked_up_to < cp_offset + j,
2506                                             preloaded);
2507          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2508        }
2509      }
2510    } else {
2511      ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2512      if (pass == CHARACTER_CLASS_MATCH) {
2513        if (first_element_checked && i == 0) continue;
2514        if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2515        RegExpCharacterClass* cc = elm.data.u_char_class;
2516        EmitCharClass(assembler,
2517                      cc,
2518                      ascii,
2519                      backtrack,
2520                      cp_offset,
2521                      *checked_up_to < cp_offset,
2522                      preloaded);
2523        UpdateBoundsCheck(cp_offset, checked_up_to);
2524      }
2525    }
2526  }
2527}
2528
2529
2530int TextNode::Length() {
2531  TextElement elm = elms_->last();
2532  ASSERT(elm.cp_offset >= 0);
2533  if (elm.type == TextElement::ATOM) {
2534    return elm.cp_offset + elm.data.u_atom->data().length();
2535  } else {
2536    return elm.cp_offset + 1;
2537  }
2538}
2539
2540
2541bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2542  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2543  if (ignore_case) {
2544    return pass == SIMPLE_CHARACTER_MATCH;
2545  } else {
2546    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2547  }
2548}
2549
2550
2551// This generates the code to match a text node.  A text node can contain
2552// straight character sequences (possibly to be matched in a case-independent
2553// way) and character classes.  For efficiency we do not do this in a single
2554// pass from left to right.  Instead we pass over the text node several times,
2555// emitting code for some character positions every time.  See the comment on
2556// TextEmitPass for details.
2557void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2558  LimitResult limit_result = LimitVersions(compiler, trace);
2559  if (limit_result == DONE) return;
2560  ASSERT(limit_result == CONTINUE);
2561
2562  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2563    compiler->SetRegExpTooBig();
2564    return;
2565  }
2566
2567  if (compiler->ascii()) {
2568    int dummy = 0;
2569    TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2570  }
2571
2572  bool first_elt_done = false;
2573  int bound_checked_to = trace->cp_offset() - 1;
2574  bound_checked_to += trace->bound_checked_up_to();
2575
2576  // If a character is preloaded into the current character register then
2577  // check that now.
2578  if (trace->characters_preloaded() == 1) {
2579    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2580      if (!SkipPass(pass, compiler->ignore_case())) {
2581        TextEmitPass(compiler,
2582                     static_cast<TextEmitPassType>(pass),
2583                     true,
2584                     trace,
2585                     false,
2586                     &bound_checked_to);
2587      }
2588    }
2589    first_elt_done = true;
2590  }
2591
2592  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2593    if (!SkipPass(pass, compiler->ignore_case())) {
2594      TextEmitPass(compiler,
2595                   static_cast<TextEmitPassType>(pass),
2596                   false,
2597                   trace,
2598                   first_elt_done,
2599                   &bound_checked_to);
2600    }
2601  }
2602
2603  Trace successor_trace(*trace);
2604  successor_trace.set_at_start(false);
2605  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2606  RecursionCheck rc(compiler);
2607  on_success()->Emit(compiler, &successor_trace);
2608}
2609
2610
2611void Trace::InvalidateCurrentCharacter() {
2612  characters_preloaded_ = 0;
2613}
2614
2615
2616void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2617  ASSERT(by > 0);
2618  // We don't have an instruction for shifting the current character register
2619  // down or for using a shifted value for anything so lets just forget that
2620  // we preloaded any characters into it.
2621  characters_preloaded_ = 0;
2622  // Adjust the offsets of the quick check performed information.  This
2623  // information is used to find out what we already determined about the
2624  // characters by means of mask and compare.
2625  quick_check_performed_.Advance(by, compiler->ascii());
2626  cp_offset_ += by;
2627  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2628    compiler->SetRegExpTooBig();
2629    cp_offset_ = 0;
2630  }
2631  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2632}
2633
2634
2635void TextNode::MakeCaseIndependent(bool is_ascii) {
2636  int element_count = elms_->length();
2637  for (int i = 0; i < element_count; i++) {
2638    TextElement elm = elms_->at(i);
2639    if (elm.type == TextElement::CHAR_CLASS) {
2640      RegExpCharacterClass* cc = elm.data.u_char_class;
2641      // None of the standard character classes is different in the case
2642      // independent case and it slows us down if we don't know that.
2643      if (cc->is_standard()) continue;
2644      ZoneList<CharacterRange>* ranges = cc->ranges();
2645      int range_count = ranges->length();
2646      for (int j = 0; j < range_count; j++) {
2647        ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
2648      }
2649    }
2650  }
2651}
2652
2653
2654int TextNode::GreedyLoopTextLength() {
2655  TextElement elm = elms_->at(elms_->length() - 1);
2656  if (elm.type == TextElement::CHAR_CLASS) {
2657    return elm.cp_offset + 1;
2658  } else {
2659    return elm.cp_offset + elm.data.u_atom->data().length();
2660  }
2661}
2662
2663
2664// Finds the fixed match length of a sequence of nodes that goes from
2665// this alternative and back to this choice node.  If there are variable
2666// length nodes or other complications in the way then return a sentinel
2667// value indicating that a greedy loop cannot be constructed.
2668int ChoiceNode::GreedyLoopTextLengthForAlternative(
2669    GuardedAlternative* alternative) {
2670  int length = 0;
2671  RegExpNode* node = alternative->node();
2672  // Later we will generate code for all these text nodes using recursion
2673  // so we have to limit the max number.
2674  int recursion_depth = 0;
2675  while (node != this) {
2676    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2677      return kNodeIsTooComplexForGreedyLoops;
2678    }
2679    int node_length = node->GreedyLoopTextLength();
2680    if (node_length == kNodeIsTooComplexForGreedyLoops) {
2681      return kNodeIsTooComplexForGreedyLoops;
2682    }
2683    length += node_length;
2684    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2685    node = seq_node->on_success();
2686  }
2687  return length;
2688}
2689
2690
2691void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2692  ASSERT_EQ(loop_node_, NULL);
2693  AddAlternative(alt);
2694  loop_node_ = alt.node();
2695}
2696
2697
2698void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2699  ASSERT_EQ(continue_node_, NULL);
2700  AddAlternative(alt);
2701  continue_node_ = alt.node();
2702}
2703
2704
2705void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2706  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2707  if (trace->stop_node() == this) {
2708    int text_length =
2709        GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2710    ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2711    // Update the counter-based backtracking info on the stack.  This is an
2712    // optimization for greedy loops (see below).
2713    ASSERT(trace->cp_offset() == text_length);
2714    macro_assembler->AdvanceCurrentPosition(text_length);
2715    macro_assembler->GoTo(trace->loop_label());
2716    return;
2717  }
2718  ASSERT(trace->stop_node() == NULL);
2719  if (!trace->is_trivial()) {
2720    trace->Flush(compiler, this);
2721    return;
2722  }
2723  ChoiceNode::Emit(compiler, trace);
2724}
2725
2726
2727int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2728                                           bool not_at_start) {
2729  int preload_characters = EatsAtLeast(4, 0, not_at_start);
2730  if (compiler->macro_assembler()->CanReadUnaligned()) {
2731    bool ascii = compiler->ascii();
2732    if (ascii) {
2733      if (preload_characters > 4) preload_characters = 4;
2734      // We can't preload 3 characters because there is no machine instruction
2735      // to do that.  We can't just load 4 because we could be reading
2736      // beyond the end of the string, which could cause a memory fault.
2737      if (preload_characters == 3) preload_characters = 2;
2738    } else {
2739      if (preload_characters > 2) preload_characters = 2;
2740    }
2741  } else {
2742    if (preload_characters > 1) preload_characters = 1;
2743  }
2744  return preload_characters;
2745}
2746
2747
2748// This class is used when generating the alternatives in a choice node.  It
2749// records the way the alternative is being code generated.
2750class AlternativeGeneration: public Malloced {
2751 public:
2752  AlternativeGeneration()
2753      : possible_success(),
2754        expects_preload(false),
2755        after(),
2756        quick_check_details() { }
2757  Label possible_success;
2758  bool expects_preload;
2759  Label after;
2760  QuickCheckDetails quick_check_details;
2761};
2762
2763
2764// Creates a list of AlternativeGenerations.  If the list has a reasonable
2765// size then it is on the stack, otherwise the excess is on the heap.
2766class AlternativeGenerationList {
2767 public:
2768  explicit AlternativeGenerationList(int count)
2769      : alt_gens_(count) {
2770    for (int i = 0; i < count && i < kAFew; i++) {
2771      alt_gens_.Add(a_few_alt_gens_ + i);
2772    }
2773    for (int i = kAFew; i < count; i++) {
2774      alt_gens_.Add(new AlternativeGeneration());
2775    }
2776  }
2777  ~AlternativeGenerationList() {
2778    for (int i = kAFew; i < alt_gens_.length(); i++) {
2779      delete alt_gens_[i];
2780      alt_gens_[i] = NULL;
2781    }
2782  }
2783
2784  AlternativeGeneration* at(int i) {
2785    return alt_gens_[i];
2786  }
2787
2788 private:
2789  static const int kAFew = 10;
2790  ZoneList<AlternativeGeneration*> alt_gens_;
2791  AlternativeGeneration a_few_alt_gens_[kAFew];
2792};
2793
2794
2795/* Code generation for choice nodes.
2796 *
2797 * We generate quick checks that do a mask and compare to eliminate a
2798 * choice.  If the quick check succeeds then it jumps to the continuation to
2799 * do slow checks and check subsequent nodes.  If it fails (the common case)
2800 * it falls through to the next choice.
2801 *
2802 * Here is the desired flow graph.  Nodes directly below each other imply
2803 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2804 * 3 doesn't have a quick check so we have to call the slow check.
2805 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2806 * regexp continuation is generated directly after the Sn node, up to the
2807 * next GoTo if we decide to reuse some already generated code.  Some
2808 * nodes expect preload_characters to be preloaded into the current
2809 * character register.  R nodes do this preloading.  Vertices are marked
2810 * F for failures and S for success (possible success in the case of quick
2811 * nodes).  L, V, < and > are used as arrow heads.
2812 *
2813 * ----------> R
2814 *             |
2815 *             V
2816 *            Q1 -----> S1
2817 *             |   S   /
2818 *            F|      /
2819 *             |    F/
2820 *             |    /
2821 *             |   R
2822 *             |  /
2823 *             V L
2824 *            Q2 -----> S2
2825 *             |   S   /
2826 *            F|      /
2827 *             |    F/
2828 *             |    /
2829 *             |   R
2830 *             |  /
2831 *             V L
2832 *            S3
2833 *             |
2834 *            F|
2835 *             |
2836 *             R
2837 *             |
2838 * backtrack   V
2839 * <----------Q4
2840 *   \    F    |
2841 *    \        |S
2842 *     \   F   V
2843 *      \-----S4
2844 *
2845 * For greedy loops we reverse our expectation and expect to match rather
2846 * than fail. Therefore we want the loop code to look like this (U is the
2847 * unwind code that steps back in the greedy loop).  The following alternatives
2848 * look the same as above.
2849 *              _____
2850 *             /     \
2851 *             V     |
2852 * ----------> S1    |
2853 *            /|     |
2854 *           / |S    |
2855 *         F/  \_____/
2856 *         /
2857 *        |<-----------
2858 *        |            \
2859 *        V             \
2860 *        Q2 ---> S2     \
2861 *        |  S   /       |
2862 *       F|     /        |
2863 *        |   F/         |
2864 *        |   /          |
2865 *        |  R           |
2866 *        | /            |
2867 *   F    VL             |
2868 * <------U              |
2869 * back   |S             |
2870 *        \______________/
2871 */
2872
2873
2874void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2875  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2876  int choice_count = alternatives_->length();
2877#ifdef DEBUG
2878  for (int i = 0; i < choice_count - 1; i++) {
2879    GuardedAlternative alternative = alternatives_->at(i);
2880    ZoneList<Guard*>* guards = alternative.guards();
2881    int guard_count = (guards == NULL) ? 0 : guards->length();
2882    for (int j = 0; j < guard_count; j++) {
2883      ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
2884    }
2885  }
2886#endif
2887
2888  LimitResult limit_result = LimitVersions(compiler, trace);
2889  if (limit_result == DONE) return;
2890  ASSERT(limit_result == CONTINUE);
2891
2892  int new_flush_budget = trace->flush_budget() / choice_count;
2893  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2894    trace->Flush(compiler, this);
2895    return;
2896  }
2897
2898  RecursionCheck rc(compiler);
2899
2900  Trace* current_trace = trace;
2901
2902  int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2903  bool greedy_loop = false;
2904  Label greedy_loop_label;
2905  Trace counter_backtrack_trace;
2906  counter_backtrack_trace.set_backtrack(&greedy_loop_label);
2907  if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2908
2909  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2910    // Here we have special handling for greedy loops containing only text nodes
2911    // and other simple nodes.  These are handled by pushing the current
2912    // position on the stack and then incrementing the current position each
2913    // time around the switch.  On backtrack we decrement the current position
2914    // and check it against the pushed value.  This avoids pushing backtrack
2915    // information for each iteration of the loop, which could take up a lot of
2916    // space.
2917    greedy_loop = true;
2918    ASSERT(trace->stop_node() == NULL);
2919    macro_assembler->PushCurrentPosition();
2920    current_trace = &counter_backtrack_trace;
2921    Label greedy_match_failed;
2922    Trace greedy_match_trace;
2923    if (not_at_start()) greedy_match_trace.set_at_start(false);
2924    greedy_match_trace.set_backtrack(&greedy_match_failed);
2925    Label loop_label;
2926    macro_assembler->Bind(&loop_label);
2927    greedy_match_trace.set_stop_node(this);
2928    greedy_match_trace.set_loop_label(&loop_label);
2929    alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
2930    macro_assembler->Bind(&greedy_match_failed);
2931  }
2932
2933  Label second_choice;  // For use in greedy matches.
2934  macro_assembler->Bind(&second_choice);
2935
2936  int first_normal_choice = greedy_loop ? 1 : 0;
2937
2938  int preload_characters =
2939      CalculatePreloadCharacters(compiler,
2940                                 current_trace->at_start() == Trace::FALSE);
2941  bool preload_is_current =
2942      (current_trace->characters_preloaded() == preload_characters);
2943  bool preload_has_checked_bounds = preload_is_current;
2944
2945  AlternativeGenerationList alt_gens(choice_count);
2946
2947  // For now we just call all choices one after the other.  The idea ultimately
2948  // is to use the Dispatch table to try only the relevant ones.
2949  for (int i = first_normal_choice; i < choice_count; i++) {
2950    GuardedAlternative alternative = alternatives_->at(i);
2951    AlternativeGeneration* alt_gen = alt_gens.at(i);
2952    alt_gen->quick_check_details.set_characters(preload_characters);
2953    ZoneList<Guard*>* guards = alternative.guards();
2954    int guard_count = (guards == NULL) ? 0 : guards->length();
2955    Trace new_trace(*current_trace);
2956    new_trace.set_characters_preloaded(preload_is_current ?
2957                                         preload_characters :
2958                                         0);
2959    if (preload_has_checked_bounds) {
2960      new_trace.set_bound_checked_up_to(preload_characters);
2961    }
2962    new_trace.quick_check_performed()->Clear();
2963    if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
2964    alt_gen->expects_preload = preload_is_current;
2965    bool generate_full_check_inline = false;
2966    if (FLAG_regexp_optimization &&
2967        try_to_emit_quick_check_for_alternative(i) &&
2968        alternative.node()->EmitQuickCheck(compiler,
2969                                           &new_trace,
2970                                           preload_has_checked_bounds,
2971                                           &alt_gen->possible_success,
2972                                           &alt_gen->quick_check_details,
2973                                           i < choice_count - 1)) {
2974      // Quick check was generated for this choice.
2975      preload_is_current = true;
2976      preload_has_checked_bounds = true;
2977      // On the last choice in the ChoiceNode we generated the quick
2978      // check to fall through on possible success.  So now we need to
2979      // generate the full check inline.
2980      if (i == choice_count - 1) {
2981        macro_assembler->Bind(&alt_gen->possible_success);
2982        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2983        new_trace.set_characters_preloaded(preload_characters);
2984        new_trace.set_bound_checked_up_to(preload_characters);
2985        generate_full_check_inline = true;
2986      }
2987    } else if (alt_gen->quick_check_details.cannot_match()) {
2988      if (i == choice_count - 1 && !greedy_loop) {
2989        macro_assembler->GoTo(trace->backtrack());
2990      }
2991      continue;
2992    } else {
2993      // No quick check was generated.  Put the full code here.
2994      // If this is not the first choice then there could be slow checks from
2995      // previous cases that go here when they fail.  There's no reason to
2996      // insist that they preload characters since the slow check we are about
2997      // to generate probably can't use it.
2998      if (i != first_normal_choice) {
2999        alt_gen->expects_preload = false;
3000        new_trace.InvalidateCurrentCharacter();
3001      }
3002      if (i < choice_count - 1) {
3003        new_trace.set_backtrack(&alt_gen->after);
3004      }
3005      generate_full_check_inline = true;
3006    }
3007    if (generate_full_check_inline) {
3008      if (new_trace.actions() != NULL) {
3009        new_trace.set_flush_budget(new_flush_budget);
3010      }
3011      for (int j = 0; j < guard_count; j++) {
3012        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3013      }
3014      alternative.node()->Emit(compiler, &new_trace);
3015      preload_is_current = false;
3016    }
3017    macro_assembler->Bind(&alt_gen->after);
3018  }
3019  if (greedy_loop) {
3020    macro_assembler->Bind(&greedy_loop_label);
3021    // If we have unwound to the bottom then backtrack.
3022    macro_assembler->CheckGreedyLoop(trace->backtrack());
3023    // Otherwise try the second priority at an earlier position.
3024    macro_assembler->AdvanceCurrentPosition(-text_length);
3025    macro_assembler->GoTo(&second_choice);
3026  }
3027
3028  // At this point we need to generate slow checks for the alternatives where
3029  // the quick check was inlined.  We can recognize these because the associated
3030  // label was bound.
3031  for (int i = first_normal_choice; i < choice_count - 1; i++) {
3032    AlternativeGeneration* alt_gen = alt_gens.at(i);
3033    Trace new_trace(*current_trace);
3034    // If there are actions to be flushed we have to limit how many times
3035    // they are flushed.  Take the budget of the parent trace and distribute
3036    // it fairly amongst the children.
3037    if (new_trace.actions() != NULL) {
3038      new_trace.set_flush_budget(new_flush_budget);
3039    }
3040    EmitOutOfLineContinuation(compiler,
3041                              &new_trace,
3042                              alternatives_->at(i),
3043                              alt_gen,
3044                              preload_characters,
3045                              alt_gens.at(i + 1)->expects_preload);
3046  }
3047}
3048
3049
3050void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3051                                           Trace* trace,
3052                                           GuardedAlternative alternative,
3053                                           AlternativeGeneration* alt_gen,
3054                                           int preload_characters,
3055                                           bool next_expects_preload) {
3056  if (!alt_gen->possible_success.is_linked()) return;
3057
3058  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3059  macro_assembler->Bind(&alt_gen->possible_success);
3060  Trace out_of_line_trace(*trace);
3061  out_of_line_trace.set_characters_preloaded(preload_characters);
3062  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3063  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
3064  ZoneList<Guard*>* guards = alternative.guards();
3065  int guard_count = (guards == NULL) ? 0 : guards->length();
3066  if (next_expects_preload) {
3067    Label reload_current_char;
3068    out_of_line_trace.set_backtrack(&reload_current_char);
3069    for (int j = 0; j < guard_count; j++) {
3070      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3071    }
3072    alternative.node()->Emit(compiler, &out_of_line_trace);
3073    macro_assembler->Bind(&reload_current_char);
3074    // Reload the current character, since the next quick check expects that.
3075    // We don't need to check bounds here because we only get into this
3076    // code through a quick check which already did the checked load.
3077    macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
3078                                          NULL,
3079                                          false,
3080                                          preload_characters);
3081    macro_assembler->GoTo(&(alt_gen->after));
3082  } else {
3083    out_of_line_trace.set_backtrack(&(alt_gen->after));
3084    for (int j = 0; j < guard_count; j++) {
3085      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3086    }
3087    alternative.node()->Emit(compiler, &out_of_line_trace);
3088  }
3089}
3090
3091
3092void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3093  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3094  LimitResult limit_result = LimitVersions(compiler, trace);
3095  if (limit_result == DONE) return;
3096  ASSERT(limit_result == CONTINUE);
3097
3098  RecursionCheck rc(compiler);
3099
3100  switch (type_) {
3101    case STORE_POSITION: {
3102      Trace::DeferredCapture
3103          new_capture(data_.u_position_register.reg,
3104                      data_.u_position_register.is_capture,
3105                      trace);
3106      Trace new_trace = *trace;
3107      new_trace.add_action(&new_capture);
3108      on_success()->Emit(compiler, &new_trace);
3109      break;
3110    }
3111    case INCREMENT_REGISTER: {
3112      Trace::DeferredIncrementRegister
3113          new_increment(data_.u_increment_register.reg);
3114      Trace new_trace = *trace;
3115      new_trace.add_action(&new_increment);
3116      on_success()->Emit(compiler, &new_trace);
3117      break;
3118    }
3119    case SET_REGISTER: {
3120      Trace::DeferredSetRegister
3121          new_set(data_.u_store_register.reg, data_.u_store_register.value);
3122      Trace new_trace = *trace;
3123      new_trace.add_action(&new_set);
3124      on_success()->Emit(compiler, &new_trace);
3125      break;
3126    }
3127    case CLEAR_CAPTURES: {
3128      Trace::DeferredClearCaptures
3129        new_capture(Interval(data_.u_clear_captures.range_from,
3130                             data_.u_clear_captures.range_to));
3131      Trace new_trace = *trace;
3132      new_trace.add_action(&new_capture);
3133      on_success()->Emit(compiler, &new_trace);
3134      break;
3135    }
3136    case BEGIN_SUBMATCH:
3137      if (!trace->is_trivial()) {
3138        trace->Flush(compiler, this);
3139      } else {
3140        assembler->WriteCurrentPositionToRegister(
3141            data_.u_submatch.current_position_register, 0);
3142        assembler->WriteStackPointerToRegister(
3143            data_.u_submatch.stack_pointer_register);
3144        on_success()->Emit(compiler, trace);
3145      }
3146      break;
3147    case EMPTY_MATCH_CHECK: {
3148      int start_pos_reg = data_.u_empty_match_check.start_register;
3149      int stored_pos = 0;
3150      int rep_reg = data_.u_empty_match_check.repetition_register;
3151      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3152      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3153      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3154        // If we know we haven't advanced and there is no minimum we
3155        // can just backtrack immediately.
3156        assembler->GoTo(trace->backtrack());
3157      } else if (know_dist && stored_pos < trace->cp_offset()) {
3158        // If we know we've advanced we can generate the continuation
3159        // immediately.
3160        on_success()->Emit(compiler, trace);
3161      } else if (!trace->is_trivial()) {
3162        trace->Flush(compiler, this);
3163      } else {
3164        Label skip_empty_check;
3165        // If we have a minimum number of repetitions we check the current
3166        // number first and skip the empty check if it's not enough.
3167        if (has_minimum) {
3168          int limit = data_.u_empty_match_check.repetition_limit;
3169          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3170        }
3171        // If the match is empty we bail out, otherwise we fall through
3172        // to the on-success continuation.
3173        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3174                                   trace->backtrack());
3175        assembler->Bind(&skip_empty_check);
3176        on_success()->Emit(compiler, trace);
3177      }
3178      break;
3179    }
3180    case POSITIVE_SUBMATCH_SUCCESS: {
3181      if (!trace->is_trivial()) {
3182        trace->Flush(compiler, this);
3183        return;
3184      }
3185      assembler->ReadCurrentPositionFromRegister(
3186          data_.u_submatch.current_position_register);
3187      assembler->ReadStackPointerFromRegister(
3188          data_.u_submatch.stack_pointer_register);
3189      int clear_register_count = data_.u_submatch.clear_register_count;
3190      if (clear_register_count == 0) {
3191        on_success()->Emit(compiler, trace);
3192        return;
3193      }
3194      int clear_registers_from = data_.u_submatch.clear_register_from;
3195      Label clear_registers_backtrack;
3196      Trace new_trace = *trace;
3197      new_trace.set_backtrack(&clear_registers_backtrack);
3198      on_success()->Emit(compiler, &new_trace);
3199
3200      assembler->Bind(&clear_registers_backtrack);
3201      int clear_registers_to = clear_registers_from + clear_register_count - 1;
3202      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3203
3204      ASSERT(trace->backtrack() == NULL);
3205      assembler->Backtrack();
3206      return;
3207    }
3208    default:
3209      UNREACHABLE();
3210  }
3211}
3212
3213
3214void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3215  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3216  if (!trace->is_trivial()) {
3217    trace->Flush(compiler, this);
3218    return;
3219  }
3220
3221  LimitResult limit_result = LimitVersions(compiler, trace);
3222  if (limit_result == DONE) return;
3223  ASSERT(limit_result == CONTINUE);
3224
3225  RecursionCheck rc(compiler);
3226
3227  ASSERT_EQ(start_reg_ + 1, end_reg_);
3228  if (compiler->ignore_case()) {
3229    assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3230                                               trace->backtrack());
3231  } else {
3232    assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3233  }
3234  on_success()->Emit(compiler, trace);
3235}
3236
3237
3238// -------------------------------------------------------------------
3239// Dot/dotty output
3240
3241
3242#ifdef DEBUG
3243
3244
3245class DotPrinter: public NodeVisitor {
3246 public:
3247  explicit DotPrinter(bool ignore_case)
3248      : ignore_case_(ignore_case),
3249        stream_(&alloc_) { }
3250  void PrintNode(const char* label, RegExpNode* node);
3251  void Visit(RegExpNode* node);
3252  void PrintAttributes(RegExpNode* from);
3253  StringStream* stream() { return &stream_; }
3254  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3255#define DECLARE_VISIT(Type)                                          \
3256  virtual void Visit##Type(Type##Node* that);
3257FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3258#undef DECLARE_VISIT
3259 private:
3260  bool ignore_case_;
3261  HeapStringAllocator alloc_;
3262  StringStream stream_;
3263};
3264
3265
3266void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3267  stream()->Add("digraph G {\n  graph [label=\"");
3268  for (int i = 0; label[i]; i++) {
3269    switch (label[i]) {
3270      case '\\':
3271        stream()->Add("\\\\");
3272        break;
3273      case '"':
3274        stream()->Add("\"");
3275        break;
3276      default:
3277        stream()->Put(label[i]);
3278        break;
3279    }
3280  }
3281  stream()->Add("\"];\n");
3282  Visit(node);
3283  stream()->Add("}\n");
3284  printf("%s", *(stream()->ToCString()));
3285}
3286
3287
3288void DotPrinter::Visit(RegExpNode* node) {
3289  if (node->info()->visited) return;
3290  node->info()->visited = true;
3291  node->Accept(this);
3292}
3293
3294
3295void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3296  stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
3297  Visit(on_failure);
3298}
3299
3300
3301class TableEntryBodyPrinter {
3302 public:
3303  TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3304      : stream_(stream), choice_(choice) { }
3305  void Call(uc16 from, DispatchTable::Entry entry) {
3306    OutSet* out_set = entry.out_set();
3307    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3308      if (out_set->Get(i)) {
3309        stream()->Add("    n%p:s%io%i -> n%p;\n",
3310                      choice(),
3311                      from,
3312                      i,
3313                      choice()->alternatives()->at(i).node());
3314      }
3315    }
3316  }
3317 private:
3318  StringStream* stream() { return stream_; }
3319  ChoiceNode* choice() { return choice_; }
3320  StringStream* stream_;
3321  ChoiceNode* choice_;
3322};
3323
3324
3325class TableEntryHeaderPrinter {
3326 public:
3327  explicit TableEntryHeaderPrinter(StringStream* stream)
3328      : first_(true), stream_(stream) { }
3329  void Call(uc16 from, DispatchTable::Entry entry) {
3330    if (first_) {
3331      first_ = false;
3332    } else {
3333      stream()->Add("|");
3334    }
3335    stream()->Add("{\\%k-\\%k|{", from, entry.to());
3336    OutSet* out_set = entry.out_set();
3337    int priority = 0;
3338    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3339      if (out_set->Get(i)) {
3340        if (priority > 0) stream()->Add("|");
3341        stream()->Add("<s%io%i> %i", from, i, priority);
3342        priority++;
3343      }
3344    }
3345    stream()->Add("}}");
3346  }
3347
3348 private:
3349  bool first_;
3350  StringStream* stream() { return stream_; }
3351  StringStream* stream_;
3352};
3353
3354
3355class AttributePrinter {
3356 public:
3357  explicit AttributePrinter(DotPrinter* out)
3358      : out_(out), first_(true) { }
3359  void PrintSeparator() {
3360    if (first_) {
3361      first_ = false;
3362    } else {
3363      out_->stream()->Add("|");
3364    }
3365  }
3366  void PrintBit(const char* name, bool value) {
3367    if (!value) return;
3368    PrintSeparator();
3369    out_->stream()->Add("{%s}", name);
3370  }
3371  void PrintPositive(const char* name, int value) {
3372    if (value < 0) return;
3373    PrintSeparator();
3374    out_->stream()->Add("{%s|%x}", name, value);
3375  }
3376 private:
3377  DotPrinter* out_;
3378  bool first_;
3379};
3380
3381
3382void DotPrinter::PrintAttributes(RegExpNode* that) {
3383  stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3384                "margin=0.1, fontsize=10, label=\"{",
3385                that);
3386  AttributePrinter printer(this);
3387  NodeInfo* info = that->info();
3388  printer.PrintBit("NI", info->follows_newline_interest);
3389  printer.PrintBit("WI", info->follows_word_interest);
3390  printer.PrintBit("SI", info->follows_start_interest);
3391  Label* label = that->label();
3392  if (label->is_bound())
3393    printer.PrintPositive("@", label->pos());
3394  stream()->Add("}\"];\n");
3395  stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
3396                "arrowhead=none];\n", that, that);
3397}
3398
3399
3400static const bool kPrintDispatchTable = false;
3401void DotPrinter::VisitChoice(ChoiceNode* that) {
3402  if (kPrintDispatchTable) {
3403    stream()->Add("  n%p [shape=Mrecord, label=\"", that);
3404    TableEntryHeaderPrinter header_printer(stream());
3405    that->GetTable(ignore_case_)->ForEach(&header_printer);
3406    stream()->Add("\"]\n", that);
3407    PrintAttributes(that);
3408    TableEntryBodyPrinter body_printer(stream(), that);
3409    that->GetTable(ignore_case_)->ForEach(&body_printer);
3410  } else {
3411    stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
3412    for (int i = 0; i < that->alternatives()->length(); i++) {
3413      GuardedAlternative alt = that->alternatives()->at(i);
3414      stream()->Add("  n%p -> n%p;\n", that, alt.node());
3415    }
3416  }
3417  for (int i = 0; i < that->alternatives()->length(); i++) {
3418    GuardedAlternative alt = that->alternatives()->at(i);
3419    alt.node()->Accept(this);
3420  }
3421}
3422
3423
3424void DotPrinter::VisitText(TextNode* that) {
3425  stream()->Add("  n%p [label=\"", that);
3426  for (int i = 0; i < that->elements()->length(); i++) {
3427    if (i > 0) stream()->Add(" ");
3428    TextElement elm = that->elements()->at(i);
3429    switch (elm.type) {
3430      case TextElement::ATOM: {
3431        stream()->Add("'%w'", elm.data.u_atom->data());
3432        break;
3433      }
3434      case TextElement::CHAR_CLASS: {
3435        RegExpCharacterClass* node = elm.data.u_char_class;
3436        stream()->Add("[");
3437        if (node->is_negated())
3438          stream()->Add("^");
3439        for (int j = 0; j < node->ranges()->length(); j++) {
3440          CharacterRange range = node->ranges()->at(j);
3441          stream()->Add("%k-%k", range.from(), range.to());
3442        }
3443        stream()->Add("]");
3444        break;
3445      }
3446      default:
3447        UNREACHABLE();
3448    }
3449  }
3450  stream()->Add("\", shape=box, peripheries=2];\n");
3451  PrintAttributes(that);
3452  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3453  Visit(that->on_success());
3454}
3455
3456
3457void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3458  stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3459                that,
3460                that->start_register(),
3461                that->end_register());
3462  PrintAttributes(that);
3463  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3464  Visit(that->on_success());
3465}
3466
3467
3468void DotPrinter::VisitEnd(EndNode* that) {
3469  stream()->Add("  n%p [style=bold, shape=point];\n", that);
3470  PrintAttributes(that);
3471}
3472
3473
3474void DotPrinter::VisitAssertion(AssertionNode* that) {
3475  stream()->Add("  n%p [", that);
3476  switch (that->type()) {
3477    case AssertionNode::AT_END:
3478      stream()->Add("label=\"$\", shape=septagon");
3479      break;
3480    case AssertionNode::AT_START:
3481      stream()->Add("label=\"^\", shape=septagon");
3482      break;
3483    case AssertionNode::AT_BOUNDARY:
3484      stream()->Add("label=\"\\b\", shape=septagon");
3485      break;
3486    case AssertionNode::AT_NON_BOUNDARY:
3487      stream()->Add("label=\"\\B\", shape=septagon");
3488      break;
3489    case AssertionNode::AFTER_NEWLINE:
3490      stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3491      break;
3492    case AssertionNode::AFTER_WORD_CHARACTER:
3493      stream()->Add("label=\"(?<=\\w)\", shape=septagon");
3494      break;
3495    case AssertionNode::AFTER_NONWORD_CHARACTER:
3496      stream()->Add("label=\"(?<=\\W)\", shape=septagon");
3497      break;
3498  }
3499  stream()->Add("];\n");
3500  PrintAttributes(that);
3501  RegExpNode* successor = that->on_success();
3502  stream()->Add("  n%p -> n%p;\n", that, successor);
3503  Visit(successor);
3504}
3505
3506
3507void DotPrinter::VisitAction(ActionNode* that) {
3508  stream()->Add("  n%p [", that);
3509  switch (that->type_) {
3510    case ActionNode::SET_REGISTER:
3511      stream()->Add("label=\"$%i:=%i\", shape=octagon",
3512                    that->data_.u_store_register.reg,
3513                    that->data_.u_store_register.value);
3514      break;
3515    case ActionNode::INCREMENT_REGISTER:
3516      stream()->Add("label=\"$%i++\", shape=octagon",
3517                    that->data_.u_increment_register.reg);
3518      break;
3519    case ActionNode::STORE_POSITION:
3520      stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3521                    that->data_.u_position_register.reg);
3522      break;
3523    case ActionNode::BEGIN_SUBMATCH:
3524      stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3525                    that->data_.u_submatch.current_position_register);
3526      break;
3527    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3528      stream()->Add("label=\"escape\", shape=septagon");
3529      break;
3530    case ActionNode::EMPTY_MATCH_CHECK:
3531      stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3532                    that->data_.u_empty_match_check.start_register,
3533                    that->data_.u_empty_match_check.repetition_register,
3534                    that->data_.u_empty_match_check.repetition_limit);
3535      break;
3536    case ActionNode::CLEAR_CAPTURES: {
3537      stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3538                    that->data_.u_clear_captures.range_from,
3539                    that->data_.u_clear_captures.range_to);
3540      break;
3541    }
3542  }
3543  stream()->Add("];\n");
3544  PrintAttributes(that);
3545  RegExpNode* successor = that->on_success();
3546  stream()->Add("  n%p -> n%p;\n", that, successor);
3547  Visit(successor);
3548}
3549
3550
3551class DispatchTableDumper {
3552 public:
3553  explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3554  void Call(uc16 key, DispatchTable::Entry entry);
3555  StringStream* stream() { return stream_; }
3556 private:
3557  StringStream* stream_;
3558};
3559
3560
3561void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3562  stream()->Add("[%k-%k]: {", key, entry.to());
3563  OutSet* set = entry.out_set();
3564  bool first = true;
3565  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3566    if (set->Get(i)) {
3567      if (first) {
3568        first = false;
3569      } else {
3570        stream()->Add(", ");
3571      }
3572      stream()->Add("%i", i);
3573    }
3574  }
3575  stream()->Add("}\n");
3576}
3577
3578
3579void DispatchTable::Dump() {
3580  HeapStringAllocator alloc;
3581  StringStream stream(&alloc);
3582  DispatchTableDumper dumper(&stream);
3583  tree()->ForEach(&dumper);
3584  OS::PrintError("%s", *stream.ToCString());
3585}
3586
3587
3588void RegExpEngine::DotPrint(const char* label,
3589                            RegExpNode* node,
3590                            bool ignore_case) {
3591  DotPrinter printer(ignore_case);
3592  printer.PrintNode(label, node);
3593}
3594
3595
3596#endif  // DEBUG
3597
3598
3599// -------------------------------------------------------------------
3600// Tree to graph conversion
3601
3602static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0,
3603    0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
3604    0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF };
3605static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3606
3607static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
3608static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3609
3610static const uc16 kDigitRanges[] = { '0', '9' };
3611static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3612
3613static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D,
3614    0x2028, 0x2029 };
3615static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3616
3617RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3618                               RegExpNode* on_success) {
3619  ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3620  elms->Add(TextElement::Atom(this));
3621  return new TextNode(elms, on_success);
3622}
3623
3624
3625RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3626                               RegExpNode* on_success) {
3627  return new TextNode(elements(), on_success);
3628}
3629
3630static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3631                                 const uc16* special_class,
3632                                 int length) {
3633  ASSERT(ranges->length() != 0);
3634  ASSERT(length != 0);
3635  ASSERT(special_class[0] != 0);
3636  if (ranges->length() != (length >> 1) + 1) {
3637    return false;
3638  }
3639  CharacterRange range = ranges->at(0);
3640  if (range.from() != 0) {
3641    return false;
3642  }
3643  for (int i = 0; i < length; i += 2) {
3644    if (special_class[i] != (range.to() + 1)) {
3645      return false;
3646    }
3647    range = ranges->at((i >> 1) + 1);
3648    if (special_class[i+1] != range.from() - 1) {
3649      return false;
3650    }
3651  }
3652  if (range.to() != 0xffff) {
3653    return false;
3654  }
3655  return true;
3656}
3657
3658
3659static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3660                          const uc16* special_class,
3661                          int length) {
3662  if (ranges->length() * 2 != length) {
3663    return false;
3664  }
3665  for (int i = 0; i < length; i += 2) {
3666    CharacterRange range = ranges->at(i >> 1);
3667    if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3668      return false;
3669    }
3670  }
3671  return true;
3672}
3673
3674
3675bool RegExpCharacterClass::is_standard() {
3676  // TODO(lrn): Remove need for this function, by not throwing away information
3677  // along the way.
3678  if (is_negated_) {
3679    return false;
3680  }
3681  if (set_.is_standard()) {
3682    return true;
3683  }
3684  if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3685    set_.set_standard_set_type('s');
3686    return true;
3687  }
3688  if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3689    set_.set_standard_set_type('S');
3690    return true;
3691  }
3692  if (CompareInverseRanges(set_.ranges(),
3693                           kLineTerminatorRanges,
3694                           kLineTerminatorRangeCount)) {
3695    set_.set_standard_set_type('.');
3696    return true;
3697  }
3698  if (CompareRanges(set_.ranges(),
3699                    kLineTerminatorRanges,
3700                    kLineTerminatorRangeCount)) {
3701    set_.set_standard_set_type('n');
3702    return true;
3703  }
3704  if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3705    set_.set_standard_set_type('w');
3706    return true;
3707  }
3708  if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3709    set_.set_standard_set_type('W');
3710    return true;
3711  }
3712  return false;
3713}
3714
3715
3716RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3717                                         RegExpNode* on_success) {
3718  return new TextNode(this, on_success);
3719}
3720
3721
3722RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3723                                      RegExpNode* on_success) {
3724  ZoneList<RegExpTree*>* alternatives = this->alternatives();
3725  int length = alternatives->length();
3726  ChoiceNode* result = new ChoiceNode(length);
3727  for (int i = 0; i < length; i++) {
3728    GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
3729                                                               on_success));
3730    result->AddAlternative(alternative);
3731  }
3732  return result;
3733}
3734
3735
3736RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
3737                                     RegExpNode* on_success) {
3738  return ToNode(min(),
3739                max(),
3740                is_greedy(),
3741                body(),
3742                compiler,
3743                on_success);
3744}
3745
3746
3747// Scoped object to keep track of how much we unroll quantifier loops in the
3748// regexp graph generator.
3749class RegExpExpansionLimiter {
3750 public:
3751  static const int kMaxExpansionFactor = 6;
3752  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
3753      : compiler_(compiler),
3754        saved_expansion_factor_(compiler->current_expansion_factor()),
3755        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
3756    ASSERT(factor > 0);
3757    if (ok_to_expand_) {
3758      if (factor > kMaxExpansionFactor) {
3759        // Avoid integer overflow of the current expansion factor.
3760        ok_to_expand_ = false;
3761        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
3762      } else {
3763        int new_factor = saved_expansion_factor_ * factor;
3764        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
3765        compiler->set_current_expansion_factor(new_factor);
3766      }
3767    }
3768  }
3769
3770  ~RegExpExpansionLimiter() {
3771    compiler_->set_current_expansion_factor(saved_expansion_factor_);
3772  }
3773
3774  bool ok_to_expand() { return ok_to_expand_; }
3775
3776 private:
3777  RegExpCompiler* compiler_;
3778  int saved_expansion_factor_;
3779  bool ok_to_expand_;
3780
3781  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
3782};
3783
3784
3785RegExpNode* RegExpQuantifier::ToNode(int min,
3786                                     int max,
3787                                     bool is_greedy,
3788                                     RegExpTree* body,
3789                                     RegExpCompiler* compiler,
3790                                     RegExpNode* on_success,
3791                                     bool not_at_start) {
3792  // x{f, t} becomes this:
3793  //
3794  //             (r++)<-.
3795  //               |     `
3796  //               |     (x)
3797  //               v     ^
3798  //      (r=0)-->(?)---/ [if r < t]
3799  //               |
3800  //   [if r >= f] \----> ...
3801  //
3802
3803  // 15.10.2.5 RepeatMatcher algorithm.
3804  // The parser has already eliminated the case where max is 0.  In the case
3805  // where max_match is zero the parser has removed the quantifier if min was
3806  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
3807
3808  // If we know that we cannot match zero length then things are a little
3809  // simpler since we don't need to make the special zero length match check
3810  // from step 2.1.  If the min and max are small we can unroll a little in
3811  // this case.
3812  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
3813  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
3814  if (max == 0) return on_success;  // This can happen due to recursion.
3815  bool body_can_be_empty = (body->min_match() == 0);
3816  int body_start_reg = RegExpCompiler::kNoRegister;
3817  Interval capture_registers = body->CaptureRegisters();
3818  bool needs_capture_clearing = !capture_registers.is_empty();
3819  if (body_can_be_empty) {
3820    body_start_reg = compiler->AllocateRegister();
3821  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
3822    // Only unroll if there are no captures and the body can't be
3823    // empty.
3824    {
3825      RegExpExpansionLimiter limiter(
3826          compiler, min + ((max != min) ? 1 : 0));
3827      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
3828        int new_max = (max == kInfinity) ? max : max - min;
3829        // Recurse once to get the loop or optional matches after the fixed
3830        // ones.
3831        RegExpNode* answer = ToNode(
3832            0, new_max, is_greedy, body, compiler, on_success, true);
3833        // Unroll the forced matches from 0 to min.  This can cause chains of
3834        // TextNodes (which the parser does not generate).  These should be
3835        // combined if it turns out they hinder good code generation.
3836        for (int i = 0; i < min; i++) {
3837          answer = body->ToNode(compiler, answer);
3838        }
3839        return answer;
3840      }
3841    }
3842    if (max <= kMaxUnrolledMaxMatches && min == 0) {
3843      ASSERT(max > 0);  // Due to the 'if' above.
3844      RegExpExpansionLimiter limiter(compiler, max);
3845      if (limiter.ok_to_expand()) {
3846        // Unroll the optional matches up to max.
3847        RegExpNode* answer = on_success;
3848        for (int i = 0; i < max; i++) {
3849          ChoiceNode* alternation = new ChoiceNode(2);
3850          if (is_greedy) {
3851            alternation->AddAlternative(
3852                GuardedAlternative(body->ToNode(compiler, answer)));
3853            alternation->AddAlternative(GuardedAlternative(on_success));
3854          } else {
3855            alternation->AddAlternative(GuardedAlternative(on_success));
3856            alternation->AddAlternative(
3857                GuardedAlternative(body->ToNode(compiler, answer)));
3858          }
3859          answer = alternation;
3860          if (not_at_start) alternation->set_not_at_start();
3861        }
3862        return answer;
3863      }
3864    }
3865  }
3866  bool has_min = min > 0;
3867  bool has_max = max < RegExpTree::kInfinity;
3868  bool needs_counter = has_min || has_max;
3869  int reg_ctr = needs_counter
3870      ? compiler->AllocateRegister()
3871      : RegExpCompiler::kNoRegister;
3872  LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
3873  if (not_at_start) center->set_not_at_start();
3874  RegExpNode* loop_return = needs_counter
3875      ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3876      : static_cast<RegExpNode*>(center);
3877  if (body_can_be_empty) {
3878    // If the body can be empty we need to check if it was and then
3879    // backtrack.
3880    loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3881                                              reg_ctr,
3882                                              min,
3883                                              loop_return);
3884  }
3885  RegExpNode* body_node = body->ToNode(compiler, loop_return);
3886  if (body_can_be_empty) {
3887    // If the body can be empty we need to store the start position
3888    // so we can bail out if it was empty.
3889    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
3890  }
3891  if (needs_capture_clearing) {
3892    // Before entering the body of this loop we need to clear captures.
3893    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3894  }
3895  GuardedAlternative body_alt(body_node);
3896  if (has_max) {
3897    Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3898    body_alt.AddGuard(body_guard);
3899  }
3900  GuardedAlternative rest_alt(on_success);
3901  if (has_min) {
3902    Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3903    rest_alt.AddGuard(rest_guard);
3904  }
3905  if (is_greedy) {
3906    center->AddLoopAlternative(body_alt);
3907    center->AddContinueAlternative(rest_alt);
3908  } else {
3909    center->AddContinueAlternative(rest_alt);
3910    center->AddLoopAlternative(body_alt);
3911  }
3912  if (needs_counter) {
3913    return ActionNode::SetRegister(reg_ctr, 0, center);
3914  } else {
3915    return center;
3916  }
3917}
3918
3919
3920RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
3921                                    RegExpNode* on_success) {
3922  NodeInfo info;
3923  switch (type()) {
3924    case START_OF_LINE:
3925      return AssertionNode::AfterNewline(on_success);
3926    case START_OF_INPUT:
3927      return AssertionNode::AtStart(on_success);
3928    case BOUNDARY:
3929      return AssertionNode::AtBoundary(on_success);
3930    case NON_BOUNDARY:
3931      return AssertionNode::AtNonBoundary(on_success);
3932    case END_OF_INPUT:
3933      return AssertionNode::AtEnd(on_success);
3934    case END_OF_LINE: {
3935      // Compile $ in multiline regexps as an alternation with a positive
3936      // lookahead in one side and an end-of-input on the other side.
3937      // We need two registers for the lookahead.
3938      int stack_pointer_register = compiler->AllocateRegister();
3939      int position_register = compiler->AllocateRegister();
3940      // The ChoiceNode to distinguish between a newline and end-of-input.
3941      ChoiceNode* result = new ChoiceNode(2);
3942      // Create a newline atom.
3943      ZoneList<CharacterRange>* newline_ranges =
3944          new ZoneList<CharacterRange>(3);
3945      CharacterRange::AddClassEscape('n', newline_ranges);
3946      RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3947      TextNode* newline_matcher = new TextNode(
3948         newline_atom,
3949         ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3950                                             position_register,
3951                                             0,  // No captures inside.
3952                                             -1,  // Ignored if no captures.
3953                                             on_success));
3954      // Create an end-of-input matcher.
3955      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3956          stack_pointer_register,
3957          position_register,
3958          newline_matcher);
3959      // Add the two alternatives to the ChoiceNode.
3960      GuardedAlternative eol_alternative(end_of_line);
3961      result->AddAlternative(eol_alternative);
3962      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3963      result->AddAlternative(end_alternative);
3964      return result;
3965    }
3966    default:
3967      UNREACHABLE();
3968  }
3969  return on_success;
3970}
3971
3972
3973RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
3974                                        RegExpNode* on_success) {
3975  return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3976                               RegExpCapture::EndRegister(index()),
3977                               on_success);
3978}
3979
3980
3981RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
3982                                RegExpNode* on_success) {
3983  return on_success;
3984}
3985
3986
3987RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
3988                                    RegExpNode* on_success) {
3989  int stack_pointer_register = compiler->AllocateRegister();
3990  int position_register = compiler->AllocateRegister();
3991
3992  const int registers_per_capture = 2;
3993  const int register_of_first_capture = 2;
3994  int register_count = capture_count_ * registers_per_capture;
3995  int register_start =
3996    register_of_first_capture + capture_from_ * registers_per_capture;
3997
3998  RegExpNode* success;
3999  if (is_positive()) {
4000    RegExpNode* node = ActionNode::BeginSubmatch(
4001        stack_pointer_register,
4002        position_register,
4003        body()->ToNode(
4004            compiler,
4005            ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
4006                                                position_register,
4007                                                register_count,
4008                                                register_start,
4009                                                on_success)));
4010    return node;
4011  } else {
4012    // We use a ChoiceNode for a negative lookahead because it has most of
4013    // the characteristics we need.  It has the body of the lookahead as its
4014    // first alternative and the expression after the lookahead of the second
4015    // alternative.  If the first alternative succeeds then the
4016    // NegativeSubmatchSuccess will unwind the stack including everything the
4017    // choice node set up and backtrack.  If the first alternative fails then
4018    // the second alternative is tried, which is exactly the desired result
4019    // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
4020    // ChoiceNode that knows to ignore the first exit when calculating quick
4021    // checks.
4022    GuardedAlternative body_alt(
4023        body()->ToNode(
4024            compiler,
4025            success = new NegativeSubmatchSuccess(stack_pointer_register,
4026                                                  position_register,
4027                                                  register_count,
4028                                                  register_start)));
4029    ChoiceNode* choice_node =
4030        new NegativeLookaheadChoiceNode(body_alt,
4031                                        GuardedAlternative(on_success));
4032    return ActionNode::BeginSubmatch(stack_pointer_register,
4033                                     position_register,
4034                                     choice_node);
4035  }
4036}
4037
4038
4039RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
4040                                  RegExpNode* on_success) {
4041  return ToNode(body(), index(), compiler, on_success);
4042}
4043
4044
4045RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
4046                                  int index,
4047                                  RegExpCompiler* compiler,
4048                                  RegExpNode* on_success) {
4049  int start_reg = RegExpCapture::StartRegister(index);
4050  int end_reg = RegExpCapture::EndRegister(index);
4051  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4052  RegExpNode* body_node = body->ToNode(compiler, store_end);
4053  return ActionNode::StorePosition(start_reg, true, body_node);
4054}
4055
4056
4057RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
4058                                      RegExpNode* on_success) {
4059  ZoneList<RegExpTree*>* children = nodes();
4060  RegExpNode* current = on_success;
4061  for (int i = children->length() - 1; i >= 0; i--) {
4062    current = children->at(i)->ToNode(compiler, current);
4063  }
4064  return current;
4065}
4066
4067
4068static void AddClass(const uc16* elmv,
4069                     int elmc,
4070                     ZoneList<CharacterRange>* ranges) {
4071  for (int i = 0; i < elmc; i += 2) {
4072    ASSERT(elmv[i] <= elmv[i + 1]);
4073    ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4074  }
4075}
4076
4077
4078static void AddClassNegated(const uc16 *elmv,
4079                            int elmc,
4080                            ZoneList<CharacterRange>* ranges) {
4081  ASSERT(elmv[0] != 0x0000);
4082  ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
4083  uc16 last = 0x0000;
4084  for (int i = 0; i < elmc; i += 2) {
4085    ASSERT(last <= elmv[i] - 1);
4086    ASSERT(elmv[i] <= elmv[i + 1]);
4087    ranges->Add(CharacterRange(last, elmv[i] - 1));
4088    last = elmv[i + 1] + 1;
4089  }
4090  ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit));
4091}
4092
4093
4094void CharacterRange::AddClassEscape(uc16 type,
4095                                    ZoneList<CharacterRange>* ranges) {
4096  switch (type) {
4097    case 's':
4098      AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4099      break;
4100    case 'S':
4101      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4102      break;
4103    case 'w':
4104      AddClass(kWordRanges, kWordRangeCount, ranges);
4105      break;
4106    case 'W':
4107      AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4108      break;
4109    case 'd':
4110      AddClass(kDigitRanges, kDigitRangeCount, ranges);
4111      break;
4112    case 'D':
4113      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4114      break;
4115    case '.':
4116      AddClassNegated(kLineTerminatorRanges,
4117                      kLineTerminatorRangeCount,
4118                      ranges);
4119      break;
4120    // This is not a character range as defined by the spec but a
4121    // convenient shorthand for a character class that matches any
4122    // character.
4123    case '*':
4124      ranges->Add(CharacterRange::Everything());
4125      break;
4126    // This is the set of characters matched by the $ and ^ symbols
4127    // in multiline mode.
4128    case 'n':
4129      AddClass(kLineTerminatorRanges,
4130               kLineTerminatorRangeCount,
4131               ranges);
4132      break;
4133    default:
4134      UNREACHABLE();
4135  }
4136}
4137
4138
4139Vector<const uc16> CharacterRange::GetWordBounds() {
4140  return Vector<const uc16>(kWordRanges, kWordRangeCount);
4141}
4142
4143
4144class CharacterRangeSplitter {
4145 public:
4146  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4147                          ZoneList<CharacterRange>** excluded)
4148      : included_(included),
4149        excluded_(excluded) { }
4150  void Call(uc16 from, DispatchTable::Entry entry);
4151
4152  static const int kInBase = 0;
4153  static const int kInOverlay = 1;
4154
4155 private:
4156  ZoneList<CharacterRange>** included_;
4157  ZoneList<CharacterRange>** excluded_;
4158};
4159
4160
4161void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4162  if (!entry.out_set()->Get(kInBase)) return;
4163  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4164    ? included_
4165    : excluded_;
4166  if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4167  (*target)->Add(CharacterRange(entry.from(), entry.to()));
4168}
4169
4170
4171void CharacterRange::Split(ZoneList<CharacterRange>* base,
4172                           Vector<const uc16> overlay,
4173                           ZoneList<CharacterRange>** included,
4174                           ZoneList<CharacterRange>** excluded) {
4175  ASSERT_EQ(NULL, *included);
4176  ASSERT_EQ(NULL, *excluded);
4177  DispatchTable table;
4178  for (int i = 0; i < base->length(); i++)
4179    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4180  for (int i = 0; i < overlay.length(); i += 2) {
4181    table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4182                   CharacterRangeSplitter::kInOverlay);
4183  }
4184  CharacterRangeSplitter callback(included, excluded);
4185  table.ForEach(&callback);
4186}
4187
4188
4189void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
4190                                        bool is_ascii) {
4191  Isolate* isolate = Isolate::Current();
4192  uc16 bottom = from();
4193  uc16 top = to();
4194  if (is_ascii) {
4195    if (bottom > String::kMaxAsciiCharCode) return;
4196    if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
4197  }
4198  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4199  if (top == bottom) {
4200    // If this is a singleton we just expand the one character.
4201    int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
4202    for (int i = 0; i < length; i++) {
4203      uc32 chr = chars[i];
4204      if (chr != bottom) {
4205        ranges->Add(CharacterRange::Singleton(chars[i]));
4206      }
4207    }
4208  } else {
4209    // If this is a range we expand the characters block by block,
4210    // expanding contiguous subranges (blocks) one at a time.
4211    // The approach is as follows.  For a given start character we
4212    // look up the remainder of the block that contains it (represented
4213    // by the end point), for instance we find 'z' if the character
4214    // is 'c'.  A block is characterized by the property
4215    // that all characters uncanonicalize in the same way, except that
4216    // each entry in the result is incremented by the distance from the first
4217    // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
4218    // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4219    // Once we've found the end point we look up its uncanonicalization
4220    // and produce a range for each element.  For instance for [c-f]
4221    // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
4222    // add a range if it is not already contained in the input, so [c-f]
4223    // will be skipped but [C-F] will be added.  If this range is not
4224    // completely contained in a block we do this for all the blocks
4225    // covered by the range (handling characters that is not in a block
4226    // as a "singleton block").
4227    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4228    int pos = bottom;
4229    while (pos < top) {
4230      int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
4231      uc16 block_end;
4232      if (length == 0) {
4233        block_end = pos;
4234      } else {
4235        ASSERT_EQ(1, length);
4236        block_end = range[0];
4237      }
4238      int end = (block_end > top) ? top : block_end;
4239      length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
4240      for (int i = 0; i < length; i++) {
4241        uc32 c = range[i];
4242        uc16 range_from = c - (block_end - pos);
4243        uc16 range_to = c - (block_end - end);
4244        if (!(bottom <= range_from && range_to <= top)) {
4245          ranges->Add(CharacterRange(range_from, range_to));
4246        }
4247      }
4248      pos = end + 1;
4249    }
4250  }
4251}
4252
4253
4254bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
4255  ASSERT_NOT_NULL(ranges);
4256  int n = ranges->length();
4257  if (n <= 1) return true;
4258  int max = ranges->at(0).to();
4259  for (int i = 1; i < n; i++) {
4260    CharacterRange next_range = ranges->at(i);
4261    if (next_range.from() <= max + 1) return false;
4262    max = next_range.to();
4263  }
4264  return true;
4265}
4266
4267SetRelation CharacterRange::WordCharacterRelation(
4268    ZoneList<CharacterRange>* range) {
4269  ASSERT(IsCanonical(range));
4270  int i = 0;  // Word character range index.
4271  int j = 0;  // Argument range index.
4272  ASSERT_NE(0, kWordRangeCount);
4273  SetRelation result;
4274  if (range->length() == 0) {
4275    result.SetElementsInSecondSet();
4276    return result;
4277  }
4278  CharacterRange argument_range = range->at(0);
4279  CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4280  while (i < kWordRangeCount && j < range->length()) {
4281    // Check the two ranges for the five cases:
4282    // - no overlap.
4283    // - partial overlap (there are elements in both ranges that isn't
4284    //   in the other, and there are also elements that are in both).
4285    // - argument range entirely inside word range.
4286    // - word range entirely inside argument range.
4287    // - ranges are completely equal.
4288
4289    // First check for no overlap. The earlier range is not in the other set.
4290    if (argument_range.from() > word_range.to()) {
4291      // Ranges are disjoint. The earlier word range contains elements that
4292      // cannot be in the argument set.
4293      result.SetElementsInSecondSet();
4294    } else if (word_range.from() > argument_range.to()) {
4295      // Ranges are disjoint. The earlier argument range contains elements that
4296      // cannot be in the word set.
4297      result.SetElementsInFirstSet();
4298    } else if (word_range.from() <= argument_range.from() &&
4299               word_range.to() >= argument_range.from()) {
4300      result.SetElementsInBothSets();
4301      // argument range completely inside word range.
4302      if (word_range.from() < argument_range.from() ||
4303          word_range.to() > argument_range.from()) {
4304        result.SetElementsInSecondSet();
4305      }
4306    } else if (word_range.from() >= argument_range.from() &&
4307               word_range.to() <= argument_range.from()) {
4308      result.SetElementsInBothSets();
4309      result.SetElementsInFirstSet();
4310    } else {
4311      // There is overlap, and neither is a subrange of the other
4312      result.SetElementsInFirstSet();
4313      result.SetElementsInSecondSet();
4314      result.SetElementsInBothSets();
4315    }
4316    if (result.NonTrivialIntersection()) {
4317      // The result is as (im)precise as we can possibly make it.
4318      return result;
4319    }
4320    // Progress the range(s) with minimal to-character.
4321    uc16 word_to = word_range.to();
4322    uc16 argument_to = argument_range.to();
4323    if (argument_to <= word_to) {
4324      j++;
4325      if (j < range->length()) {
4326        argument_range = range->at(j);
4327      }
4328    }
4329    if (word_to <= argument_to) {
4330      i += 2;
4331      if (i < kWordRangeCount) {
4332        word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
4333      }
4334    }
4335  }
4336  // Check if anything wasn't compared in the loop.
4337  if (i < kWordRangeCount) {
4338    // word range contains something not in argument range.
4339    result.SetElementsInSecondSet();
4340  } else if (j < range->length()) {
4341    // Argument range contains something not in word range.
4342    result.SetElementsInFirstSet();
4343  }
4344
4345  return result;
4346}
4347
4348
4349ZoneList<CharacterRange>* CharacterSet::ranges() {
4350  if (ranges_ == NULL) {
4351    ranges_ = new ZoneList<CharacterRange>(2);
4352    CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4353  }
4354  return ranges_;
4355}
4356
4357
4358// Move a number of elements in a zonelist to another position
4359// in the same list. Handles overlapping source and target areas.
4360static void MoveRanges(ZoneList<CharacterRange>* list,
4361                       int from,
4362                       int to,
4363                       int count) {
4364  // Ranges are potentially overlapping.
4365  if (from < to) {
4366    for (int i = count - 1; i >= 0; i--) {
4367      list->at(to + i) = list->at(from + i);
4368    }
4369  } else {
4370    for (int i = 0; i < count; i++) {
4371      list->at(to + i) = list->at(from + i);
4372    }
4373  }
4374}
4375
4376
4377static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
4378                                      int count,
4379                                      CharacterRange insert) {
4380  // Inserts a range into list[0..count[, which must be sorted
4381  // by from value and non-overlapping and non-adjacent, using at most
4382  // list[0..count] for the result. Returns the number of resulting
4383  // canonicalized ranges. Inserting a range may collapse existing ranges into
4384  // fewer ranges, so the return value can be anything in the range 1..count+1.
4385  uc16 from = insert.from();
4386  uc16 to = insert.to();
4387  int start_pos = 0;
4388  int end_pos = count;
4389  for (int i = count - 1; i >= 0; i--) {
4390    CharacterRange current = list->at(i);
4391    if (current.from() > to + 1) {
4392      end_pos = i;
4393    } else if (current.to() + 1 < from) {
4394      start_pos = i + 1;
4395      break;
4396    }
4397  }
4398
4399  // Inserted range overlaps, or is adjacent to, ranges at positions
4400  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4401  // not affected by the insertion.
4402  // If start_pos == end_pos, the range must be inserted before start_pos.
4403  // if start_pos < end_pos, the entire range from start_pos to end_pos
4404  // must be merged with the insert range.
4405
4406  if (start_pos == end_pos) {
4407    // Insert between existing ranges at position start_pos.
4408    if (start_pos < count) {
4409      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4410    }
4411    list->at(start_pos) = insert;
4412    return count + 1;
4413  }
4414  if (start_pos + 1 == end_pos) {
4415    // Replace single existing range at position start_pos.
4416    CharacterRange to_replace = list->at(start_pos);
4417    int new_from = Min(to_replace.from(), from);
4418    int new_to = Max(to_replace.to(), to);
4419    list->at(start_pos) = CharacterRange(new_from, new_to);
4420    return count;
4421  }
4422  // Replace a number of existing ranges from start_pos to end_pos - 1.
4423  // Move the remaining ranges down.
4424
4425  int new_from = Min(list->at(start_pos).from(), from);
4426  int new_to = Max(list->at(end_pos - 1).to(), to);
4427  if (end_pos < count) {
4428    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4429  }
4430  list->at(start_pos) = CharacterRange(new_from, new_to);
4431  return count - (end_pos - start_pos) + 1;
4432}
4433
4434
4435void CharacterSet::Canonicalize() {
4436  // Special/default classes are always considered canonical. The result
4437  // of calling ranges() will be sorted.
4438  if (ranges_ == NULL) return;
4439  CharacterRange::Canonicalize(ranges_);
4440}
4441
4442
4443void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
4444  if (character_ranges->length() <= 1) return;
4445  // Check whether ranges are already canonical (increasing, non-overlapping,
4446  // non-adjacent).
4447  int n = character_ranges->length();
4448  int max = character_ranges->at(0).to();
4449  int i = 1;
4450  while (i < n) {
4451    CharacterRange current = character_ranges->at(i);
4452    if (current.from() <= max + 1) {
4453      break;
4454    }
4455    max = current.to();
4456    i++;
4457  }
4458  // Canonical until the i'th range. If that's all of them, we are done.
4459  if (i == n) return;
4460
4461  // The ranges at index i and forward are not canonicalized. Make them so by
4462  // doing the equivalent of insertion sort (inserting each into the previous
4463  // list, in order).
4464  // Notice that inserting a range can reduce the number of ranges in the
4465  // result due to combining of adjacent and overlapping ranges.
4466  int read = i;  // Range to insert.
4467  int num_canonical = i;  // Length of canonicalized part of list.
4468  do {
4469    num_canonical = InsertRangeInCanonicalList(character_ranges,
4470                                               num_canonical,
4471                                               character_ranges->at(read));
4472    read++;
4473  } while (read < n);
4474  character_ranges->Rewind(num_canonical);
4475
4476  ASSERT(CharacterRange::IsCanonical(character_ranges));
4477}
4478
4479
4480// Utility function for CharacterRange::Merge. Adds a range at the end of
4481// a canonicalized range list, if necessary merging the range with the last
4482// range of the list.
4483static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
4484  if (set == NULL) return;
4485  ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
4486  int n = set->length();
4487  if (n > 0) {
4488    CharacterRange lastRange = set->at(n - 1);
4489    if (lastRange.to() == range.from() - 1) {
4490      set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
4491      return;
4492    }
4493  }
4494  set->Add(range);
4495}
4496
4497
4498static void AddRangeToSelectedSet(int selector,
4499                                  ZoneList<CharacterRange>* first_set,
4500                                  ZoneList<CharacterRange>* second_set,
4501                                  ZoneList<CharacterRange>* intersection_set,
4502                                  CharacterRange range) {
4503  switch (selector) {
4504    case kInsideFirst:
4505      AddRangeToSet(first_set, range);
4506      break;
4507    case kInsideSecond:
4508      AddRangeToSet(second_set, range);
4509      break;
4510    case kInsideBoth:
4511      AddRangeToSet(intersection_set, range);
4512      break;
4513  }
4514}
4515
4516
4517
4518void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
4519                           ZoneList<CharacterRange>* second_set,
4520                           ZoneList<CharacterRange>* first_set_only_out,
4521                           ZoneList<CharacterRange>* second_set_only_out,
4522                           ZoneList<CharacterRange>* both_sets_out) {
4523  // Inputs are canonicalized.
4524  ASSERT(CharacterRange::IsCanonical(first_set));
4525  ASSERT(CharacterRange::IsCanonical(second_set));
4526  // Outputs are empty, if applicable.
4527  ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
4528  ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
4529  ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
4530
4531  // Merge sets by iterating through the lists in order of lowest "from" value,
4532  // and putting intervals into one of three sets.
4533
4534  if (first_set->length() == 0) {
4535    second_set_only_out->AddAll(*second_set);
4536    return;
4537  }
4538  if (second_set->length() == 0) {
4539    first_set_only_out->AddAll(*first_set);
4540    return;
4541  }
4542  // Indices into input lists.
4543  int i1 = 0;
4544  int i2 = 0;
4545  // Cache length of input lists.
4546  int n1 = first_set->length();
4547  int n2 = second_set->length();
4548  // Current range. May be invalid if state is kInsideNone.
4549  int from = 0;
4550  int to = -1;
4551  // Where current range comes from.
4552  int state = kInsideNone;
4553
4554  while (i1 < n1 || i2 < n2) {
4555    CharacterRange next_range;
4556    int range_source;
4557    if (i2 == n2 ||
4558        (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
4559      // Next smallest element is in first set.
4560      next_range = first_set->at(i1++);
4561      range_source = kInsideFirst;
4562    } else {
4563      // Next smallest element is in second set.
4564      next_range = second_set->at(i2++);
4565      range_source = kInsideSecond;
4566    }
4567    if (to < next_range.from()) {
4568      // Ranges disjoint: |current|  |next|
4569      AddRangeToSelectedSet(state,
4570                            first_set_only_out,
4571                            second_set_only_out,
4572                            both_sets_out,
4573                            CharacterRange(from, to));
4574      from = next_range.from();
4575      to = next_range.to();
4576      state = range_source;
4577    } else {
4578      if (from < next_range.from()) {
4579        AddRangeToSelectedSet(state,
4580                              first_set_only_out,
4581                              second_set_only_out,
4582                              both_sets_out,
4583                              CharacterRange(from, next_range.from()-1));
4584      }
4585      if (to < next_range.to()) {
4586        // Ranges overlap:  |current|
4587        //                       |next|
4588        AddRangeToSelectedSet(state | range_source,
4589                              first_set_only_out,
4590                              second_set_only_out,
4591                              both_sets_out,
4592                              CharacterRange(next_range.from(), to));
4593        from = to + 1;
4594        to = next_range.to();
4595        state = range_source;
4596      } else {
4597        // Range included:    |current| , possibly ending at same character.
4598        //                      |next|
4599        AddRangeToSelectedSet(
4600            state | range_source,
4601            first_set_only_out,
4602            second_set_only_out,
4603            both_sets_out,
4604            CharacterRange(next_range.from(), next_range.to()));
4605        from = next_range.to() + 1;
4606        // If ranges end at same character, both ranges are consumed completely.
4607        if (next_range.to() == to) state = kInsideNone;
4608      }
4609    }
4610  }
4611  AddRangeToSelectedSet(state,
4612                        first_set_only_out,
4613                        second_set_only_out,
4614                        both_sets_out,
4615                        CharacterRange(from, to));
4616}
4617
4618
4619void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
4620                            ZoneList<CharacterRange>* negated_ranges) {
4621  ASSERT(CharacterRange::IsCanonical(ranges));
4622  ASSERT_EQ(0, negated_ranges->length());
4623  int range_count = ranges->length();
4624  uc16 from = 0;
4625  int i = 0;
4626  if (range_count > 0 && ranges->at(0).from() == 0) {
4627    from = ranges->at(0).to();
4628    i = 1;
4629  }
4630  while (i < range_count) {
4631    CharacterRange range = ranges->at(i);
4632    negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4633    from = range.to();
4634    i++;
4635  }
4636  if (from < String::kMaxUtf16CodeUnit) {
4637    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit));
4638  }
4639}
4640
4641
4642
4643// -------------------------------------------------------------------
4644// Interest propagation
4645
4646
4647RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4648  for (int i = 0; i < siblings_.length(); i++) {
4649    RegExpNode* sibling = siblings_.Get(i);
4650    if (sibling->info()->Matches(info))
4651      return sibling;
4652  }
4653  return NULL;
4654}
4655
4656
4657RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4658  ASSERT_EQ(false, *cloned);
4659  siblings_.Ensure(this);
4660  RegExpNode* result = TryGetSibling(info);
4661  if (result != NULL) return result;
4662  result = this->Clone();
4663  NodeInfo* new_info = result->info();
4664  new_info->ResetCompilationState();
4665  new_info->AddFromPreceding(info);
4666  AddSibling(result);
4667  *cloned = true;
4668  return result;
4669}
4670
4671
4672template <class C>
4673static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4674  NodeInfo full_info(*node->info());
4675  full_info.AddFromPreceding(info);
4676  bool cloned = false;
4677  return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4678}
4679
4680
4681// -------------------------------------------------------------------
4682// Splay tree
4683
4684
4685OutSet* OutSet::Extend(unsigned value) {
4686  if (Get(value))
4687    return this;
4688  if (successors() != NULL) {
4689    for (int i = 0; i < successors()->length(); i++) {
4690      OutSet* successor = successors()->at(i);
4691      if (successor->Get(value))
4692        return successor;
4693    }
4694  } else {
4695    successors_ = new ZoneList<OutSet*>(2);
4696  }
4697  OutSet* result = new OutSet(first_, remaining_);
4698  result->Set(value);
4699  successors()->Add(result);
4700  return result;
4701}
4702
4703
4704void OutSet::Set(unsigned value) {
4705  if (value < kFirstLimit) {
4706    first_ |= (1 << value);
4707  } else {
4708    if (remaining_ == NULL)
4709      remaining_ = new ZoneList<unsigned>(1);
4710    if (remaining_->is_empty() || !remaining_->Contains(value))
4711      remaining_->Add(value);
4712  }
4713}
4714
4715
4716bool OutSet::Get(unsigned value) {
4717  if (value < kFirstLimit) {
4718    return (first_ & (1 << value)) != 0;
4719  } else if (remaining_ == NULL) {
4720    return false;
4721  } else {
4722    return remaining_->Contains(value);
4723  }
4724}
4725
4726
4727const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4728
4729
4730void DispatchTable::AddRange(CharacterRange full_range, int value) {
4731  CharacterRange current = full_range;
4732  if (tree()->is_empty()) {
4733    // If this is the first range we just insert into the table.
4734    ZoneSplayTree<Config>::Locator loc;
4735    ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4736    loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4737    return;
4738  }
4739  // First see if there is a range to the left of this one that
4740  // overlaps.
4741  ZoneSplayTree<Config>::Locator loc;
4742  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4743    Entry* entry = &loc.value();
4744    // If we've found a range that overlaps with this one, and it
4745    // starts strictly to the left of this one, we have to fix it
4746    // because the following code only handles ranges that start on
4747    // or after the start point of the range we're adding.
4748    if (entry->from() < current.from() && entry->to() >= current.from()) {
4749      // Snap the overlapping range in half around the start point of
4750      // the range we're adding.
4751      CharacterRange left(entry->from(), current.from() - 1);
4752      CharacterRange right(current.from(), entry->to());
4753      // The left part of the overlapping range doesn't overlap.
4754      // Truncate the whole entry to be just the left part.
4755      entry->set_to(left.to());
4756      // The right part is the one that overlaps.  We add this part
4757      // to the map and let the next step deal with merging it with
4758      // the range we're adding.
4759      ZoneSplayTree<Config>::Locator loc;
4760      ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4761      loc.set_value(Entry(right.from(),
4762                          right.to(),
4763                          entry->out_set()));
4764    }
4765  }
4766  while (current.is_valid()) {
4767    if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4768        (loc.value().from() <= current.to()) &&
4769        (loc.value().to() >= current.from())) {
4770      Entry* entry = &loc.value();
4771      // We have overlap.  If there is space between the start point of
4772      // the range we're adding and where the overlapping range starts
4773      // then we have to add a range covering just that space.
4774      if (current.from() < entry->from()) {
4775        ZoneSplayTree<Config>::Locator ins;
4776        ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4777        ins.set_value(Entry(current.from(),
4778                            entry->from() - 1,
4779                            empty()->Extend(value)));
4780        current.set_from(entry->from());
4781      }
4782      ASSERT_EQ(current.from(), entry->from());
4783      // If the overlapping range extends beyond the one we want to add
4784      // we have to snap the right part off and add it separately.
4785      if (entry->to() > current.to()) {
4786        ZoneSplayTree<Config>::Locator ins;
4787        ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4788        ins.set_value(Entry(current.to() + 1,
4789                            entry->to(),
4790                            entry->out_set()));
4791        entry->set_to(current.to());
4792      }
4793      ASSERT(entry->to() <= current.to());
4794      // The overlapping range is now completely contained by the range
4795      // we're adding so we can just update it and move the start point
4796      // of the range we're adding just past it.
4797      entry->AddValue(value);
4798      // Bail out if the last interval ended at 0xFFFF since otherwise
4799      // adding 1 will wrap around to 0.
4800      if (entry->to() == String::kMaxUtf16CodeUnit)
4801        break;
4802      ASSERT(entry->to() + 1 > current.from());
4803      current.set_from(entry->to() + 1);
4804    } else {
4805      // There is no overlap so we can just add the range
4806      ZoneSplayTree<Config>::Locator ins;
4807      ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4808      ins.set_value(Entry(current.from(),
4809                          current.to(),
4810                          empty()->Extend(value)));
4811      break;
4812    }
4813  }
4814}
4815
4816
4817OutSet* DispatchTable::Get(uc16 value) {
4818  ZoneSplayTree<Config>::Locator loc;
4819  if (!tree()->FindGreatestLessThan(value, &loc))
4820    return empty();
4821  Entry* entry = &loc.value();
4822  if (value <= entry->to())
4823    return entry->out_set();
4824  else
4825    return empty();
4826}
4827
4828
4829// -------------------------------------------------------------------
4830// Analysis
4831
4832
4833void Analysis::EnsureAnalyzed(RegExpNode* that) {
4834  StackLimitCheck check(Isolate::Current());
4835  if (check.HasOverflowed()) {
4836    fail("Stack overflow");
4837    return;
4838  }
4839  if (that->info()->been_analyzed || that->info()->being_analyzed)
4840    return;
4841  that->info()->being_analyzed = true;
4842  that->Accept(this);
4843  that->info()->being_analyzed = false;
4844  that->info()->been_analyzed = true;
4845}
4846
4847
4848void Analysis::VisitEnd(EndNode* that) {
4849  // nothing to do
4850}
4851
4852
4853void TextNode::CalculateOffsets() {
4854  int element_count = elements()->length();
4855  // Set up the offsets of the elements relative to the start.  This is a fixed
4856  // quantity since a TextNode can only contain fixed-width things.
4857  int cp_offset = 0;
4858  for (int i = 0; i < element_count; i++) {
4859    TextElement& elm = elements()->at(i);
4860    elm.cp_offset = cp_offset;
4861    if (elm.type == TextElement::ATOM) {
4862      cp_offset += elm.data.u_atom->data().length();
4863    } else {
4864      cp_offset++;
4865    }
4866  }
4867}
4868
4869
4870void Analysis::VisitText(TextNode* that) {
4871  if (ignore_case_) {
4872    that->MakeCaseIndependent(is_ascii_);
4873  }
4874  EnsureAnalyzed(that->on_success());
4875  if (!has_failed()) {
4876    that->CalculateOffsets();
4877  }
4878}
4879
4880
4881void Analysis::VisitAction(ActionNode* that) {
4882  RegExpNode* target = that->on_success();
4883  EnsureAnalyzed(target);
4884  if (!has_failed()) {
4885    // If the next node is interested in what it follows then this node
4886    // has to be interested too so it can pass the information on.
4887    that->info()->AddFromFollowing(target->info());
4888  }
4889}
4890
4891
4892void Analysis::VisitChoice(ChoiceNode* that) {
4893  NodeInfo* info = that->info();
4894  for (int i = 0; i < that->alternatives()->length(); i++) {
4895    RegExpNode* node = that->alternatives()->at(i).node();
4896    EnsureAnalyzed(node);
4897    if (has_failed()) return;
4898    // Anything the following nodes need to know has to be known by
4899    // this node also, so it can pass it on.
4900    info->AddFromFollowing(node->info());
4901  }
4902}
4903
4904
4905void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4906  NodeInfo* info = that->info();
4907  for (int i = 0; i < that->alternatives()->length(); i++) {
4908    RegExpNode* node = that->alternatives()->at(i).node();
4909    if (node != that->loop_node()) {
4910      EnsureAnalyzed(node);
4911      if (has_failed()) return;
4912      info->AddFromFollowing(node->info());
4913    }
4914  }
4915  // Check the loop last since it may need the value of this node
4916  // to get a correct result.
4917  EnsureAnalyzed(that->loop_node());
4918  if (!has_failed()) {
4919    info->AddFromFollowing(that->loop_node()->info());
4920  }
4921}
4922
4923
4924void Analysis::VisitBackReference(BackReferenceNode* that) {
4925  EnsureAnalyzed(that->on_success());
4926}
4927
4928
4929void Analysis::VisitAssertion(AssertionNode* that) {
4930  EnsureAnalyzed(that->on_success());
4931  AssertionNode::AssertionNodeType type = that->type();
4932  if (type == AssertionNode::AT_BOUNDARY ||
4933      type == AssertionNode::AT_NON_BOUNDARY) {
4934    // Check if the following character is known to be a word character
4935    // or known to not be a word character.
4936    ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
4937
4938    CharacterRange::Canonicalize(following_chars);
4939
4940    SetRelation word_relation =
4941        CharacterRange::WordCharacterRelation(following_chars);
4942    if (word_relation.Disjoint()) {
4943      // Includes the case where following_chars is empty (e.g., end-of-input).
4944      // Following character is definitely *not* a word character.
4945      type = (type == AssertionNode::AT_BOUNDARY) ?
4946                 AssertionNode::AFTER_WORD_CHARACTER :
4947                 AssertionNode::AFTER_NONWORD_CHARACTER;
4948      that->set_type(type);
4949    } else if (word_relation.ContainedIn()) {
4950      // Following character is definitely a word character.
4951      type = (type == AssertionNode::AT_BOUNDARY) ?
4952                 AssertionNode::AFTER_NONWORD_CHARACTER :
4953                 AssertionNode::AFTER_WORD_CHARACTER;
4954      that->set_type(type);
4955    }
4956  }
4957}
4958
4959
4960ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
4961  if (first_character_set_ == NULL) {
4962    if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
4963      // If we can't find an exact solution within the budget, we
4964      // set the value to the set of every character, i.e., all characters
4965      // are possible.
4966      ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
4967      all_set->Add(CharacterRange::Everything());
4968      first_character_set_ = all_set;
4969    }
4970  }
4971  return first_character_set_;
4972}
4973
4974
4975int RegExpNode::ComputeFirstCharacterSet(int budget) {
4976  // Default behavior is to not be able to determine the first character.
4977  return kComputeFirstCharacterSetFail;
4978}
4979
4980
4981int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
4982  budget--;
4983  if (budget >= 0) {
4984    // Find loop min-iteration. It's the value of the guarded choice node
4985    // with a GEQ guard, if any.
4986    int min_repetition = 0;
4987
4988    for (int i = 0; i <= 1; i++) {
4989      GuardedAlternative alternative = alternatives()->at(i);
4990      ZoneList<Guard*>* guards = alternative.guards();
4991      if (guards != NULL && guards->length() > 0) {
4992        Guard* guard = guards->at(0);
4993        if (guard->op() == Guard::GEQ) {
4994          min_repetition = guard->value();
4995          break;
4996        }
4997      }
4998    }
4999
5000    budget = loop_node()->ComputeFirstCharacterSet(budget);
5001    if (budget >= 0) {
5002      ZoneList<CharacterRange>* character_set =
5003          loop_node()->first_character_set();
5004      if (body_can_be_zero_length() || min_repetition == 0) {
5005        budget = continue_node()->ComputeFirstCharacterSet(budget);
5006        if (budget < 0) return budget;
5007        ZoneList<CharacterRange>* body_set =
5008            continue_node()->first_character_set();
5009        ZoneList<CharacterRange>* union_set =
5010          new ZoneList<CharacterRange>(Max(character_set->length(),
5011                                           body_set->length()));
5012        CharacterRange::Merge(character_set,
5013                              body_set,
5014                              union_set,
5015                              union_set,
5016                              union_set);
5017        character_set = union_set;
5018      }
5019      set_first_character_set(character_set);
5020    }
5021  }
5022  return budget;
5023}
5024
5025
5026int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
5027  budget--;
5028  if (budget >= 0) {
5029    GuardedAlternative successor = this->alternatives()->at(1);
5030    RegExpNode* successor_node = successor.node();
5031    budget = successor_node->ComputeFirstCharacterSet(budget);
5032    if (budget >= 0) {
5033      set_first_character_set(successor_node->first_character_set());
5034    }
5035  }
5036  return budget;
5037}
5038
5039
5040// The first character set of an EndNode is unknowable. Just use the
5041// default implementation that fails and returns all characters as possible.
5042
5043
5044int AssertionNode::ComputeFirstCharacterSet(int budget) {
5045  budget -= 1;
5046  if (budget >= 0) {
5047    switch (type_) {
5048      case AT_END: {
5049        set_first_character_set(new ZoneList<CharacterRange>(0));
5050        break;
5051      }
5052      case AT_START:
5053      case AT_BOUNDARY:
5054      case AT_NON_BOUNDARY:
5055      case AFTER_NEWLINE:
5056      case AFTER_NONWORD_CHARACTER:
5057      case AFTER_WORD_CHARACTER: {
5058        ASSERT_NOT_NULL(on_success());
5059        budget = on_success()->ComputeFirstCharacterSet(budget);
5060        if (budget >= 0) {
5061          set_first_character_set(on_success()->first_character_set());
5062        }
5063        break;
5064      }
5065    }
5066  }
5067  return budget;
5068}
5069
5070
5071int ActionNode::ComputeFirstCharacterSet(int budget) {
5072  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
5073  budget--;
5074  if (budget >= 0) {
5075    ASSERT_NOT_NULL(on_success());
5076    budget = on_success()->ComputeFirstCharacterSet(budget);
5077    if (budget >= 0) {
5078      set_first_character_set(on_success()->first_character_set());
5079    }
5080  }
5081  return budget;
5082}
5083
5084
5085int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
5086  // We don't know anything about the first character of a backreference
5087  // at this point.
5088  // The potential first characters are the first characters of the capture,
5089  // and the first characters of the on_success node, depending on whether the
5090  // capture can be empty and whether it is known to be participating or known
5091  // not to be.
5092  return kComputeFirstCharacterSetFail;
5093}
5094
5095
5096int TextNode::ComputeFirstCharacterSet(int budget) {
5097  budget--;
5098  if (budget >= 0) {
5099    ASSERT_NE(0, elements()->length());
5100    TextElement text = elements()->at(0);
5101    if (text.type == TextElement::ATOM) {
5102      RegExpAtom* atom = text.data.u_atom;
5103      ASSERT_NE(0, atom->length());
5104      uc16 first_char = atom->data()[0];
5105      ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5106      range->Add(CharacterRange(first_char, first_char));
5107      set_first_character_set(range);
5108    } else {
5109      ASSERT(text.type == TextElement::CHAR_CLASS);
5110      RegExpCharacterClass* char_class = text.data.u_char_class;
5111      ZoneList<CharacterRange>* ranges = char_class->ranges();
5112      // TODO(lrn): Canonicalize ranges when they are created
5113      // instead of waiting until now.
5114      CharacterRange::Canonicalize(ranges);
5115      if (char_class->is_negated()) {
5116        int length = ranges->length();
5117        int new_length = length + 1;
5118        if (length > 0) {
5119          if (ranges->at(0).from() == 0) new_length--;
5120          if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) {
5121            new_length--;
5122          }
5123        }
5124        ZoneList<CharacterRange>* negated_ranges =
5125            new ZoneList<CharacterRange>(new_length);
5126        CharacterRange::Negate(ranges, negated_ranges);
5127        set_first_character_set(negated_ranges);
5128      } else {
5129        set_first_character_set(ranges);
5130      }
5131    }
5132  }
5133  return budget;
5134}
5135
5136
5137
5138// -------------------------------------------------------------------
5139// Dispatch table construction
5140
5141
5142void DispatchTableConstructor::VisitEnd(EndNode* that) {
5143  AddRange(CharacterRange::Everything());
5144}
5145
5146
5147void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5148  node->set_being_calculated(true);
5149  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5150  for (int i = 0; i < alternatives->length(); i++) {
5151    set_choice_index(i);
5152    alternatives->at(i).node()->Accept(this);
5153  }
5154  node->set_being_calculated(false);
5155}
5156
5157
5158class AddDispatchRange {
5159 public:
5160  explicit AddDispatchRange(DispatchTableConstructor* constructor)
5161    : constructor_(constructor) { }
5162  void Call(uc32 from, DispatchTable::Entry entry);
5163 private:
5164  DispatchTableConstructor* constructor_;
5165};
5166
5167
5168void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5169  CharacterRange range(from, entry.to());
5170  constructor_->AddRange(range);
5171}
5172
5173
5174void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5175  if (node->being_calculated())
5176    return;
5177  DispatchTable* table = node->GetTable(ignore_case_);
5178  AddDispatchRange adder(this);
5179  table->ForEach(&adder);
5180}
5181
5182
5183void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5184  // TODO(160): Find the node that we refer back to and propagate its start
5185  // set back to here.  For now we just accept anything.
5186  AddRange(CharacterRange::Everything());
5187}
5188
5189
5190void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5191  RegExpNode* target = that->on_success();
5192  target->Accept(this);
5193}
5194
5195
5196static int CompareRangeByFrom(const CharacterRange* a,
5197                              const CharacterRange* b) {
5198  return Compare<uc16>(a->from(), b->from());
5199}
5200
5201
5202void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5203  ranges->Sort(CompareRangeByFrom);
5204  uc16 last = 0;
5205  for (int i = 0; i < ranges->length(); i++) {
5206    CharacterRange range = ranges->at(i);
5207    if (last < range.from())
5208      AddRange(CharacterRange(last, range.from() - 1));
5209    if (range.to() >= last) {
5210      if (range.to() == String::kMaxUtf16CodeUnit) {
5211        return;
5212      } else {
5213        last = range.to() + 1;
5214      }
5215    }
5216  }
5217  AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
5218}
5219
5220
5221void DispatchTableConstructor::VisitText(TextNode* that) {
5222  TextElement elm = that->elements()->at(0);
5223  switch (elm.type) {
5224    case TextElement::ATOM: {
5225      uc16 c = elm.data.u_atom->data()[0];
5226      AddRange(CharacterRange(c, c));
5227      break;
5228    }
5229    case TextElement::CHAR_CLASS: {
5230      RegExpCharacterClass* tree = elm.data.u_char_class;
5231      ZoneList<CharacterRange>* ranges = tree->ranges();
5232      if (tree->is_negated()) {
5233        AddInverse(ranges);
5234      } else {
5235        for (int i = 0; i < ranges->length(); i++)
5236          AddRange(ranges->at(i));
5237      }
5238      break;
5239    }
5240    default: {
5241      UNIMPLEMENTED();
5242    }
5243  }
5244}
5245
5246
5247void DispatchTableConstructor::VisitAction(ActionNode* that) {
5248  RegExpNode* target = that->on_success();
5249  target->Accept(this);
5250}
5251
5252
5253RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
5254                                                      bool ignore_case,
5255                                                      bool is_multiline,
5256                                                      Handle<String> pattern,
5257                                                      bool is_ascii) {
5258  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
5259    return IrregexpRegExpTooBig();
5260  }
5261  RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
5262  // Wrap the body of the regexp in capture #0.
5263  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
5264                                                    0,
5265                                                    &compiler,
5266                                                    compiler.accept());
5267  RegExpNode* node = captured_body;
5268  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5269  bool is_start_anchored = data->tree->IsAnchoredAtStart();
5270  int max_length = data->tree->max_match();
5271  if (!is_start_anchored) {
5272    // Add a .*? at the beginning, outside the body capture, unless
5273    // this expression is anchored at the beginning.
5274    RegExpNode* loop_node =
5275        RegExpQuantifier::ToNode(0,
5276                                 RegExpTree::kInfinity,
5277                                 false,
5278                                 new RegExpCharacterClass('*'),
5279                                 &compiler,
5280                                 captured_body,
5281                                 data->contains_anchor);
5282
5283    if (data->contains_anchor) {
5284      // Unroll loop once, to take care of the case that might start
5285      // at the start of input.
5286      ChoiceNode* first_step_node = new ChoiceNode(2);
5287      first_step_node->AddAlternative(GuardedAlternative(captured_body));
5288      first_step_node->AddAlternative(GuardedAlternative(
5289          new TextNode(new RegExpCharacterClass('*'), loop_node)));
5290      node = first_step_node;
5291    } else {
5292      node = loop_node;
5293    }
5294  }
5295  data->node = node;
5296  Analysis analysis(ignore_case, is_ascii);
5297  analysis.EnsureAnalyzed(node);
5298  if (analysis.has_failed()) {
5299    const char* error_message = analysis.error_message();
5300    return CompilationResult(error_message);
5301  }
5302
5303  // Create the correct assembler for the architecture.
5304#ifndef V8_INTERPRETED_REGEXP
5305  // Native regexp implementation.
5306
5307  NativeRegExpMacroAssembler::Mode mode =
5308      is_ascii ? NativeRegExpMacroAssembler::ASCII
5309               : NativeRegExpMacroAssembler::UC16;
5310
5311#if V8_TARGET_ARCH_IA32
5312  RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
5313#elif V8_TARGET_ARCH_X64
5314  RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
5315#elif V8_TARGET_ARCH_ARM
5316  RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
5317#elif V8_TARGET_ARCH_MIPS
5318  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2);
5319#endif
5320
5321#else  // V8_INTERPRETED_REGEXP
5322  // Interpreted regexp implementation.
5323  EmbeddedVector<byte, 1024> codes;
5324  RegExpMacroAssemblerIrregexp macro_assembler(codes);
5325#endif  // V8_INTERPRETED_REGEXP
5326
5327  // Inserted here, instead of in Assembler, because it depends on information
5328  // in the AST that isn't replicated in the Node structure.
5329  static const int kMaxBacksearchLimit = 1024;
5330  if (is_end_anchored &&
5331      !is_start_anchored &&
5332      max_length < kMaxBacksearchLimit) {
5333    macro_assembler.SetCurrentPositionFromEnd(max_length);
5334  }
5335
5336  return compiler.Assemble(&macro_assembler,
5337                           node,
5338                           data->capture_count,
5339                           pattern);
5340}
5341
5342
5343}}  // namespace v8::internal
5344