1// Copyright 2006-2009 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "compiler.h"
32#include "execution.h"
33#include "factory.h"
34#include "jsregexp.h"
35#include "platform.h"
36#include "string-search.h"
37#include "runtime.h"
38#include "compilation-cache.h"
39#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
44#include "regexp-stack.h"
45
46#ifndef V8_INTERPRETED_REGEXP
47#if V8_TARGET_ARCH_IA32
48#include "ia32/regexp-macro-assembler-ia32.h"
49#elif V8_TARGET_ARCH_X64
50#include "x64/regexp-macro-assembler-x64.h"
51#elif V8_TARGET_ARCH_ARM
52#include "arm/regexp-macro-assembler-arm.h"
53#elif V8_TARGET_ARCH_MIPS
54#include "mips/regexp-macro-assembler-mips.h"
55#else
56#error Unsupported target architecture.
57#endif
58#endif
59
60#include "interpreter-irregexp.h"
61
62
63namespace v8 {
64namespace internal {
65
66Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
67                                               Handle<String> pattern,
68                                               Handle<String> flags,
69                                               bool* has_pending_exception) {
70  // Call the construct code with 2 arguments.
71  Object** argv[2] = { Handle<Object>::cast(pattern).location(),
72                       Handle<Object>::cast(flags).location() };
73  return Execution::New(constructor, 2, argv, has_pending_exception);
74}
75
76
77static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
78  int flags = JSRegExp::NONE;
79  for (int i = 0; i < str->length(); i++) {
80    switch (str->Get(i)) {
81      case 'i':
82        flags |= JSRegExp::IGNORE_CASE;
83        break;
84      case 'g':
85        flags |= JSRegExp::GLOBAL;
86        break;
87      case 'm':
88        flags |= JSRegExp::MULTILINE;
89        break;
90    }
91  }
92  return JSRegExp::Flags(flags);
93}
94
95
96static inline void ThrowRegExpException(Handle<JSRegExp> re,
97                                        Handle<String> pattern,
98                                        Handle<String> error_text,
99                                        const char* message) {
100  Isolate* isolate = re->GetIsolate();
101  Factory* factory = isolate->factory();
102  Handle<FixedArray> elements = factory->NewFixedArray(2);
103  elements->set(0, *pattern);
104  elements->set(1, *error_text);
105  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
106  Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
107  isolate->Throw(*regexp_err);
108}
109
110
111// Generic RegExp methods. Dispatches to implementation specific methods.
112
113
114Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
115                                   Handle<String> pattern,
116                                   Handle<String> flag_str) {
117  Isolate* isolate = re->GetIsolate();
118  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
119  CompilationCache* compilation_cache = isolate->compilation_cache();
120  Handle<FixedArray> cached = compilation_cache->LookupRegExp(pattern, flags);
121  bool in_cache = !cached.is_null();
122  LOG(isolate, RegExpCompileEvent(re, in_cache));
123
124  Handle<Object> result;
125  if (in_cache) {
126    re->set_data(*cached);
127    return re;
128  }
129  pattern = FlattenGetString(pattern);
130  CompilationZoneScope zone_scope(DELETE_ON_EXIT);
131  PostponeInterruptsScope postpone(isolate);
132  RegExpCompileData parse_result;
133  FlatStringReader reader(isolate, pattern);
134  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
135                                 &parse_result)) {
136    // Throw an exception if we fail to parse the pattern.
137    ThrowRegExpException(re,
138                         pattern,
139                         parse_result.error,
140                         "malformed_regexp");
141    return Handle<Object>::null();
142  }
143
144  if (parse_result.simple && !flags.is_ignore_case()) {
145    // Parse-tree is a single atom that is equal to the pattern.
146    AtomCompile(re, pattern, flags, pattern);
147  } else if (parse_result.tree->IsAtom() &&
148      !flags.is_ignore_case() &&
149      parse_result.capture_count == 0) {
150    RegExpAtom* atom = parse_result.tree->AsAtom();
151    Vector<const uc16> atom_pattern = atom->data();
152    Handle<String> atom_string =
153        isolate->factory()->NewStringFromTwoByte(atom_pattern);
154    AtomCompile(re, pattern, flags, atom_string);
155  } else {
156    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
157  }
158  ASSERT(re->data()->IsFixedArray());
159  // Compilation succeeded so the data is set on the regexp
160  // and we can store it in the cache.
161  Handle<FixedArray> data(FixedArray::cast(re->data()));
162  compilation_cache->PutRegExp(pattern, flags, data);
163
164  return re;
165}
166
167
168Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
169                                Handle<String> subject,
170                                int index,
171                                Handle<JSArray> last_match_info) {
172  switch (regexp->TypeTag()) {
173    case JSRegExp::ATOM:
174      return AtomExec(regexp, subject, index, last_match_info);
175    case JSRegExp::IRREGEXP: {
176      Handle<Object> result =
177          IrregexpExec(regexp, subject, index, last_match_info);
178      ASSERT(!result.is_null() || Isolate::Current()->has_pending_exception());
179      return result;
180    }
181    default:
182      UNREACHABLE();
183      return Handle<Object>::null();
184  }
185}
186
187
188// RegExp Atom implementation: Simple string search using indexOf.
189
190
191void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
192                             Handle<String> pattern,
193                             JSRegExp::Flags flags,
194                             Handle<String> match_pattern) {
195  re->GetIsolate()->factory()->SetRegExpAtomData(re,
196                                                 JSRegExp::ATOM,
197                                                 pattern,
198                                                 flags,
199                                                 match_pattern);
200}
201
202
203static void SetAtomLastCapture(FixedArray* array,
204                               String* subject,
205                               int from,
206                               int to) {
207  NoHandleAllocation no_handles;
208  RegExpImpl::SetLastCaptureCount(array, 2);
209  RegExpImpl::SetLastSubject(array, subject);
210  RegExpImpl::SetLastInput(array, subject);
211  RegExpImpl::SetCapture(array, 0, from);
212  RegExpImpl::SetCapture(array, 1, to);
213}
214
215  /* template <typename SubjectChar>, typename PatternChar>
216static int ReStringMatch(Vector<const SubjectChar> sub_vector,
217                         Vector<const PatternChar> pat_vector,
218                         int start_index) {
219
220  int pattern_length = pat_vector.length();
221  if (pattern_length == 0) return start_index;
222
223  int subject_length = sub_vector.length();
224  if (start_index + pattern_length > subject_length) return -1;
225  return SearchString(sub_vector, pat_vector, start_index);
226}
227  */
228Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
229                                    Handle<String> subject,
230                                    int index,
231                                    Handle<JSArray> last_match_info) {
232  Isolate* isolate = re->GetIsolate();
233
234  ASSERT(0 <= index);
235  ASSERT(index <= subject->length());
236
237  if (!subject->IsFlat()) FlattenString(subject);
238  AssertNoAllocation no_heap_allocation;  // ensure vectors stay valid
239  // Extract flattened substrings of cons strings before determining asciiness.
240  String* seq_sub = *subject;
241  if (seq_sub->IsConsString()) seq_sub = ConsString::cast(seq_sub)->first();
242
243  String* needle = String::cast(re->DataAt(JSRegExp::kAtomPatternIndex));
244  int needle_len = needle->length();
245
246  if (needle_len != 0) {
247    if (index + needle_len > subject->length())
248        return isolate->factory()->null_value();
249
250    // dispatch on type of strings
251    index = (needle->IsAsciiRepresentation()
252             ? (seq_sub->IsAsciiRepresentation()
253                ? SearchString(isolate,
254                               seq_sub->ToAsciiVector(),
255                               needle->ToAsciiVector(),
256                               index)
257                : SearchString(isolate,
258                               seq_sub->ToUC16Vector(),
259                               needle->ToAsciiVector(),
260                               index))
261             : (seq_sub->IsAsciiRepresentation()
262                ? SearchString(isolate,
263                               seq_sub->ToAsciiVector(),
264                               needle->ToUC16Vector(),
265                               index)
266                : SearchString(isolate,
267                               seq_sub->ToUC16Vector(),
268                               needle->ToUC16Vector(),
269                               index)));
270    if (index == -1) return FACTORY->null_value();
271  }
272  ASSERT(last_match_info->HasFastElements());
273
274  {
275    NoHandleAllocation no_handles;
276    FixedArray* array = FixedArray::cast(last_match_info->elements());
277    SetAtomLastCapture(array, *subject, index, index + needle_len);
278  }
279  return last_match_info;
280}
281
282
283// Irregexp implementation.
284
285// Ensures that the regexp object contains a compiled version of the
286// source for either ASCII or non-ASCII strings.
287// If the compiled version doesn't already exist, it is compiled
288// from the source pattern.
289// If compilation fails, an exception is thrown and this function
290// returns false.
291bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
292  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
293#ifdef V8_INTERPRETED_REGEXP
294  if (compiled_code->IsByteArray()) return true;
295#else  // V8_INTERPRETED_REGEXP (RegExp native code)
296  if (compiled_code->IsCode()) return true;
297#endif
298  return CompileIrregexp(re, is_ascii);
299}
300
301
302bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
303  // Compile the RegExp.
304  Isolate* isolate = re->GetIsolate();
305  CompilationZoneScope zone_scope(DELETE_ON_EXIT);
306  PostponeInterruptsScope postpone(isolate);
307  Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
308  if (entry->IsJSObject()) {
309    // If it's a JSObject, a previous compilation failed and threw this object.
310    // Re-throw the object without trying again.
311    isolate->Throw(entry);
312    return false;
313  }
314  ASSERT(entry->IsTheHole());
315
316  JSRegExp::Flags flags = re->GetFlags();
317
318  Handle<String> pattern(re->Pattern());
319  if (!pattern->IsFlat()) {
320    FlattenString(pattern);
321  }
322
323  RegExpCompileData compile_data;
324  FlatStringReader reader(isolate, pattern);
325  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
326                                 &compile_data)) {
327    // Throw an exception if we fail to parse the pattern.
328    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
329    ThrowRegExpException(re,
330                         pattern,
331                         compile_data.error,
332                         "malformed_regexp");
333    return false;
334  }
335  RegExpEngine::CompilationResult result =
336      RegExpEngine::Compile(&compile_data,
337                            flags.is_ignore_case(),
338                            flags.is_multiline(),
339                            pattern,
340                            is_ascii);
341  if (result.error_message != NULL) {
342    // Unable to compile regexp.
343    Factory* factory = isolate->factory();
344    Handle<FixedArray> elements = factory->NewFixedArray(2);
345    elements->set(0, *pattern);
346    Handle<String> error_message =
347        factory->NewStringFromUtf8(CStrVector(result.error_message));
348    elements->set(1, *error_message);
349    Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
350    Handle<Object> regexp_err =
351        factory->NewSyntaxError("malformed_regexp", array);
352    isolate->Throw(*regexp_err);
353    re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
354    return false;
355  }
356
357  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
358  data->set(JSRegExp::code_index(is_ascii), result.code);
359  int register_max = IrregexpMaxRegisterCount(*data);
360  if (result.num_registers > register_max) {
361    SetIrregexpMaxRegisterCount(*data, result.num_registers);
362  }
363
364  return true;
365}
366
367
368int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
369  return Smi::cast(
370      re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
371}
372
373
374void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
375  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
376}
377
378
379int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
380  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
381}
382
383
384int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
385  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
386}
387
388
389ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
390  return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
391}
392
393
394Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
395  return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
396}
397
398
399void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
400                                    Handle<String> pattern,
401                                    JSRegExp::Flags flags,
402                                    int capture_count) {
403  // Initialize compiled code entries to null.
404  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
405                                                     JSRegExp::IRREGEXP,
406                                                     pattern,
407                                                     flags,
408                                                     capture_count);
409}
410
411
412int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
413                                Handle<String> subject) {
414  if (!subject->IsFlat()) {
415    FlattenString(subject);
416  }
417  // Check the asciiness of the underlying storage.
418  bool is_ascii;
419  {
420    AssertNoAllocation no_gc;
421    String* sequential_string = *subject;
422    if (subject->IsConsString()) {
423      sequential_string = ConsString::cast(*subject)->first();
424    }
425    is_ascii = sequential_string->IsAsciiRepresentation();
426  }
427  if (!EnsureCompiledIrregexp(regexp, is_ascii)) {
428    return -1;
429  }
430#ifdef V8_INTERPRETED_REGEXP
431  // Byte-code regexp needs space allocated for all its registers.
432  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
433#else  // V8_INTERPRETED_REGEXP
434  // Native regexp only needs room to output captures. Registers are handled
435  // internally.
436  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
437#endif  // V8_INTERPRETED_REGEXP
438}
439
440
441RegExpImpl::IrregexpResult RegExpImpl::IrregexpExecOnce(
442    Handle<JSRegExp> regexp,
443    Handle<String> subject,
444    int index,
445    Vector<int> output) {
446  Isolate* isolate = regexp->GetIsolate();
447
448  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
449
450  ASSERT(index >= 0);
451  ASSERT(index <= subject->length());
452  ASSERT(subject->IsFlat());
453
454  // A flat ASCII string might have a two-byte first part.
455  if (subject->IsConsString()) {
456    subject = Handle<String>(ConsString::cast(*subject)->first(), isolate);
457  }
458
459#ifndef V8_INTERPRETED_REGEXP
460  ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
461  do {
462    bool is_ascii = subject->IsAsciiRepresentation();
463    Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
464    NativeRegExpMacroAssembler::Result res =
465        NativeRegExpMacroAssembler::Match(code,
466                                          subject,
467                                          output.start(),
468                                          output.length(),
469                                          index,
470                                          isolate);
471    if (res != NativeRegExpMacroAssembler::RETRY) {
472      ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
473             isolate->has_pending_exception());
474      STATIC_ASSERT(
475          static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
476      STATIC_ASSERT(
477          static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
478      STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
479                    == RE_EXCEPTION);
480      return static_cast<IrregexpResult>(res);
481    }
482    // If result is RETRY, the string has changed representation, and we
483    // must restart from scratch.
484    // In this case, it means we must make sure we are prepared to handle
485    // the, potentially, different subject (the string can switch between
486    // being internal and external, and even between being ASCII and UC16,
487    // but the characters are always the same).
488    IrregexpPrepare(regexp, subject);
489  } while (true);
490  UNREACHABLE();
491  return RE_EXCEPTION;
492#else  // V8_INTERPRETED_REGEXP
493
494  ASSERT(output.length() >= IrregexpNumberOfRegisters(*irregexp));
495  bool is_ascii = subject->IsAsciiRepresentation();
496  // We must have done EnsureCompiledIrregexp, so we can get the number of
497  // registers.
498  int* register_vector = output.start();
499  int number_of_capture_registers =
500      (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
501  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
502    register_vector[i] = -1;
503  }
504  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
505
506  if (IrregexpInterpreter::Match(isolate,
507                                 byte_codes,
508                                 subject,
509                                 register_vector,
510                                 index)) {
511    return RE_SUCCESS;
512  }
513  return RE_FAILURE;
514#endif  // V8_INTERPRETED_REGEXP
515}
516
517
518Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
519                                        Handle<String> subject,
520                                        int previous_index,
521                                        Handle<JSArray> last_match_info) {
522  ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
523
524  // Prepare space for the return values.
525#ifdef V8_INTERPRETED_REGEXP
526#ifdef DEBUG
527  if (FLAG_trace_regexp_bytecodes) {
528    String* pattern = jsregexp->Pattern();
529    PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
530    PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
531  }
532#endif
533#endif
534  int required_registers = RegExpImpl::IrregexpPrepare(jsregexp, subject);
535  if (required_registers < 0) {
536    // Compiling failed with an exception.
537    ASSERT(Isolate::Current()->has_pending_exception());
538    return Handle<Object>::null();
539  }
540
541  OffsetsVector registers(required_registers);
542
543  IrregexpResult res = RegExpImpl::IrregexpExecOnce(
544      jsregexp, subject, previous_index, Vector<int>(registers.vector(),
545                                                     registers.length()));
546  if (res == RE_SUCCESS) {
547    int capture_register_count =
548        (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
549    last_match_info->EnsureSize(capture_register_count + kLastMatchOverhead);
550    AssertNoAllocation no_gc;
551    int* register_vector = registers.vector();
552    FixedArray* array = FixedArray::cast(last_match_info->elements());
553    for (int i = 0; i < capture_register_count; i += 2) {
554      SetCapture(array, i, register_vector[i]);
555      SetCapture(array, i + 1, register_vector[i + 1]);
556    }
557    SetLastCaptureCount(array, capture_register_count);
558    SetLastSubject(array, *subject);
559    SetLastInput(array, *subject);
560    return last_match_info;
561  }
562  if (res == RE_EXCEPTION) {
563    ASSERT(Isolate::Current()->has_pending_exception());
564    return Handle<Object>::null();
565  }
566  ASSERT(res == RE_FAILURE);
567  return Isolate::Current()->factory()->null_value();
568}
569
570
571// -------------------------------------------------------------------
572// Implementation of the Irregexp regular expression engine.
573//
574// The Irregexp regular expression engine is intended to be a complete
575// implementation of ECMAScript regular expressions.  It generates either
576// bytecodes or native code.
577
578//   The Irregexp regexp engine is structured in three steps.
579//   1) The parser generates an abstract syntax tree.  See ast.cc.
580//   2) From the AST a node network is created.  The nodes are all
581//      subclasses of RegExpNode.  The nodes represent states when
582//      executing a regular expression.  Several optimizations are
583//      performed on the node network.
584//   3) From the nodes we generate either byte codes or native code
585//      that can actually execute the regular expression (perform
586//      the search).  The code generation step is described in more
587//      detail below.
588
589// Code generation.
590//
591//   The nodes are divided into four main categories.
592//   * Choice nodes
593//        These represent places where the regular expression can
594//        match in more than one way.  For example on entry to an
595//        alternation (foo|bar) or a repetition (*, +, ? or {}).
596//   * Action nodes
597//        These represent places where some action should be
598//        performed.  Examples include recording the current position
599//        in the input string to a register (in order to implement
600//        captures) or other actions on register for example in order
601//        to implement the counters needed for {} repetitions.
602//   * Matching nodes
603//        These attempt to match some element part of the input string.
604//        Examples of elements include character classes, plain strings
605//        or back references.
606//   * End nodes
607//        These are used to implement the actions required on finding
608//        a successful match or failing to find a match.
609//
610//   The code generated (whether as byte codes or native code) maintains
611//   some state as it runs.  This consists of the following elements:
612//
613//   * The capture registers.  Used for string captures.
614//   * Other registers.  Used for counters etc.
615//   * The current position.
616//   * The stack of backtracking information.  Used when a matching node
617//     fails to find a match and needs to try an alternative.
618//
619// Conceptual regular expression execution model:
620//
621//   There is a simple conceptual model of regular expression execution
622//   which will be presented first.  The actual code generated is a more
623//   efficient simulation of the simple conceptual model:
624//
625//   * Choice nodes are implemented as follows:
626//     For each choice except the last {
627//       push current position
628//       push backtrack code location
629//       <generate code to test for choice>
630//       backtrack code location:
631//       pop current position
632//     }
633//     <generate code to test for last choice>
634//
635//   * Actions nodes are generated as follows
636//     <push affected registers on backtrack stack>
637//     <generate code to perform action>
638//     push backtrack code location
639//     <generate code to test for following nodes>
640//     backtrack code location:
641//     <pop affected registers to restore their state>
642//     <pop backtrack location from stack and go to it>
643//
644//   * Matching nodes are generated as follows:
645//     if input string matches at current position
646//       update current position
647//       <generate code to test for following nodes>
648//     else
649//       <pop backtrack location from stack and go to it>
650//
651//   Thus it can be seen that the current position is saved and restored
652//   by the choice nodes, whereas the registers are saved and restored by
653//   by the action nodes that manipulate them.
654//
655//   The other interesting aspect of this model is that nodes are generated
656//   at the point where they are needed by a recursive call to Emit().  If
657//   the node has already been code generated then the Emit() call will
658//   generate a jump to the previously generated code instead.  In order to
659//   limit recursion it is possible for the Emit() function to put the node
660//   on a work list for later generation and instead generate a jump.  The
661//   destination of the jump is resolved later when the code is generated.
662//
663// Actual regular expression code generation.
664//
665//   Code generation is actually more complicated than the above.  In order
666//   to improve the efficiency of the generated code some optimizations are
667//   performed
668//
669//   * Choice nodes have 1-character lookahead.
670//     A choice node looks at the following character and eliminates some of
671//     the choices immediately based on that character.  This is not yet
672//     implemented.
673//   * Simple greedy loops store reduced backtracking information.
674//     A quantifier like /.*foo/m will greedily match the whole input.  It will
675//     then need to backtrack to a point where it can match "foo".  The naive
676//     implementation of this would push each character position onto the
677//     backtracking stack, then pop them off one by one.  This would use space
678//     proportional to the length of the input string.  However since the "."
679//     can only match in one way and always has a constant length (in this case
680//     of 1) it suffices to store the current position on the top of the stack
681//     once.  Matching now becomes merely incrementing the current position and
682//     backtracking becomes decrementing the current position and checking the
683//     result against the stored current position.  This is faster and saves
684//     space.
685//   * The current state is virtualized.
686//     This is used to defer expensive operations until it is clear that they
687//     are needed and to generate code for a node more than once, allowing
688//     specialized an efficient versions of the code to be created. This is
689//     explained in the section below.
690//
691// Execution state virtualization.
692//
693//   Instead of emitting code, nodes that manipulate the state can record their
694//   manipulation in an object called the Trace.  The Trace object can record a
695//   current position offset, an optional backtrack code location on the top of
696//   the virtualized backtrack stack and some register changes.  When a node is
697//   to be emitted it can flush the Trace or update it.  Flushing the Trace
698//   will emit code to bring the actual state into line with the virtual state.
699//   Avoiding flushing the state can postpone some work (eg updates of capture
700//   registers).  Postponing work can save time when executing the regular
701//   expression since it may be found that the work never has to be done as a
702//   failure to match can occur.  In addition it is much faster to jump to a
703//   known backtrack code location than it is to pop an unknown backtrack
704//   location from the stack and jump there.
705//
706//   The virtual state found in the Trace affects code generation.  For example
707//   the virtual state contains the difference between the actual current
708//   position and the virtual current position, and matching code needs to use
709//   this offset to attempt a match in the correct location of the input
710//   string.  Therefore code generated for a non-trivial trace is specialized
711//   to that trace.  The code generator therefore has the ability to generate
712//   code for each node several times.  In order to limit the size of the
713//   generated code there is an arbitrary limit on how many specialized sets of
714//   code may be generated for a given node.  If the limit is reached, the
715//   trace is flushed and a generic version of the code for a node is emitted.
716//   This is subsequently used for that node.  The code emitted for non-generic
717//   trace is not recorded in the node and so it cannot currently be reused in
718//   the event that code generation is requested for an identical trace.
719
720
721void RegExpTree::AppendToText(RegExpText* text) {
722  UNREACHABLE();
723}
724
725
726void RegExpAtom::AppendToText(RegExpText* text) {
727  text->AddElement(TextElement::Atom(this));
728}
729
730
731void RegExpCharacterClass::AppendToText(RegExpText* text) {
732  text->AddElement(TextElement::CharClass(this));
733}
734
735
736void RegExpText::AppendToText(RegExpText* text) {
737  for (int i = 0; i < elements()->length(); i++)
738    text->AddElement(elements()->at(i));
739}
740
741
742TextElement TextElement::Atom(RegExpAtom* atom) {
743  TextElement result = TextElement(ATOM);
744  result.data.u_atom = atom;
745  return result;
746}
747
748
749TextElement TextElement::CharClass(
750      RegExpCharacterClass* char_class) {
751  TextElement result = TextElement(CHAR_CLASS);
752  result.data.u_char_class = char_class;
753  return result;
754}
755
756
757int TextElement::length() {
758  if (type == ATOM) {
759    return data.u_atom->length();
760  } else {
761    ASSERT(type == CHAR_CLASS);
762    return 1;
763  }
764}
765
766
767DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
768  if (table_ == NULL) {
769    table_ = new DispatchTable();
770    DispatchTableConstructor cons(table_, ignore_case);
771    cons.BuildTable(this);
772  }
773  return table_;
774}
775
776
777class RegExpCompiler {
778 public:
779  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
780
781  int AllocateRegister() {
782    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
783      reg_exp_too_big_ = true;
784      return next_register_;
785    }
786    return next_register_++;
787  }
788
789  RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
790                                           RegExpNode* start,
791                                           int capture_count,
792                                           Handle<String> pattern);
793
794  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
795
796  static const int kImplementationOffset = 0;
797  static const int kNumberOfRegistersOffset = 0;
798  static const int kCodeOffset = 1;
799
800  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
801  EndNode* accept() { return accept_; }
802
803  static const int kMaxRecursion = 100;
804  inline int recursion_depth() { return recursion_depth_; }
805  inline void IncrementRecursionDepth() { recursion_depth_++; }
806  inline void DecrementRecursionDepth() { recursion_depth_--; }
807
808  void SetRegExpTooBig() { reg_exp_too_big_ = true; }
809
810  inline bool ignore_case() { return ignore_case_; }
811  inline bool ascii() { return ascii_; }
812
813  static const int kNoRegister = -1;
814 private:
815  EndNode* accept_;
816  int next_register_;
817  List<RegExpNode*>* work_list_;
818  int recursion_depth_;
819  RegExpMacroAssembler* macro_assembler_;
820  bool ignore_case_;
821  bool ascii_;
822  bool reg_exp_too_big_;
823};
824
825
826class RecursionCheck {
827 public:
828  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
829    compiler->IncrementRecursionDepth();
830  }
831  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
832 private:
833  RegExpCompiler* compiler_;
834};
835
836
837static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
838  return RegExpEngine::CompilationResult("RegExp too big");
839}
840
841
842// Attempts to compile the regexp using an Irregexp code generator.  Returns
843// a fixed array or a null handle depending on whether it succeeded.
844RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
845    : next_register_(2 * (capture_count + 1)),
846      work_list_(NULL),
847      recursion_depth_(0),
848      ignore_case_(ignore_case),
849      ascii_(ascii),
850      reg_exp_too_big_(false) {
851  accept_ = new EndNode(EndNode::ACCEPT);
852  ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
853}
854
855
856RegExpEngine::CompilationResult RegExpCompiler::Assemble(
857    RegExpMacroAssembler* macro_assembler,
858    RegExpNode* start,
859    int capture_count,
860    Handle<String> pattern) {
861  Heap* heap = pattern->GetHeap();
862
863  bool use_slow_safe_regexp_compiler = false;
864  if (heap->total_regexp_code_generated() >
865          RegExpImpl::kRegWxpCompiledLimit &&
866      heap->isolate()->memory_allocator()->SizeExecutable() >
867          RegExpImpl::kRegExpExecutableMemoryLimit) {
868    use_slow_safe_regexp_compiler = true;
869  }
870
871  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
872
873#ifdef DEBUG
874  if (FLAG_trace_regexp_assembler)
875    macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
876  else
877#endif
878    macro_assembler_ = macro_assembler;
879
880  List <RegExpNode*> work_list(0);
881  work_list_ = &work_list;
882  Label fail;
883  macro_assembler_->PushBacktrack(&fail);
884  Trace new_trace;
885  start->Emit(this, &new_trace);
886  macro_assembler_->Bind(&fail);
887  macro_assembler_->Fail();
888  while (!work_list.is_empty()) {
889    work_list.RemoveLast()->Emit(this, &new_trace);
890  }
891  if (reg_exp_too_big_) return IrregexpRegExpTooBig();
892
893  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
894  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
895  work_list_ = NULL;
896#ifdef DEBUG
897  if (FLAG_print_code) {
898    Handle<Code>::cast(code)->Disassemble(*pattern->ToCString());
899  }
900  if (FLAG_trace_regexp_assembler) {
901    delete macro_assembler_;
902  }
903#endif
904  return RegExpEngine::CompilationResult(*code, next_register_);
905}
906
907
908bool Trace::DeferredAction::Mentions(int that) {
909  if (type() == ActionNode::CLEAR_CAPTURES) {
910    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
911    return range.Contains(that);
912  } else {
913    return reg() == that;
914  }
915}
916
917
918bool Trace::mentions_reg(int reg) {
919  for (DeferredAction* action = actions_;
920       action != NULL;
921       action = action->next()) {
922    if (action->Mentions(reg))
923      return true;
924  }
925  return false;
926}
927
928
929bool Trace::GetStoredPosition(int reg, int* cp_offset) {
930  ASSERT_EQ(0, *cp_offset);
931  for (DeferredAction* action = actions_;
932       action != NULL;
933       action = action->next()) {
934    if (action->Mentions(reg)) {
935      if (action->type() == ActionNode::STORE_POSITION) {
936        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
937        return true;
938      } else {
939        return false;
940      }
941    }
942  }
943  return false;
944}
945
946
947int Trace::FindAffectedRegisters(OutSet* affected_registers) {
948  int max_register = RegExpCompiler::kNoRegister;
949  for (DeferredAction* action = actions_;
950       action != NULL;
951       action = action->next()) {
952    if (action->type() == ActionNode::CLEAR_CAPTURES) {
953      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
954      for (int i = range.from(); i <= range.to(); i++)
955        affected_registers->Set(i);
956      if (range.to() > max_register) max_register = range.to();
957    } else {
958      affected_registers->Set(action->reg());
959      if (action->reg() > max_register) max_register = action->reg();
960    }
961  }
962  return max_register;
963}
964
965
966void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
967                                     int max_register,
968                                     OutSet& registers_to_pop,
969                                     OutSet& registers_to_clear) {
970  for (int reg = max_register; reg >= 0; reg--) {
971    if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
972    else if (registers_to_clear.Get(reg)) {
973      int clear_to = reg;
974      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
975        reg--;
976      }
977      assembler->ClearRegisters(reg, clear_to);
978    }
979  }
980}
981
982
983void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
984                                   int max_register,
985                                   OutSet& affected_registers,
986                                   OutSet* registers_to_pop,
987                                   OutSet* registers_to_clear) {
988  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
989  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
990
991  // Count pushes performed to force a stack limit check occasionally.
992  int pushes = 0;
993
994  for (int reg = 0; reg <= max_register; reg++) {
995    if (!affected_registers.Get(reg)) {
996      continue;
997    }
998
999    // The chronologically first deferred action in the trace
1000    // is used to infer the action needed to restore a register
1001    // to its previous state (or not, if it's safe to ignore it).
1002    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1003    DeferredActionUndoType undo_action = IGNORE;
1004
1005    int value = 0;
1006    bool absolute = false;
1007    bool clear = false;
1008    int store_position = -1;
1009    // This is a little tricky because we are scanning the actions in reverse
1010    // historical order (newest first).
1011    for (DeferredAction* action = actions_;
1012         action != NULL;
1013         action = action->next()) {
1014      if (action->Mentions(reg)) {
1015        switch (action->type()) {
1016          case ActionNode::SET_REGISTER: {
1017            Trace::DeferredSetRegister* psr =
1018                static_cast<Trace::DeferredSetRegister*>(action);
1019            if (!absolute) {
1020              value += psr->value();
1021              absolute = true;
1022            }
1023            // SET_REGISTER is currently only used for newly introduced loop
1024            // counters. They can have a significant previous value if they
1025            // occour in a loop. TODO(lrn): Propagate this information, so
1026            // we can set undo_action to IGNORE if we know there is no value to
1027            // restore.
1028            undo_action = RESTORE;
1029            ASSERT_EQ(store_position, -1);
1030            ASSERT(!clear);
1031            break;
1032          }
1033          case ActionNode::INCREMENT_REGISTER:
1034            if (!absolute) {
1035              value++;
1036            }
1037            ASSERT_EQ(store_position, -1);
1038            ASSERT(!clear);
1039            undo_action = RESTORE;
1040            break;
1041          case ActionNode::STORE_POSITION: {
1042            Trace::DeferredCapture* pc =
1043                static_cast<Trace::DeferredCapture*>(action);
1044            if (!clear && store_position == -1) {
1045              store_position = pc->cp_offset();
1046            }
1047
1048            // For captures we know that stores and clears alternate.
1049            // Other register, are never cleared, and if the occur
1050            // inside a loop, they might be assigned more than once.
1051            if (reg <= 1) {
1052              // Registers zero and one, aka "capture zero", is
1053              // always set correctly if we succeed. There is no
1054              // need to undo a setting on backtrack, because we
1055              // will set it again or fail.
1056              undo_action = IGNORE;
1057            } else {
1058              undo_action = pc->is_capture() ? CLEAR : RESTORE;
1059            }
1060            ASSERT(!absolute);
1061            ASSERT_EQ(value, 0);
1062            break;
1063          }
1064          case ActionNode::CLEAR_CAPTURES: {
1065            // Since we're scanning in reverse order, if we've already
1066            // set the position we have to ignore historically earlier
1067            // clearing operations.
1068            if (store_position == -1) {
1069              clear = true;
1070            }
1071            undo_action = RESTORE;
1072            ASSERT(!absolute);
1073            ASSERT_EQ(value, 0);
1074            break;
1075          }
1076          default:
1077            UNREACHABLE();
1078            break;
1079        }
1080      }
1081    }
1082    // Prepare for the undo-action (e.g., push if it's going to be popped).
1083    if (undo_action == RESTORE) {
1084      pushes++;
1085      RegExpMacroAssembler::StackCheckFlag stack_check =
1086          RegExpMacroAssembler::kNoStackLimitCheck;
1087      if (pushes == push_limit) {
1088        stack_check = RegExpMacroAssembler::kCheckStackLimit;
1089        pushes = 0;
1090      }
1091
1092      assembler->PushRegister(reg, stack_check);
1093      registers_to_pop->Set(reg);
1094    } else if (undo_action == CLEAR) {
1095      registers_to_clear->Set(reg);
1096    }
1097    // Perform the chronologically last action (or accumulated increment)
1098    // for the register.
1099    if (store_position != -1) {
1100      assembler->WriteCurrentPositionToRegister(reg, store_position);
1101    } else if (clear) {
1102      assembler->ClearRegisters(reg, reg);
1103    } else if (absolute) {
1104      assembler->SetRegister(reg, value);
1105    } else if (value != 0) {
1106      assembler->AdvanceRegister(reg, value);
1107    }
1108  }
1109}
1110
1111
1112// This is called as we come into a loop choice node and some other tricky
1113// nodes.  It normalizes the state of the code generator to ensure we can
1114// generate generic code.
1115void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1116  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1117
1118  ASSERT(!is_trivial());
1119
1120  if (actions_ == NULL && backtrack() == NULL) {
1121    // Here we just have some deferred cp advances to fix and we are back to
1122    // a normal situation.  We may also have to forget some information gained
1123    // through a quick check that was already performed.
1124    if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1125    // Create a new trivial state and generate the node with that.
1126    Trace new_state;
1127    successor->Emit(compiler, &new_state);
1128    return;
1129  }
1130
1131  // Generate deferred actions here along with code to undo them again.
1132  OutSet affected_registers;
1133
1134  if (backtrack() != NULL) {
1135    // Here we have a concrete backtrack location.  These are set up by choice
1136    // nodes and so they indicate that we have a deferred save of the current
1137    // position which we may need to emit here.
1138    assembler->PushCurrentPosition();
1139  }
1140
1141  int max_register = FindAffectedRegisters(&affected_registers);
1142  OutSet registers_to_pop;
1143  OutSet registers_to_clear;
1144  PerformDeferredActions(assembler,
1145                         max_register,
1146                         affected_registers,
1147                         &registers_to_pop,
1148                         &registers_to_clear);
1149  if (cp_offset_ != 0) {
1150    assembler->AdvanceCurrentPosition(cp_offset_);
1151  }
1152
1153  // Create a new trivial state and generate the node with that.
1154  Label undo;
1155  assembler->PushBacktrack(&undo);
1156  Trace new_state;
1157  successor->Emit(compiler, &new_state);
1158
1159  // On backtrack we need to restore state.
1160  assembler->Bind(&undo);
1161  RestoreAffectedRegisters(assembler,
1162                           max_register,
1163                           registers_to_pop,
1164                           registers_to_clear);
1165  if (backtrack() == NULL) {
1166    assembler->Backtrack();
1167  } else {
1168    assembler->PopCurrentPosition();
1169    assembler->GoTo(backtrack());
1170  }
1171}
1172
1173
1174void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1175  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1176
1177  // Omit flushing the trace. We discard the entire stack frame anyway.
1178
1179  if (!label()->is_bound()) {
1180    // We are completely independent of the trace, since we ignore it,
1181    // so this code can be used as the generic version.
1182    assembler->Bind(label());
1183  }
1184
1185  // Throw away everything on the backtrack stack since the start
1186  // of the negative submatch and restore the character position.
1187  assembler->ReadCurrentPositionFromRegister(current_position_register_);
1188  assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1189  if (clear_capture_count_ > 0) {
1190    // Clear any captures that might have been performed during the success
1191    // of the body of the negative look-ahead.
1192    int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1193    assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1194  }
1195  // Now that we have unwound the stack we find at the top of the stack the
1196  // backtrack that the BeginSubmatch node got.
1197  assembler->Backtrack();
1198}
1199
1200
1201void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1202  if (!trace->is_trivial()) {
1203    trace->Flush(compiler, this);
1204    return;
1205  }
1206  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1207  if (!label()->is_bound()) {
1208    assembler->Bind(label());
1209  }
1210  switch (action_) {
1211    case ACCEPT:
1212      assembler->Succeed();
1213      return;
1214    case BACKTRACK:
1215      assembler->GoTo(trace->backtrack());
1216      return;
1217    case NEGATIVE_SUBMATCH_SUCCESS:
1218      // This case is handled in a different virtual method.
1219      UNREACHABLE();
1220  }
1221  UNIMPLEMENTED();
1222}
1223
1224
1225void GuardedAlternative::AddGuard(Guard* guard) {
1226  if (guards_ == NULL)
1227    guards_ = new ZoneList<Guard*>(1);
1228  guards_->Add(guard);
1229}
1230
1231
1232ActionNode* ActionNode::SetRegister(int reg,
1233                                    int val,
1234                                    RegExpNode* on_success) {
1235  ActionNode* result = new ActionNode(SET_REGISTER, on_success);
1236  result->data_.u_store_register.reg = reg;
1237  result->data_.u_store_register.value = val;
1238  return result;
1239}
1240
1241
1242ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1243  ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1244  result->data_.u_increment_register.reg = reg;
1245  return result;
1246}
1247
1248
1249ActionNode* ActionNode::StorePosition(int reg,
1250                                      bool is_capture,
1251                                      RegExpNode* on_success) {
1252  ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1253  result->data_.u_position_register.reg = reg;
1254  result->data_.u_position_register.is_capture = is_capture;
1255  return result;
1256}
1257
1258
1259ActionNode* ActionNode::ClearCaptures(Interval range,
1260                                      RegExpNode* on_success) {
1261  ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1262  result->data_.u_clear_captures.range_from = range.from();
1263  result->data_.u_clear_captures.range_to = range.to();
1264  return result;
1265}
1266
1267
1268ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1269                                      int position_reg,
1270                                      RegExpNode* on_success) {
1271  ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1272  result->data_.u_submatch.stack_pointer_register = stack_reg;
1273  result->data_.u_submatch.current_position_register = position_reg;
1274  return result;
1275}
1276
1277
1278ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1279                                                int position_reg,
1280                                                int clear_register_count,
1281                                                int clear_register_from,
1282                                                RegExpNode* on_success) {
1283  ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1284  result->data_.u_submatch.stack_pointer_register = stack_reg;
1285  result->data_.u_submatch.current_position_register = position_reg;
1286  result->data_.u_submatch.clear_register_count = clear_register_count;
1287  result->data_.u_submatch.clear_register_from = clear_register_from;
1288  return result;
1289}
1290
1291
1292ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1293                                        int repetition_register,
1294                                        int repetition_limit,
1295                                        RegExpNode* on_success) {
1296  ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1297  result->data_.u_empty_match_check.start_register = start_register;
1298  result->data_.u_empty_match_check.repetition_register = repetition_register;
1299  result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1300  return result;
1301}
1302
1303
1304#define DEFINE_ACCEPT(Type)                                          \
1305  void Type##Node::Accept(NodeVisitor* visitor) {                    \
1306    visitor->Visit##Type(this);                                      \
1307  }
1308FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1309#undef DEFINE_ACCEPT
1310
1311
1312void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1313  visitor->VisitLoopChoice(this);
1314}
1315
1316
1317// -------------------------------------------------------------------
1318// Emit code.
1319
1320
1321void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1322                               Guard* guard,
1323                               Trace* trace) {
1324  switch (guard->op()) {
1325    case Guard::LT:
1326      ASSERT(!trace->mentions_reg(guard->reg()));
1327      macro_assembler->IfRegisterGE(guard->reg(),
1328                                    guard->value(),
1329                                    trace->backtrack());
1330      break;
1331    case Guard::GEQ:
1332      ASSERT(!trace->mentions_reg(guard->reg()));
1333      macro_assembler->IfRegisterLT(guard->reg(),
1334                                    guard->value(),
1335                                    trace->backtrack());
1336      break;
1337  }
1338}
1339
1340
1341// Returns the number of characters in the equivalence class, omitting those
1342// that cannot occur in the source string because it is ASCII.
1343static int GetCaseIndependentLetters(Isolate* isolate,
1344                                     uc16 character,
1345                                     bool ascii_subject,
1346                                     unibrow::uchar* letters) {
1347  int length =
1348      isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1349  // Unibrow returns 0 or 1 for characters where case independence is
1350  // trivial.
1351  if (length == 0) {
1352    letters[0] = character;
1353    length = 1;
1354  }
1355  if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1356    return length;
1357  }
1358  // The standard requires that non-ASCII characters cannot have ASCII
1359  // character codes in their equivalence class.
1360  return 0;
1361}
1362
1363
1364static inline bool EmitSimpleCharacter(Isolate* isolate,
1365                                       RegExpCompiler* compiler,
1366                                       uc16 c,
1367                                       Label* on_failure,
1368                                       int cp_offset,
1369                                       bool check,
1370                                       bool preloaded) {
1371  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1372  bool bound_checked = false;
1373  if (!preloaded) {
1374    assembler->LoadCurrentCharacter(
1375        cp_offset,
1376        on_failure,
1377        check);
1378    bound_checked = true;
1379  }
1380  assembler->CheckNotCharacter(c, on_failure);
1381  return bound_checked;
1382}
1383
1384
1385// Only emits non-letters (things that don't have case).  Only used for case
1386// independent matches.
1387static inline bool EmitAtomNonLetter(Isolate* isolate,
1388                                     RegExpCompiler* compiler,
1389                                     uc16 c,
1390                                     Label* on_failure,
1391                                     int cp_offset,
1392                                     bool check,
1393                                     bool preloaded) {
1394  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1395  bool ascii = compiler->ascii();
1396  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1397  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1398  if (length < 1) {
1399    // This can't match.  Must be an ASCII subject and a non-ASCII character.
1400    // We do not need to do anything since the ASCII pass already handled this.
1401    return false;  // Bounds not checked.
1402  }
1403  bool checked = false;
1404  // We handle the length > 1 case in a later pass.
1405  if (length == 1) {
1406    if (ascii && c > String::kMaxAsciiCharCodeU) {
1407      // Can't match - see above.
1408      return false;  // Bounds not checked.
1409    }
1410    if (!preloaded) {
1411      macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1412      checked = check;
1413    }
1414    macro_assembler->CheckNotCharacter(c, on_failure);
1415  }
1416  return checked;
1417}
1418
1419
1420static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1421                                      bool ascii,
1422                                      uc16 c1,
1423                                      uc16 c2,
1424                                      Label* on_failure) {
1425  uc16 char_mask;
1426  if (ascii) {
1427    char_mask = String::kMaxAsciiCharCode;
1428  } else {
1429    char_mask = String::kMaxUC16CharCode;
1430  }
1431  uc16 exor = c1 ^ c2;
1432  // Check whether exor has only one bit set.
1433  if (((exor - 1) & exor) == 0) {
1434    // If c1 and c2 differ only by one bit.
1435    // Ecma262UnCanonicalize always gives the highest number last.
1436    ASSERT(c2 > c1);
1437    uc16 mask = char_mask ^ exor;
1438    macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1439    return true;
1440  }
1441  ASSERT(c2 > c1);
1442  uc16 diff = c2 - c1;
1443  if (((diff - 1) & diff) == 0 && c1 >= diff) {
1444    // If the characters differ by 2^n but don't differ by one bit then
1445    // subtract the difference from the found character, then do the or
1446    // trick.  We avoid the theoretical case where negative numbers are
1447    // involved in order to simplify code generation.
1448    uc16 mask = char_mask ^ diff;
1449    macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1450                                                    diff,
1451                                                    mask,
1452                                                    on_failure);
1453    return true;
1454  }
1455  return false;
1456}
1457
1458
1459typedef bool EmitCharacterFunction(Isolate* isolate,
1460                                   RegExpCompiler* compiler,
1461                                   uc16 c,
1462                                   Label* on_failure,
1463                                   int cp_offset,
1464                                   bool check,
1465                                   bool preloaded);
1466
1467// Only emits letters (things that have case).  Only used for case independent
1468// matches.
1469static inline bool EmitAtomLetter(Isolate* isolate,
1470                                  RegExpCompiler* compiler,
1471                                  uc16 c,
1472                                  Label* on_failure,
1473                                  int cp_offset,
1474                                  bool check,
1475                                  bool preloaded) {
1476  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1477  bool ascii = compiler->ascii();
1478  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1479  int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1480  if (length <= 1) return false;
1481  // We may not need to check against the end of the input string
1482  // if this character lies before a character that matched.
1483  if (!preloaded) {
1484    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1485  }
1486  Label ok;
1487  ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1488  switch (length) {
1489    case 2: {
1490      if (ShortCutEmitCharacterPair(macro_assembler,
1491                                    ascii,
1492                                    chars[0],
1493                                    chars[1],
1494                                    on_failure)) {
1495      } else {
1496        macro_assembler->CheckCharacter(chars[0], &ok);
1497        macro_assembler->CheckNotCharacter(chars[1], on_failure);
1498        macro_assembler->Bind(&ok);
1499      }
1500      break;
1501    }
1502    case 4:
1503      macro_assembler->CheckCharacter(chars[3], &ok);
1504      // Fall through!
1505    case 3:
1506      macro_assembler->CheckCharacter(chars[0], &ok);
1507      macro_assembler->CheckCharacter(chars[1], &ok);
1508      macro_assembler->CheckNotCharacter(chars[2], on_failure);
1509      macro_assembler->Bind(&ok);
1510      break;
1511    default:
1512      UNREACHABLE();
1513      break;
1514  }
1515  return true;
1516}
1517
1518
1519static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1520                          RegExpCharacterClass* cc,
1521                          bool ascii,
1522                          Label* on_failure,
1523                          int cp_offset,
1524                          bool check_offset,
1525                          bool preloaded) {
1526  ZoneList<CharacterRange>* ranges = cc->ranges();
1527  int max_char;
1528  if (ascii) {
1529    max_char = String::kMaxAsciiCharCode;
1530  } else {
1531    max_char = String::kMaxUC16CharCode;
1532  }
1533
1534  Label success;
1535
1536  Label* char_is_in_class =
1537      cc->is_negated() ? on_failure : &success;
1538
1539  int range_count = ranges->length();
1540
1541  int last_valid_range = range_count - 1;
1542  while (last_valid_range >= 0) {
1543    CharacterRange& range = ranges->at(last_valid_range);
1544    if (range.from() <= max_char) {
1545      break;
1546    }
1547    last_valid_range--;
1548  }
1549
1550  if (last_valid_range < 0) {
1551    if (!cc->is_negated()) {
1552      // TODO(plesner): We can remove this when the node level does our
1553      // ASCII optimizations for us.
1554      macro_assembler->GoTo(on_failure);
1555    }
1556    if (check_offset) {
1557      macro_assembler->CheckPosition(cp_offset, on_failure);
1558    }
1559    return;
1560  }
1561
1562  if (last_valid_range == 0 &&
1563      !cc->is_negated() &&
1564      ranges->at(0).IsEverything(max_char)) {
1565    // This is a common case hit by non-anchored expressions.
1566    if (check_offset) {
1567      macro_assembler->CheckPosition(cp_offset, on_failure);
1568    }
1569    return;
1570  }
1571
1572  if (!preloaded) {
1573    macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1574  }
1575
1576  if (cc->is_standard() &&
1577        macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1578                                                    on_failure)) {
1579      return;
1580  }
1581
1582  for (int i = 0; i < last_valid_range; i++) {
1583    CharacterRange& range = ranges->at(i);
1584    Label next_range;
1585    uc16 from = range.from();
1586    uc16 to = range.to();
1587    if (from > max_char) {
1588      continue;
1589    }
1590    if (to > max_char) to = max_char;
1591    if (to == from) {
1592      macro_assembler->CheckCharacter(to, char_is_in_class);
1593    } else {
1594      if (from != 0) {
1595        macro_assembler->CheckCharacterLT(from, &next_range);
1596      }
1597      if (to != max_char) {
1598        macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1599      } else {
1600        macro_assembler->GoTo(char_is_in_class);
1601      }
1602    }
1603    macro_assembler->Bind(&next_range);
1604  }
1605
1606  CharacterRange& range = ranges->at(last_valid_range);
1607  uc16 from = range.from();
1608  uc16 to = range.to();
1609
1610  if (to > max_char) to = max_char;
1611  ASSERT(to >= from);
1612
1613  if (to == from) {
1614    if (cc->is_negated()) {
1615      macro_assembler->CheckCharacter(to, on_failure);
1616    } else {
1617      macro_assembler->CheckNotCharacter(to, on_failure);
1618    }
1619  } else {
1620    if (from != 0) {
1621      if (cc->is_negated()) {
1622        macro_assembler->CheckCharacterLT(from, &success);
1623      } else {
1624        macro_assembler->CheckCharacterLT(from, on_failure);
1625      }
1626    }
1627    if (to != String::kMaxUC16CharCode) {
1628      if (cc->is_negated()) {
1629        macro_assembler->CheckCharacterLT(to + 1, on_failure);
1630      } else {
1631        macro_assembler->CheckCharacterGT(to, on_failure);
1632      }
1633    } else {
1634      if (cc->is_negated()) {
1635        macro_assembler->GoTo(on_failure);
1636      }
1637    }
1638  }
1639  macro_assembler->Bind(&success);
1640}
1641
1642
1643RegExpNode::~RegExpNode() {
1644}
1645
1646
1647RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1648                                                  Trace* trace) {
1649  // If we are generating a greedy loop then don't stop and don't reuse code.
1650  if (trace->stop_node() != NULL) {
1651    return CONTINUE;
1652  }
1653
1654  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1655  if (trace->is_trivial()) {
1656    if (label_.is_bound()) {
1657      // We are being asked to generate a generic version, but that's already
1658      // been done so just go to it.
1659      macro_assembler->GoTo(&label_);
1660      return DONE;
1661    }
1662    if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1663      // To avoid too deep recursion we push the node to the work queue and just
1664      // generate a goto here.
1665      compiler->AddWork(this);
1666      macro_assembler->GoTo(&label_);
1667      return DONE;
1668    }
1669    // Generate generic version of the node and bind the label for later use.
1670    macro_assembler->Bind(&label_);
1671    return CONTINUE;
1672  }
1673
1674  // We are being asked to make a non-generic version.  Keep track of how many
1675  // non-generic versions we generate so as not to overdo it.
1676  trace_count_++;
1677  if (FLAG_regexp_optimization &&
1678      trace_count_ < kMaxCopiesCodeGenerated &&
1679      compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1680    return CONTINUE;
1681  }
1682
1683  // If we get here code has been generated for this node too many times or
1684  // recursion is too deep.  Time to switch to a generic version.  The code for
1685  // generic versions above can handle deep recursion properly.
1686  trace->Flush(compiler, this);
1687  return DONE;
1688}
1689
1690
1691int ActionNode::EatsAtLeast(int still_to_find,
1692                            int recursion_depth,
1693                            bool not_at_start) {
1694  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1695  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
1696  return on_success()->EatsAtLeast(still_to_find,
1697                                   recursion_depth + 1,
1698                                   not_at_start);
1699}
1700
1701
1702int AssertionNode::EatsAtLeast(int still_to_find,
1703                               int recursion_depth,
1704                               bool not_at_start) {
1705  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1706  // If we know we are not at the start and we are asked "how many characters
1707  // will you match if you succeed?" then we can answer anything since false
1708  // implies false.  So lets just return the max answer (still_to_find) since
1709  // that won't prevent us from preloading a lot of characters for the other
1710  // branches in the node graph.
1711  if (type() == AT_START && not_at_start) return still_to_find;
1712  return on_success()->EatsAtLeast(still_to_find,
1713                                   recursion_depth + 1,
1714                                   not_at_start);
1715}
1716
1717
1718int BackReferenceNode::EatsAtLeast(int still_to_find,
1719                                   int recursion_depth,
1720                                   bool not_at_start) {
1721  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1722  return on_success()->EatsAtLeast(still_to_find,
1723                                   recursion_depth + 1,
1724                                   not_at_start);
1725}
1726
1727
1728int TextNode::EatsAtLeast(int still_to_find,
1729                          int recursion_depth,
1730                          bool not_at_start) {
1731  int answer = Length();
1732  if (answer >= still_to_find) return answer;
1733  if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1734  // We are not at start after this node so we set the last argument to 'true'.
1735  return answer + on_success()->EatsAtLeast(still_to_find - answer,
1736                                            recursion_depth + 1,
1737                                            true);
1738}
1739
1740
1741int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
1742                                             int recursion_depth,
1743                                             bool not_at_start) {
1744  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1745  // Alternative 0 is the negative lookahead, alternative 1 is what comes
1746  // afterwards.
1747  RegExpNode* node = alternatives_->at(1).node();
1748  return node->EatsAtLeast(still_to_find, recursion_depth + 1, not_at_start);
1749}
1750
1751
1752void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1753    QuickCheckDetails* details,
1754    RegExpCompiler* compiler,
1755    int filled_in,
1756    bool not_at_start) {
1757  // Alternative 0 is the negative lookahead, alternative 1 is what comes
1758  // afterwards.
1759  RegExpNode* node = alternatives_->at(1).node();
1760  return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1761}
1762
1763
1764int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1765                                  int recursion_depth,
1766                                  RegExpNode* ignore_this_node,
1767                                  bool not_at_start) {
1768  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1769  int min = 100;
1770  int choice_count = alternatives_->length();
1771  for (int i = 0; i < choice_count; i++) {
1772    RegExpNode* node = alternatives_->at(i).node();
1773    if (node == ignore_this_node) continue;
1774    int node_eats_at_least = node->EatsAtLeast(still_to_find,
1775                                               recursion_depth + 1,
1776                                               not_at_start);
1777    if (node_eats_at_least < min) min = node_eats_at_least;
1778  }
1779  return min;
1780}
1781
1782
1783int LoopChoiceNode::EatsAtLeast(int still_to_find,
1784                                int recursion_depth,
1785                                bool not_at_start) {
1786  return EatsAtLeastHelper(still_to_find,
1787                           recursion_depth,
1788                           loop_node_,
1789                           not_at_start);
1790}
1791
1792
1793int ChoiceNode::EatsAtLeast(int still_to_find,
1794                            int recursion_depth,
1795                            bool not_at_start) {
1796  return EatsAtLeastHelper(still_to_find,
1797                           recursion_depth,
1798                           NULL,
1799                           not_at_start);
1800}
1801
1802
1803// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1804static inline uint32_t SmearBitsRight(uint32_t v) {
1805  v |= v >> 1;
1806  v |= v >> 2;
1807  v |= v >> 4;
1808  v |= v >> 8;
1809  v |= v >> 16;
1810  return v;
1811}
1812
1813
1814bool QuickCheckDetails::Rationalize(bool asc) {
1815  bool found_useful_op = false;
1816  uint32_t char_mask;
1817  if (asc) {
1818    char_mask = String::kMaxAsciiCharCode;
1819  } else {
1820    char_mask = String::kMaxUC16CharCode;
1821  }
1822  mask_ = 0;
1823  value_ = 0;
1824  int char_shift = 0;
1825  for (int i = 0; i < characters_; i++) {
1826    Position* pos = &positions_[i];
1827    if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1828      found_useful_op = true;
1829    }
1830    mask_ |= (pos->mask & char_mask) << char_shift;
1831    value_ |= (pos->value & char_mask) << char_shift;
1832    char_shift += asc ? 8 : 16;
1833  }
1834  return found_useful_op;
1835}
1836
1837
1838bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1839                                Trace* trace,
1840                                bool preload_has_checked_bounds,
1841                                Label* on_possible_success,
1842                                QuickCheckDetails* details,
1843                                bool fall_through_on_failure) {
1844  if (details->characters() == 0) return false;
1845  GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1846  if (details->cannot_match()) return false;
1847  if (!details->Rationalize(compiler->ascii())) return false;
1848  ASSERT(details->characters() == 1 ||
1849         compiler->macro_assembler()->CanReadUnaligned());
1850  uint32_t mask = details->mask();
1851  uint32_t value = details->value();
1852
1853  RegExpMacroAssembler* assembler = compiler->macro_assembler();
1854
1855  if (trace->characters_preloaded() != details->characters()) {
1856    assembler->LoadCurrentCharacter(trace->cp_offset(),
1857                                    trace->backtrack(),
1858                                    !preload_has_checked_bounds,
1859                                    details->characters());
1860  }
1861
1862
1863  bool need_mask = true;
1864
1865  if (details->characters() == 1) {
1866    // If number of characters preloaded is 1 then we used a byte or 16 bit
1867    // load so the value is already masked down.
1868    uint32_t char_mask;
1869    if (compiler->ascii()) {
1870      char_mask = String::kMaxAsciiCharCode;
1871    } else {
1872      char_mask = String::kMaxUC16CharCode;
1873    }
1874    if ((mask & char_mask) == char_mask) need_mask = false;
1875    mask &= char_mask;
1876  } else {
1877    // For 2-character preloads in ASCII mode or 1-character preloads in
1878    // TWO_BYTE mode we also use a 16 bit load with zero extend.
1879    if (details->characters() == 2 && compiler->ascii()) {
1880      if ((mask & 0x7f7f) == 0x7f7f) need_mask = false;
1881    } else if (details->characters() == 1 && !compiler->ascii()) {
1882      if ((mask & 0xffff) == 0xffff) need_mask = false;
1883    } else {
1884      if (mask == 0xffffffff) need_mask = false;
1885    }
1886  }
1887
1888  if (fall_through_on_failure) {
1889    if (need_mask) {
1890      assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1891    } else {
1892      assembler->CheckCharacter(value, on_possible_success);
1893    }
1894  } else {
1895    if (need_mask) {
1896      assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1897    } else {
1898      assembler->CheckNotCharacter(value, trace->backtrack());
1899    }
1900  }
1901  return true;
1902}
1903
1904
1905// Here is the meat of GetQuickCheckDetails (see also the comment on the
1906// super-class in the .h file).
1907//
1908// We iterate along the text object, building up for each character a
1909// mask and value that can be used to test for a quick failure to match.
1910// The masks and values for the positions will be combined into a single
1911// machine word for the current character width in order to be used in
1912// generating a quick check.
1913void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1914                                    RegExpCompiler* compiler,
1915                                    int characters_filled_in,
1916                                    bool not_at_start) {
1917  Isolate* isolate = Isolate::Current();
1918  ASSERT(characters_filled_in < details->characters());
1919  int characters = details->characters();
1920  int char_mask;
1921  int char_shift;
1922  if (compiler->ascii()) {
1923    char_mask = String::kMaxAsciiCharCode;
1924    char_shift = 8;
1925  } else {
1926    char_mask = String::kMaxUC16CharCode;
1927    char_shift = 16;
1928  }
1929  for (int k = 0; k < elms_->length(); k++) {
1930    TextElement elm = elms_->at(k);
1931    if (elm.type == TextElement::ATOM) {
1932      Vector<const uc16> quarks = elm.data.u_atom->data();
1933      for (int i = 0; i < characters && i < quarks.length(); i++) {
1934        QuickCheckDetails::Position* pos =
1935            details->positions(characters_filled_in);
1936        uc16 c = quarks[i];
1937        if (c > char_mask) {
1938          // If we expect a non-ASCII character from an ASCII string,
1939          // there is no way we can match. Not even case independent
1940          // matching can turn an ASCII character into non-ASCII or
1941          // vice versa.
1942          details->set_cannot_match();
1943          pos->determines_perfectly = false;
1944          return;
1945        }
1946        if (compiler->ignore_case()) {
1947          unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1948          int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
1949                                                 chars);
1950          ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
1951          if (length == 1) {
1952            // This letter has no case equivalents, so it's nice and simple
1953            // and the mask-compare will determine definitely whether we have
1954            // a match at this character position.
1955            pos->mask = char_mask;
1956            pos->value = c;
1957            pos->determines_perfectly = true;
1958          } else {
1959            uint32_t common_bits = char_mask;
1960            uint32_t bits = chars[0];
1961            for (int j = 1; j < length; j++) {
1962              uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1963              common_bits ^= differing_bits;
1964              bits &= common_bits;
1965            }
1966            // If length is 2 and common bits has only one zero in it then
1967            // our mask and compare instruction will determine definitely
1968            // whether we have a match at this character position.  Otherwise
1969            // it can only be an approximate check.
1970            uint32_t one_zero = (common_bits | ~char_mask);
1971            if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1972              pos->determines_perfectly = true;
1973            }
1974            pos->mask = common_bits;
1975            pos->value = bits;
1976          }
1977        } else {
1978          // Don't ignore case.  Nice simple case where the mask-compare will
1979          // determine definitely whether we have a match at this character
1980          // position.
1981          pos->mask = char_mask;
1982          pos->value = c;
1983          pos->determines_perfectly = true;
1984        }
1985        characters_filled_in++;
1986        ASSERT(characters_filled_in <= details->characters());
1987        if (characters_filled_in == details->characters()) {
1988          return;
1989        }
1990      }
1991    } else {
1992      QuickCheckDetails::Position* pos =
1993          details->positions(characters_filled_in);
1994      RegExpCharacterClass* tree = elm.data.u_char_class;
1995      ZoneList<CharacterRange>* ranges = tree->ranges();
1996      if (tree->is_negated()) {
1997        // A quick check uses multi-character mask and compare.  There is no
1998        // useful way to incorporate a negative char class into this scheme
1999        // so we just conservatively create a mask and value that will always
2000        // succeed.
2001        pos->mask = 0;
2002        pos->value = 0;
2003      } else {
2004        int first_range = 0;
2005        while (ranges->at(first_range).from() > char_mask) {
2006          first_range++;
2007          if (first_range == ranges->length()) {
2008            details->set_cannot_match();
2009            pos->determines_perfectly = false;
2010            return;
2011          }
2012        }
2013        CharacterRange range = ranges->at(first_range);
2014        uc16 from = range.from();
2015        uc16 to = range.to();
2016        if (to > char_mask) {
2017          to = char_mask;
2018        }
2019        uint32_t differing_bits = (from ^ to);
2020        // A mask and compare is only perfect if the differing bits form a
2021        // number like 00011111 with one single block of trailing 1s.
2022        if ((differing_bits & (differing_bits + 1)) == 0 &&
2023             from + differing_bits == to) {
2024          pos->determines_perfectly = true;
2025        }
2026        uint32_t common_bits = ~SmearBitsRight(differing_bits);
2027        uint32_t bits = (from & common_bits);
2028        for (int i = first_range + 1; i < ranges->length(); i++) {
2029          CharacterRange range = ranges->at(i);
2030          uc16 from = range.from();
2031          uc16 to = range.to();
2032          if (from > char_mask) continue;
2033          if (to > char_mask) to = char_mask;
2034          // Here we are combining more ranges into the mask and compare
2035          // value.  With each new range the mask becomes more sparse and
2036          // so the chances of a false positive rise.  A character class
2037          // with multiple ranges is assumed never to be equivalent to a
2038          // mask and compare operation.
2039          pos->determines_perfectly = false;
2040          uint32_t new_common_bits = (from ^ to);
2041          new_common_bits = ~SmearBitsRight(new_common_bits);
2042          common_bits &= new_common_bits;
2043          bits &= new_common_bits;
2044          uint32_t differing_bits = (from & common_bits) ^ bits;
2045          common_bits ^= differing_bits;
2046          bits &= common_bits;
2047        }
2048        pos->mask = common_bits;
2049        pos->value = bits;
2050      }
2051      characters_filled_in++;
2052      ASSERT(characters_filled_in <= details->characters());
2053      if (characters_filled_in == details->characters()) {
2054        return;
2055      }
2056    }
2057  }
2058  ASSERT(characters_filled_in != details->characters());
2059  on_success()-> GetQuickCheckDetails(details,
2060                                      compiler,
2061                                      characters_filled_in,
2062                                      true);
2063}
2064
2065
2066void QuickCheckDetails::Clear() {
2067  for (int i = 0; i < characters_; i++) {
2068    positions_[i].mask = 0;
2069    positions_[i].value = 0;
2070    positions_[i].determines_perfectly = false;
2071  }
2072  characters_ = 0;
2073}
2074
2075
2076void QuickCheckDetails::Advance(int by, bool ascii) {
2077  ASSERT(by >= 0);
2078  if (by >= characters_) {
2079    Clear();
2080    return;
2081  }
2082  for (int i = 0; i < characters_ - by; i++) {
2083    positions_[i] = positions_[by + i];
2084  }
2085  for (int i = characters_ - by; i < characters_; i++) {
2086    positions_[i].mask = 0;
2087    positions_[i].value = 0;
2088    positions_[i].determines_perfectly = false;
2089  }
2090  characters_ -= by;
2091  // We could change mask_ and value_ here but we would never advance unless
2092  // they had already been used in a check and they won't be used again because
2093  // it would gain us nothing.  So there's no point.
2094}
2095
2096
2097void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2098  ASSERT(characters_ == other->characters_);
2099  if (other->cannot_match_) {
2100    return;
2101  }
2102  if (cannot_match_) {
2103    *this = *other;
2104    return;
2105  }
2106  for (int i = from_index; i < characters_; i++) {
2107    QuickCheckDetails::Position* pos = positions(i);
2108    QuickCheckDetails::Position* other_pos = other->positions(i);
2109    if (pos->mask != other_pos->mask ||
2110        pos->value != other_pos->value ||
2111        !other_pos->determines_perfectly) {
2112      // Our mask-compare operation will be approximate unless we have the
2113      // exact same operation on both sides of the alternation.
2114      pos->determines_perfectly = false;
2115    }
2116    pos->mask &= other_pos->mask;
2117    pos->value &= pos->mask;
2118    other_pos->value &= pos->mask;
2119    uc16 differing_bits = (pos->value ^ other_pos->value);
2120    pos->mask &= ~differing_bits;
2121    pos->value &= pos->mask;
2122  }
2123}
2124
2125
2126class VisitMarker {
2127 public:
2128  explicit VisitMarker(NodeInfo* info) : info_(info) {
2129    ASSERT(!info->visited);
2130    info->visited = true;
2131  }
2132  ~VisitMarker() {
2133    info_->visited = false;
2134  }
2135 private:
2136  NodeInfo* info_;
2137};
2138
2139
2140void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2141                                          RegExpCompiler* compiler,
2142                                          int characters_filled_in,
2143                                          bool not_at_start) {
2144  if (body_can_be_zero_length_ || info()->visited) return;
2145  VisitMarker marker(info());
2146  return ChoiceNode::GetQuickCheckDetails(details,
2147                                          compiler,
2148                                          characters_filled_in,
2149                                          not_at_start);
2150}
2151
2152
2153void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2154                                      RegExpCompiler* compiler,
2155                                      int characters_filled_in,
2156                                      bool not_at_start) {
2157  not_at_start = (not_at_start || not_at_start_);
2158  int choice_count = alternatives_->length();
2159  ASSERT(choice_count > 0);
2160  alternatives_->at(0).node()->GetQuickCheckDetails(details,
2161                                                    compiler,
2162                                                    characters_filled_in,
2163                                                    not_at_start);
2164  for (int i = 1; i < choice_count; i++) {
2165    QuickCheckDetails new_details(details->characters());
2166    RegExpNode* node = alternatives_->at(i).node();
2167    node->GetQuickCheckDetails(&new_details, compiler,
2168                               characters_filled_in,
2169                               not_at_start);
2170    // Here we merge the quick match details of the two branches.
2171    details->Merge(&new_details, characters_filled_in);
2172  }
2173}
2174
2175
2176// Check for [0-9A-Z_a-z].
2177static void EmitWordCheck(RegExpMacroAssembler* assembler,
2178                          Label* word,
2179                          Label* non_word,
2180                          bool fall_through_on_word) {
2181  if (assembler->CheckSpecialCharacterClass(
2182          fall_through_on_word ? 'w' : 'W',
2183          fall_through_on_word ? non_word : word)) {
2184    // Optimized implementation available.
2185    return;
2186  }
2187  assembler->CheckCharacterGT('z', non_word);
2188  assembler->CheckCharacterLT('0', non_word);
2189  assembler->CheckCharacterGT('a' - 1, word);
2190  assembler->CheckCharacterLT('9' + 1, word);
2191  assembler->CheckCharacterLT('A', non_word);
2192  assembler->CheckCharacterLT('Z' + 1, word);
2193  if (fall_through_on_word) {
2194    assembler->CheckNotCharacter('_', non_word);
2195  } else {
2196    assembler->CheckCharacter('_', word);
2197  }
2198}
2199
2200
2201// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2202// that matches newline or the start of input).
2203static void EmitHat(RegExpCompiler* compiler,
2204                    RegExpNode* on_success,
2205                    Trace* trace) {
2206  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2207  // We will be loading the previous character into the current character
2208  // register.
2209  Trace new_trace(*trace);
2210  new_trace.InvalidateCurrentCharacter();
2211
2212  Label ok;
2213  if (new_trace.cp_offset() == 0) {
2214    // The start of input counts as a newline in this context, so skip to
2215    // ok if we are at the start.
2216    assembler->CheckAtStart(&ok);
2217  }
2218  // We already checked that we are not at the start of input so it must be
2219  // OK to load the previous character.
2220  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2221                                  new_trace.backtrack(),
2222                                  false);
2223  if (!assembler->CheckSpecialCharacterClass('n',
2224                                             new_trace.backtrack())) {
2225    // Newline means \n, \r, 0x2028 or 0x2029.
2226    if (!compiler->ascii()) {
2227      assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2228    }
2229    assembler->CheckCharacter('\n', &ok);
2230    assembler->CheckNotCharacter('\r', new_trace.backtrack());
2231  }
2232  assembler->Bind(&ok);
2233  on_success->Emit(compiler, &new_trace);
2234}
2235
2236
2237// Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2238// when we know whether the next character must be a word character or not.
2239static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2240                                  RegExpCompiler* compiler,
2241                                  RegExpNode* on_success,
2242                                  Trace* trace) {
2243  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2244  Label done;
2245
2246  Trace new_trace(*trace);
2247
2248  bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2249  Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2250  Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2251
2252  // Check whether previous character was a word character.
2253  switch (trace->at_start()) {
2254    case Trace::TRUE:
2255      if (expect_word_character) {
2256        assembler->GoTo(on_non_word);
2257      }
2258      break;
2259    case Trace::UNKNOWN:
2260      ASSERT_EQ(0, trace->cp_offset());
2261      assembler->CheckAtStart(on_non_word);
2262      // Fall through.
2263    case Trace::FALSE:
2264      int prev_char_offset = trace->cp_offset() - 1;
2265      assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2266      EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2267      // We may or may not have loaded the previous character.
2268      new_trace.InvalidateCurrentCharacter();
2269  }
2270
2271  assembler->Bind(&done);
2272
2273  on_success->Emit(compiler, &new_trace);
2274}
2275
2276
2277// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2278static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2279                              RegExpCompiler* compiler,
2280                              RegExpNode* on_success,
2281                              Trace* trace) {
2282  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2283  Label before_non_word;
2284  Label before_word;
2285  if (trace->characters_preloaded() != 1) {
2286    assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2287  }
2288  // Fall through on non-word.
2289  EmitWordCheck(assembler, &before_word, &before_non_word, false);
2290
2291  // We will be loading the previous character into the current character
2292  // register.
2293  Trace new_trace(*trace);
2294  new_trace.InvalidateCurrentCharacter();
2295
2296  Label ok;
2297  Label* boundary;
2298  Label* not_boundary;
2299  if (type == AssertionNode::AT_BOUNDARY) {
2300    boundary = &ok;
2301    not_boundary = new_trace.backtrack();
2302  } else {
2303    not_boundary = &ok;
2304    boundary = new_trace.backtrack();
2305  }
2306
2307  // Next character is not a word character.
2308  assembler->Bind(&before_non_word);
2309  if (new_trace.cp_offset() == 0) {
2310    // The start of input counts as a non-word character, so the question is
2311    // decided if we are at the start.
2312    assembler->CheckAtStart(not_boundary);
2313  }
2314  // We already checked that we are not at the start of input so it must be
2315  // OK to load the previous character.
2316  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2317                                  &ok,  // Unused dummy label in this call.
2318                                  false);
2319  // Fall through on non-word.
2320  EmitWordCheck(assembler, boundary, not_boundary, false);
2321  assembler->GoTo(not_boundary);
2322
2323  // Next character is a word character.
2324  assembler->Bind(&before_word);
2325  if (new_trace.cp_offset() == 0) {
2326    // The start of input counts as a non-word character, so the question is
2327    // decided if we are at the start.
2328    assembler->CheckAtStart(boundary);
2329  }
2330  // We already checked that we are not at the start of input so it must be
2331  // OK to load the previous character.
2332  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2333                                  &ok,  // Unused dummy label in this call.
2334                                  false);
2335  bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2336  EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2337
2338  assembler->Bind(&ok);
2339
2340  on_success->Emit(compiler, &new_trace);
2341}
2342
2343
2344void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2345                                         RegExpCompiler* compiler,
2346                                         int filled_in,
2347                                         bool not_at_start) {
2348  if (type_ == AT_START && not_at_start) {
2349    details->set_cannot_match();
2350    return;
2351  }
2352  return on_success()->GetQuickCheckDetails(details,
2353                                            compiler,
2354                                            filled_in,
2355                                            not_at_start);
2356}
2357
2358
2359void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2360  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2361  switch (type_) {
2362    case AT_END: {
2363      Label ok;
2364      assembler->CheckPosition(trace->cp_offset(), &ok);
2365      assembler->GoTo(trace->backtrack());
2366      assembler->Bind(&ok);
2367      break;
2368    }
2369    case AT_START: {
2370      if (trace->at_start() == Trace::FALSE) {
2371        assembler->GoTo(trace->backtrack());
2372        return;
2373      }
2374      if (trace->at_start() == Trace::UNKNOWN) {
2375        assembler->CheckNotAtStart(trace->backtrack());
2376        Trace at_start_trace = *trace;
2377        at_start_trace.set_at_start(true);
2378        on_success()->Emit(compiler, &at_start_trace);
2379        return;
2380      }
2381    }
2382    break;
2383    case AFTER_NEWLINE:
2384      EmitHat(compiler, on_success(), trace);
2385      return;
2386    case AT_BOUNDARY:
2387    case AT_NON_BOUNDARY: {
2388      EmitBoundaryCheck(type_, compiler, on_success(), trace);
2389      return;
2390    }
2391    case AFTER_WORD_CHARACTER:
2392    case AFTER_NONWORD_CHARACTER: {
2393      EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2394    }
2395  }
2396  on_success()->Emit(compiler, trace);
2397}
2398
2399
2400static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2401  if (quick_check == NULL) return false;
2402  if (offset >= quick_check->characters()) return false;
2403  return quick_check->positions(offset)->determines_perfectly;
2404}
2405
2406
2407static void UpdateBoundsCheck(int index, int* checked_up_to) {
2408  if (index > *checked_up_to) {
2409    *checked_up_to = index;
2410  }
2411}
2412
2413
2414// We call this repeatedly to generate code for each pass over the text node.
2415// The passes are in increasing order of difficulty because we hope one
2416// of the first passes will fail in which case we are saved the work of the
2417// later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2418// we will check the '%' in the first pass, the case independent 'a' in the
2419// second pass and the character class in the last pass.
2420//
2421// The passes are done from right to left, so for example to test for /bar/
2422// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2423// and then a 'b' with offset 0.  This means we can avoid the end-of-input
2424// bounds check most of the time.  In the example we only need to check for
2425// end-of-input when loading the putative 'r'.
2426//
2427// A slight complication involves the fact that the first character may already
2428// be fetched into a register by the previous node.  In this case we want to
2429// do the test for that character first.  We do this in separate passes.  The
2430// 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2431// pass has been performed then subsequent passes will have true in
2432// first_element_checked to indicate that that character does not need to be
2433// checked again.
2434//
2435// In addition to all this we are passed a Trace, which can
2436// contain an AlternativeGeneration object.  In this AlternativeGeneration
2437// object we can see details of any quick check that was already passed in
2438// order to get to the code we are now generating.  The quick check can involve
2439// loading characters, which means we do not need to recheck the bounds
2440// up to the limit the quick check already checked.  In addition the quick
2441// check can have involved a mask and compare operation which may simplify
2442// or obviate the need for further checks at some character positions.
2443void TextNode::TextEmitPass(RegExpCompiler* compiler,
2444                            TextEmitPassType pass,
2445                            bool preloaded,
2446                            Trace* trace,
2447                            bool first_element_checked,
2448                            int* checked_up_to) {
2449  Isolate* isolate = Isolate::Current();
2450  RegExpMacroAssembler* assembler = compiler->macro_assembler();
2451  bool ascii = compiler->ascii();
2452  Label* backtrack = trace->backtrack();
2453  QuickCheckDetails* quick_check = trace->quick_check_performed();
2454  int element_count = elms_->length();
2455  for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2456    TextElement elm = elms_->at(i);
2457    int cp_offset = trace->cp_offset() + elm.cp_offset;
2458    if (elm.type == TextElement::ATOM) {
2459      Vector<const uc16> quarks = elm.data.u_atom->data();
2460      for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2461        if (first_element_checked && i == 0 && j == 0) continue;
2462        if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2463        EmitCharacterFunction* emit_function = NULL;
2464        switch (pass) {
2465          case NON_ASCII_MATCH:
2466            ASSERT(ascii);
2467            if (quarks[j] > String::kMaxAsciiCharCode) {
2468              assembler->GoTo(backtrack);
2469              return;
2470            }
2471            break;
2472          case NON_LETTER_CHARACTER_MATCH:
2473            emit_function = &EmitAtomNonLetter;
2474            break;
2475          case SIMPLE_CHARACTER_MATCH:
2476            emit_function = &EmitSimpleCharacter;
2477            break;
2478          case CASE_CHARACTER_MATCH:
2479            emit_function = &EmitAtomLetter;
2480            break;
2481          default:
2482            break;
2483        }
2484        if (emit_function != NULL) {
2485          bool bound_checked = emit_function(isolate,
2486                                             compiler,
2487                                             quarks[j],
2488                                             backtrack,
2489                                             cp_offset + j,
2490                                             *checked_up_to < cp_offset + j,
2491                                             preloaded);
2492          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2493        }
2494      }
2495    } else {
2496      ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2497      if (pass == CHARACTER_CLASS_MATCH) {
2498        if (first_element_checked && i == 0) continue;
2499        if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2500        RegExpCharacterClass* cc = elm.data.u_char_class;
2501        EmitCharClass(assembler,
2502                      cc,
2503                      ascii,
2504                      backtrack,
2505                      cp_offset,
2506                      *checked_up_to < cp_offset,
2507                      preloaded);
2508        UpdateBoundsCheck(cp_offset, checked_up_to);
2509      }
2510    }
2511  }
2512}
2513
2514
2515int TextNode::Length() {
2516  TextElement elm = elms_->last();
2517  ASSERT(elm.cp_offset >= 0);
2518  if (elm.type == TextElement::ATOM) {
2519    return elm.cp_offset + elm.data.u_atom->data().length();
2520  } else {
2521    return elm.cp_offset + 1;
2522  }
2523}
2524
2525
2526bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2527  TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2528  if (ignore_case) {
2529    return pass == SIMPLE_CHARACTER_MATCH;
2530  } else {
2531    return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2532  }
2533}
2534
2535
2536// This generates the code to match a text node.  A text node can contain
2537// straight character sequences (possibly to be matched in a case-independent
2538// way) and character classes.  For efficiency we do not do this in a single
2539// pass from left to right.  Instead we pass over the text node several times,
2540// emitting code for some character positions every time.  See the comment on
2541// TextEmitPass for details.
2542void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2543  LimitResult limit_result = LimitVersions(compiler, trace);
2544  if (limit_result == DONE) return;
2545  ASSERT(limit_result == CONTINUE);
2546
2547  if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2548    compiler->SetRegExpTooBig();
2549    return;
2550  }
2551
2552  if (compiler->ascii()) {
2553    int dummy = 0;
2554    TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2555  }
2556
2557  bool first_elt_done = false;
2558  int bound_checked_to = trace->cp_offset() - 1;
2559  bound_checked_to += trace->bound_checked_up_to();
2560
2561  // If a character is preloaded into the current character register then
2562  // check that now.
2563  if (trace->characters_preloaded() == 1) {
2564    for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2565      if (!SkipPass(pass, compiler->ignore_case())) {
2566        TextEmitPass(compiler,
2567                     static_cast<TextEmitPassType>(pass),
2568                     true,
2569                     trace,
2570                     false,
2571                     &bound_checked_to);
2572      }
2573    }
2574    first_elt_done = true;
2575  }
2576
2577  for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2578    if (!SkipPass(pass, compiler->ignore_case())) {
2579      TextEmitPass(compiler,
2580                   static_cast<TextEmitPassType>(pass),
2581                   false,
2582                   trace,
2583                   first_elt_done,
2584                   &bound_checked_to);
2585    }
2586  }
2587
2588  Trace successor_trace(*trace);
2589  successor_trace.set_at_start(false);
2590  successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2591  RecursionCheck rc(compiler);
2592  on_success()->Emit(compiler, &successor_trace);
2593}
2594
2595
2596void Trace::InvalidateCurrentCharacter() {
2597  characters_preloaded_ = 0;
2598}
2599
2600
2601void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2602  ASSERT(by > 0);
2603  // We don't have an instruction for shifting the current character register
2604  // down or for using a shifted value for anything so lets just forget that
2605  // we preloaded any characters into it.
2606  characters_preloaded_ = 0;
2607  // Adjust the offsets of the quick check performed information.  This
2608  // information is used to find out what we already determined about the
2609  // characters by means of mask and compare.
2610  quick_check_performed_.Advance(by, compiler->ascii());
2611  cp_offset_ += by;
2612  if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2613    compiler->SetRegExpTooBig();
2614    cp_offset_ = 0;
2615  }
2616  bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2617}
2618
2619
2620void TextNode::MakeCaseIndependent(bool is_ascii) {
2621  int element_count = elms_->length();
2622  for (int i = 0; i < element_count; i++) {
2623    TextElement elm = elms_->at(i);
2624    if (elm.type == TextElement::CHAR_CLASS) {
2625      RegExpCharacterClass* cc = elm.data.u_char_class;
2626      // None of the standard character classses is different in the case
2627      // independent case and it slows us down if we don't know that.
2628      if (cc->is_standard()) continue;
2629      ZoneList<CharacterRange>* ranges = cc->ranges();
2630      int range_count = ranges->length();
2631      for (int j = 0; j < range_count; j++) {
2632        ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
2633      }
2634    }
2635  }
2636}
2637
2638
2639int TextNode::GreedyLoopTextLength() {
2640  TextElement elm = elms_->at(elms_->length() - 1);
2641  if (elm.type == TextElement::CHAR_CLASS) {
2642    return elm.cp_offset + 1;
2643  } else {
2644    return elm.cp_offset + elm.data.u_atom->data().length();
2645  }
2646}
2647
2648
2649// Finds the fixed match length of a sequence of nodes that goes from
2650// this alternative and back to this choice node.  If there are variable
2651// length nodes or other complications in the way then return a sentinel
2652// value indicating that a greedy loop cannot be constructed.
2653int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2654  int length = 0;
2655  RegExpNode* node = alternative->node();
2656  // Later we will generate code for all these text nodes using recursion
2657  // so we have to limit the max number.
2658  int recursion_depth = 0;
2659  while (node != this) {
2660    if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2661      return kNodeIsTooComplexForGreedyLoops;
2662    }
2663    int node_length = node->GreedyLoopTextLength();
2664    if (node_length == kNodeIsTooComplexForGreedyLoops) {
2665      return kNodeIsTooComplexForGreedyLoops;
2666    }
2667    length += node_length;
2668    SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2669    node = seq_node->on_success();
2670  }
2671  return length;
2672}
2673
2674
2675void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2676  ASSERT_EQ(loop_node_, NULL);
2677  AddAlternative(alt);
2678  loop_node_ = alt.node();
2679}
2680
2681
2682void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2683  ASSERT_EQ(continue_node_, NULL);
2684  AddAlternative(alt);
2685  continue_node_ = alt.node();
2686}
2687
2688
2689void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2690  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2691  if (trace->stop_node() == this) {
2692    int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2693    ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2694    // Update the counter-based backtracking info on the stack.  This is an
2695    // optimization for greedy loops (see below).
2696    ASSERT(trace->cp_offset() == text_length);
2697    macro_assembler->AdvanceCurrentPosition(text_length);
2698    macro_assembler->GoTo(trace->loop_label());
2699    return;
2700  }
2701  ASSERT(trace->stop_node() == NULL);
2702  if (!trace->is_trivial()) {
2703    trace->Flush(compiler, this);
2704    return;
2705  }
2706  ChoiceNode::Emit(compiler, trace);
2707}
2708
2709
2710int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2711                                           bool not_at_start) {
2712  int preload_characters = EatsAtLeast(4, 0, not_at_start);
2713  if (compiler->macro_assembler()->CanReadUnaligned()) {
2714    bool ascii = compiler->ascii();
2715    if (ascii) {
2716      if (preload_characters > 4) preload_characters = 4;
2717      // We can't preload 3 characters because there is no machine instruction
2718      // to do that.  We can't just load 4 because we could be reading
2719      // beyond the end of the string, which could cause a memory fault.
2720      if (preload_characters == 3) preload_characters = 2;
2721    } else {
2722      if (preload_characters > 2) preload_characters = 2;
2723    }
2724  } else {
2725    if (preload_characters > 1) preload_characters = 1;
2726  }
2727  return preload_characters;
2728}
2729
2730
2731// This class is used when generating the alternatives in a choice node.  It
2732// records the way the alternative is being code generated.
2733class AlternativeGeneration: public Malloced {
2734 public:
2735  AlternativeGeneration()
2736      : possible_success(),
2737        expects_preload(false),
2738        after(),
2739        quick_check_details() { }
2740  Label possible_success;
2741  bool expects_preload;
2742  Label after;
2743  QuickCheckDetails quick_check_details;
2744};
2745
2746
2747// Creates a list of AlternativeGenerations.  If the list has a reasonable
2748// size then it is on the stack, otherwise the excess is on the heap.
2749class AlternativeGenerationList {
2750 public:
2751  explicit AlternativeGenerationList(int count)
2752      : alt_gens_(count) {
2753    for (int i = 0; i < count && i < kAFew; i++) {
2754      alt_gens_.Add(a_few_alt_gens_ + i);
2755    }
2756    for (int i = kAFew; i < count; i++) {
2757      alt_gens_.Add(new AlternativeGeneration());
2758    }
2759  }
2760  ~AlternativeGenerationList() {
2761    for (int i = kAFew; i < alt_gens_.length(); i++) {
2762      delete alt_gens_[i];
2763      alt_gens_[i] = NULL;
2764    }
2765  }
2766
2767  AlternativeGeneration* at(int i) {
2768    return alt_gens_[i];
2769  }
2770 private:
2771  static const int kAFew = 10;
2772  ZoneList<AlternativeGeneration*> alt_gens_;
2773  AlternativeGeneration a_few_alt_gens_[kAFew];
2774};
2775
2776
2777/* Code generation for choice nodes.
2778 *
2779 * We generate quick checks that do a mask and compare to eliminate a
2780 * choice.  If the quick check succeeds then it jumps to the continuation to
2781 * do slow checks and check subsequent nodes.  If it fails (the common case)
2782 * it falls through to the next choice.
2783 *
2784 * Here is the desired flow graph.  Nodes directly below each other imply
2785 * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2786 * 3 doesn't have a quick check so we have to call the slow check.
2787 * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2788 * regexp continuation is generated directly after the Sn node, up to the
2789 * next GoTo if we decide to reuse some already generated code.  Some
2790 * nodes expect preload_characters to be preloaded into the current
2791 * character register.  R nodes do this preloading.  Vertices are marked
2792 * F for failures and S for success (possible success in the case of quick
2793 * nodes).  L, V, < and > are used as arrow heads.
2794 *
2795 * ----------> R
2796 *             |
2797 *             V
2798 *            Q1 -----> S1
2799 *             |   S   /
2800 *            F|      /
2801 *             |    F/
2802 *             |    /
2803 *             |   R
2804 *             |  /
2805 *             V L
2806 *            Q2 -----> S2
2807 *             |   S   /
2808 *            F|      /
2809 *             |    F/
2810 *             |    /
2811 *             |   R
2812 *             |  /
2813 *             V L
2814 *            S3
2815 *             |
2816 *            F|
2817 *             |
2818 *             R
2819 *             |
2820 * backtrack   V
2821 * <----------Q4
2822 *   \    F    |
2823 *    \        |S
2824 *     \   F   V
2825 *      \-----S4
2826 *
2827 * For greedy loops we reverse our expectation and expect to match rather
2828 * than fail. Therefore we want the loop code to look like this (U is the
2829 * unwind code that steps back in the greedy loop).  The following alternatives
2830 * look the same as above.
2831 *              _____
2832 *             /     \
2833 *             V     |
2834 * ----------> S1    |
2835 *            /|     |
2836 *           / |S    |
2837 *         F/  \_____/
2838 *         /
2839 *        |<-----------
2840 *        |            \
2841 *        V             \
2842 *        Q2 ---> S2     \
2843 *        |  S   /       |
2844 *       F|     /        |
2845 *        |   F/         |
2846 *        |   /          |
2847 *        |  R           |
2848 *        | /            |
2849 *   F    VL             |
2850 * <------U              |
2851 * back   |S             |
2852 *        \______________/
2853 */
2854
2855
2856void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2857  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2858  int choice_count = alternatives_->length();
2859#ifdef DEBUG
2860  for (int i = 0; i < choice_count - 1; i++) {
2861    GuardedAlternative alternative = alternatives_->at(i);
2862    ZoneList<Guard*>* guards = alternative.guards();
2863    int guard_count = (guards == NULL) ? 0 : guards->length();
2864    for (int j = 0; j < guard_count; j++) {
2865      ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
2866    }
2867  }
2868#endif
2869
2870  LimitResult limit_result = LimitVersions(compiler, trace);
2871  if (limit_result == DONE) return;
2872  ASSERT(limit_result == CONTINUE);
2873
2874  int new_flush_budget = trace->flush_budget() / choice_count;
2875  if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2876    trace->Flush(compiler, this);
2877    return;
2878  }
2879
2880  RecursionCheck rc(compiler);
2881
2882  Trace* current_trace = trace;
2883
2884  int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2885  bool greedy_loop = false;
2886  Label greedy_loop_label;
2887  Trace counter_backtrack_trace;
2888  counter_backtrack_trace.set_backtrack(&greedy_loop_label);
2889  if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2890
2891  if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2892    // Here we have special handling for greedy loops containing only text nodes
2893    // and other simple nodes.  These are handled by pushing the current
2894    // position on the stack and then incrementing the current position each
2895    // time around the switch.  On backtrack we decrement the current position
2896    // and check it against the pushed value.  This avoids pushing backtrack
2897    // information for each iteration of the loop, which could take up a lot of
2898    // space.
2899    greedy_loop = true;
2900    ASSERT(trace->stop_node() == NULL);
2901    macro_assembler->PushCurrentPosition();
2902    current_trace = &counter_backtrack_trace;
2903    Label greedy_match_failed;
2904    Trace greedy_match_trace;
2905    if (not_at_start()) greedy_match_trace.set_at_start(false);
2906    greedy_match_trace.set_backtrack(&greedy_match_failed);
2907    Label loop_label;
2908    macro_assembler->Bind(&loop_label);
2909    greedy_match_trace.set_stop_node(this);
2910    greedy_match_trace.set_loop_label(&loop_label);
2911    alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
2912    macro_assembler->Bind(&greedy_match_failed);
2913  }
2914
2915  Label second_choice;  // For use in greedy matches.
2916  macro_assembler->Bind(&second_choice);
2917
2918  int first_normal_choice = greedy_loop ? 1 : 0;
2919
2920  int preload_characters =
2921      CalculatePreloadCharacters(compiler,
2922                                 current_trace->at_start() == Trace::FALSE);
2923  bool preload_is_current =
2924      (current_trace->characters_preloaded() == preload_characters);
2925  bool preload_has_checked_bounds = preload_is_current;
2926
2927  AlternativeGenerationList alt_gens(choice_count);
2928
2929  // For now we just call all choices one after the other.  The idea ultimately
2930  // is to use the Dispatch table to try only the relevant ones.
2931  for (int i = first_normal_choice; i < choice_count; i++) {
2932    GuardedAlternative alternative = alternatives_->at(i);
2933    AlternativeGeneration* alt_gen = alt_gens.at(i);
2934    alt_gen->quick_check_details.set_characters(preload_characters);
2935    ZoneList<Guard*>* guards = alternative.guards();
2936    int guard_count = (guards == NULL) ? 0 : guards->length();
2937    Trace new_trace(*current_trace);
2938    new_trace.set_characters_preloaded(preload_is_current ?
2939                                         preload_characters :
2940                                         0);
2941    if (preload_has_checked_bounds) {
2942      new_trace.set_bound_checked_up_to(preload_characters);
2943    }
2944    new_trace.quick_check_performed()->Clear();
2945    if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
2946    alt_gen->expects_preload = preload_is_current;
2947    bool generate_full_check_inline = false;
2948    if (FLAG_regexp_optimization &&
2949        try_to_emit_quick_check_for_alternative(i) &&
2950        alternative.node()->EmitQuickCheck(compiler,
2951                                           &new_trace,
2952                                           preload_has_checked_bounds,
2953                                           &alt_gen->possible_success,
2954                                           &alt_gen->quick_check_details,
2955                                           i < choice_count - 1)) {
2956      // Quick check was generated for this choice.
2957      preload_is_current = true;
2958      preload_has_checked_bounds = true;
2959      // On the last choice in the ChoiceNode we generated the quick
2960      // check to fall through on possible success.  So now we need to
2961      // generate the full check inline.
2962      if (i == choice_count - 1) {
2963        macro_assembler->Bind(&alt_gen->possible_success);
2964        new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2965        new_trace.set_characters_preloaded(preload_characters);
2966        new_trace.set_bound_checked_up_to(preload_characters);
2967        generate_full_check_inline = true;
2968      }
2969    } else if (alt_gen->quick_check_details.cannot_match()) {
2970      if (i == choice_count - 1 && !greedy_loop) {
2971        macro_assembler->GoTo(trace->backtrack());
2972      }
2973      continue;
2974    } else {
2975      // No quick check was generated.  Put the full code here.
2976      // If this is not the first choice then there could be slow checks from
2977      // previous cases that go here when they fail.  There's no reason to
2978      // insist that they preload characters since the slow check we are about
2979      // to generate probably can't use it.
2980      if (i != first_normal_choice) {
2981        alt_gen->expects_preload = false;
2982        new_trace.InvalidateCurrentCharacter();
2983      }
2984      if (i < choice_count - 1) {
2985        new_trace.set_backtrack(&alt_gen->after);
2986      }
2987      generate_full_check_inline = true;
2988    }
2989    if (generate_full_check_inline) {
2990      if (new_trace.actions() != NULL) {
2991        new_trace.set_flush_budget(new_flush_budget);
2992      }
2993      for (int j = 0; j < guard_count; j++) {
2994        GenerateGuard(macro_assembler, guards->at(j), &new_trace);
2995      }
2996      alternative.node()->Emit(compiler, &new_trace);
2997      preload_is_current = false;
2998    }
2999    macro_assembler->Bind(&alt_gen->after);
3000  }
3001  if (greedy_loop) {
3002    macro_assembler->Bind(&greedy_loop_label);
3003    // If we have unwound to the bottom then backtrack.
3004    macro_assembler->CheckGreedyLoop(trace->backtrack());
3005    // Otherwise try the second priority at an earlier position.
3006    macro_assembler->AdvanceCurrentPosition(-text_length);
3007    macro_assembler->GoTo(&second_choice);
3008  }
3009
3010  // At this point we need to generate slow checks for the alternatives where
3011  // the quick check was inlined.  We can recognize these because the associated
3012  // label was bound.
3013  for (int i = first_normal_choice; i < choice_count - 1; i++) {
3014    AlternativeGeneration* alt_gen = alt_gens.at(i);
3015    Trace new_trace(*current_trace);
3016    // If there are actions to be flushed we have to limit how many times
3017    // they are flushed.  Take the budget of the parent trace and distribute
3018    // it fairly amongst the children.
3019    if (new_trace.actions() != NULL) {
3020      new_trace.set_flush_budget(new_flush_budget);
3021    }
3022    EmitOutOfLineContinuation(compiler,
3023                              &new_trace,
3024                              alternatives_->at(i),
3025                              alt_gen,
3026                              preload_characters,
3027                              alt_gens.at(i + 1)->expects_preload);
3028  }
3029}
3030
3031
3032void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3033                                           Trace* trace,
3034                                           GuardedAlternative alternative,
3035                                           AlternativeGeneration* alt_gen,
3036                                           int preload_characters,
3037                                           bool next_expects_preload) {
3038  if (!alt_gen->possible_success.is_linked()) return;
3039
3040  RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3041  macro_assembler->Bind(&alt_gen->possible_success);
3042  Trace out_of_line_trace(*trace);
3043  out_of_line_trace.set_characters_preloaded(preload_characters);
3044  out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3045  if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
3046  ZoneList<Guard*>* guards = alternative.guards();
3047  int guard_count = (guards == NULL) ? 0 : guards->length();
3048  if (next_expects_preload) {
3049    Label reload_current_char;
3050    out_of_line_trace.set_backtrack(&reload_current_char);
3051    for (int j = 0; j < guard_count; j++) {
3052      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3053    }
3054    alternative.node()->Emit(compiler, &out_of_line_trace);
3055    macro_assembler->Bind(&reload_current_char);
3056    // Reload the current character, since the next quick check expects that.
3057    // We don't need to check bounds here because we only get into this
3058    // code through a quick check which already did the checked load.
3059    macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
3060                                          NULL,
3061                                          false,
3062                                          preload_characters);
3063    macro_assembler->GoTo(&(alt_gen->after));
3064  } else {
3065    out_of_line_trace.set_backtrack(&(alt_gen->after));
3066    for (int j = 0; j < guard_count; j++) {
3067      GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3068    }
3069    alternative.node()->Emit(compiler, &out_of_line_trace);
3070  }
3071}
3072
3073
3074void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3075  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3076  LimitResult limit_result = LimitVersions(compiler, trace);
3077  if (limit_result == DONE) return;
3078  ASSERT(limit_result == CONTINUE);
3079
3080  RecursionCheck rc(compiler);
3081
3082  switch (type_) {
3083    case STORE_POSITION: {
3084      Trace::DeferredCapture
3085          new_capture(data_.u_position_register.reg,
3086                      data_.u_position_register.is_capture,
3087                      trace);
3088      Trace new_trace = *trace;
3089      new_trace.add_action(&new_capture);
3090      on_success()->Emit(compiler, &new_trace);
3091      break;
3092    }
3093    case INCREMENT_REGISTER: {
3094      Trace::DeferredIncrementRegister
3095          new_increment(data_.u_increment_register.reg);
3096      Trace new_trace = *trace;
3097      new_trace.add_action(&new_increment);
3098      on_success()->Emit(compiler, &new_trace);
3099      break;
3100    }
3101    case SET_REGISTER: {
3102      Trace::DeferredSetRegister
3103          new_set(data_.u_store_register.reg, data_.u_store_register.value);
3104      Trace new_trace = *trace;
3105      new_trace.add_action(&new_set);
3106      on_success()->Emit(compiler, &new_trace);
3107      break;
3108    }
3109    case CLEAR_CAPTURES: {
3110      Trace::DeferredClearCaptures
3111        new_capture(Interval(data_.u_clear_captures.range_from,
3112                             data_.u_clear_captures.range_to));
3113      Trace new_trace = *trace;
3114      new_trace.add_action(&new_capture);
3115      on_success()->Emit(compiler, &new_trace);
3116      break;
3117    }
3118    case BEGIN_SUBMATCH:
3119      if (!trace->is_trivial()) {
3120        trace->Flush(compiler, this);
3121      } else {
3122        assembler->WriteCurrentPositionToRegister(
3123            data_.u_submatch.current_position_register, 0);
3124        assembler->WriteStackPointerToRegister(
3125            data_.u_submatch.stack_pointer_register);
3126        on_success()->Emit(compiler, trace);
3127      }
3128      break;
3129    case EMPTY_MATCH_CHECK: {
3130      int start_pos_reg = data_.u_empty_match_check.start_register;
3131      int stored_pos = 0;
3132      int rep_reg = data_.u_empty_match_check.repetition_register;
3133      bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3134      bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3135      if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3136        // If we know we haven't advanced and there is no minimum we
3137        // can just backtrack immediately.
3138        assembler->GoTo(trace->backtrack());
3139      } else if (know_dist && stored_pos < trace->cp_offset()) {
3140        // If we know we've advanced we can generate the continuation
3141        // immediately.
3142        on_success()->Emit(compiler, trace);
3143      } else if (!trace->is_trivial()) {
3144        trace->Flush(compiler, this);
3145      } else {
3146        Label skip_empty_check;
3147        // If we have a minimum number of repetitions we check the current
3148        // number first and skip the empty check if it's not enough.
3149        if (has_minimum) {
3150          int limit = data_.u_empty_match_check.repetition_limit;
3151          assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3152        }
3153        // If the match is empty we bail out, otherwise we fall through
3154        // to the on-success continuation.
3155        assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3156                                   trace->backtrack());
3157        assembler->Bind(&skip_empty_check);
3158        on_success()->Emit(compiler, trace);
3159      }
3160      break;
3161    }
3162    case POSITIVE_SUBMATCH_SUCCESS: {
3163      if (!trace->is_trivial()) {
3164        trace->Flush(compiler, this);
3165        return;
3166      }
3167      assembler->ReadCurrentPositionFromRegister(
3168          data_.u_submatch.current_position_register);
3169      assembler->ReadStackPointerFromRegister(
3170          data_.u_submatch.stack_pointer_register);
3171      int clear_register_count = data_.u_submatch.clear_register_count;
3172      if (clear_register_count == 0) {
3173        on_success()->Emit(compiler, trace);
3174        return;
3175      }
3176      int clear_registers_from = data_.u_submatch.clear_register_from;
3177      Label clear_registers_backtrack;
3178      Trace new_trace = *trace;
3179      new_trace.set_backtrack(&clear_registers_backtrack);
3180      on_success()->Emit(compiler, &new_trace);
3181
3182      assembler->Bind(&clear_registers_backtrack);
3183      int clear_registers_to = clear_registers_from + clear_register_count - 1;
3184      assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3185
3186      ASSERT(trace->backtrack() == NULL);
3187      assembler->Backtrack();
3188      return;
3189    }
3190    default:
3191      UNREACHABLE();
3192  }
3193}
3194
3195
3196void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3197  RegExpMacroAssembler* assembler = compiler->macro_assembler();
3198  if (!trace->is_trivial()) {
3199    trace->Flush(compiler, this);
3200    return;
3201  }
3202
3203  LimitResult limit_result = LimitVersions(compiler, trace);
3204  if (limit_result == DONE) return;
3205  ASSERT(limit_result == CONTINUE);
3206
3207  RecursionCheck rc(compiler);
3208
3209  ASSERT_EQ(start_reg_ + 1, end_reg_);
3210  if (compiler->ignore_case()) {
3211    assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3212                                               trace->backtrack());
3213  } else {
3214    assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3215  }
3216  on_success()->Emit(compiler, trace);
3217}
3218
3219
3220// -------------------------------------------------------------------
3221// Dot/dotty output
3222
3223
3224#ifdef DEBUG
3225
3226
3227class DotPrinter: public NodeVisitor {
3228 public:
3229  explicit DotPrinter(bool ignore_case)
3230      : ignore_case_(ignore_case),
3231        stream_(&alloc_) { }
3232  void PrintNode(const char* label, RegExpNode* node);
3233  void Visit(RegExpNode* node);
3234  void PrintAttributes(RegExpNode* from);
3235  StringStream* stream() { return &stream_; }
3236  void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3237#define DECLARE_VISIT(Type)                                          \
3238  virtual void Visit##Type(Type##Node* that);
3239FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3240#undef DECLARE_VISIT
3241 private:
3242  bool ignore_case_;
3243  HeapStringAllocator alloc_;
3244  StringStream stream_;
3245};
3246
3247
3248void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3249  stream()->Add("digraph G {\n  graph [label=\"");
3250  for (int i = 0; label[i]; i++) {
3251    switch (label[i]) {
3252      case '\\':
3253        stream()->Add("\\\\");
3254        break;
3255      case '"':
3256        stream()->Add("\"");
3257        break;
3258      default:
3259        stream()->Put(label[i]);
3260        break;
3261    }
3262  }
3263  stream()->Add("\"];\n");
3264  Visit(node);
3265  stream()->Add("}\n");
3266  printf("%s", *(stream()->ToCString()));
3267}
3268
3269
3270void DotPrinter::Visit(RegExpNode* node) {
3271  if (node->info()->visited) return;
3272  node->info()->visited = true;
3273  node->Accept(this);
3274}
3275
3276
3277void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3278  stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
3279  Visit(on_failure);
3280}
3281
3282
3283class TableEntryBodyPrinter {
3284 public:
3285  TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3286      : stream_(stream), choice_(choice) { }
3287  void Call(uc16 from, DispatchTable::Entry entry) {
3288    OutSet* out_set = entry.out_set();
3289    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3290      if (out_set->Get(i)) {
3291        stream()->Add("    n%p:s%io%i -> n%p;\n",
3292                      choice(),
3293                      from,
3294                      i,
3295                      choice()->alternatives()->at(i).node());
3296      }
3297    }
3298  }
3299 private:
3300  StringStream* stream() { return stream_; }
3301  ChoiceNode* choice() { return choice_; }
3302  StringStream* stream_;
3303  ChoiceNode* choice_;
3304};
3305
3306
3307class TableEntryHeaderPrinter {
3308 public:
3309  explicit TableEntryHeaderPrinter(StringStream* stream)
3310      : first_(true), stream_(stream) { }
3311  void Call(uc16 from, DispatchTable::Entry entry) {
3312    if (first_) {
3313      first_ = false;
3314    } else {
3315      stream()->Add("|");
3316    }
3317    stream()->Add("{\\%k-\\%k|{", from, entry.to());
3318    OutSet* out_set = entry.out_set();
3319    int priority = 0;
3320    for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3321      if (out_set->Get(i)) {
3322        if (priority > 0) stream()->Add("|");
3323        stream()->Add("<s%io%i> %i", from, i, priority);
3324        priority++;
3325      }
3326    }
3327    stream()->Add("}}");
3328  }
3329 private:
3330  bool first_;
3331  StringStream* stream() { return stream_; }
3332  StringStream* stream_;
3333};
3334
3335
3336class AttributePrinter {
3337 public:
3338  explicit AttributePrinter(DotPrinter* out)
3339      : out_(out), first_(true) { }
3340  void PrintSeparator() {
3341    if (first_) {
3342      first_ = false;
3343    } else {
3344      out_->stream()->Add("|");
3345    }
3346  }
3347  void PrintBit(const char* name, bool value) {
3348    if (!value) return;
3349    PrintSeparator();
3350    out_->stream()->Add("{%s}", name);
3351  }
3352  void PrintPositive(const char* name, int value) {
3353    if (value < 0) return;
3354    PrintSeparator();
3355    out_->stream()->Add("{%s|%x}", name, value);
3356  }
3357 private:
3358  DotPrinter* out_;
3359  bool first_;
3360};
3361
3362
3363void DotPrinter::PrintAttributes(RegExpNode* that) {
3364  stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3365                "margin=0.1, fontsize=10, label=\"{",
3366                that);
3367  AttributePrinter printer(this);
3368  NodeInfo* info = that->info();
3369  printer.PrintBit("NI", info->follows_newline_interest);
3370  printer.PrintBit("WI", info->follows_word_interest);
3371  printer.PrintBit("SI", info->follows_start_interest);
3372  Label* label = that->label();
3373  if (label->is_bound())
3374    printer.PrintPositive("@", label->pos());
3375  stream()->Add("}\"];\n");
3376  stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
3377                "arrowhead=none];\n", that, that);
3378}
3379
3380
3381static const bool kPrintDispatchTable = false;
3382void DotPrinter::VisitChoice(ChoiceNode* that) {
3383  if (kPrintDispatchTable) {
3384    stream()->Add("  n%p [shape=Mrecord, label=\"", that);
3385    TableEntryHeaderPrinter header_printer(stream());
3386    that->GetTable(ignore_case_)->ForEach(&header_printer);
3387    stream()->Add("\"]\n", that);
3388    PrintAttributes(that);
3389    TableEntryBodyPrinter body_printer(stream(), that);
3390    that->GetTable(ignore_case_)->ForEach(&body_printer);
3391  } else {
3392    stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
3393    for (int i = 0; i < that->alternatives()->length(); i++) {
3394      GuardedAlternative alt = that->alternatives()->at(i);
3395      stream()->Add("  n%p -> n%p;\n", that, alt.node());
3396    }
3397  }
3398  for (int i = 0; i < that->alternatives()->length(); i++) {
3399    GuardedAlternative alt = that->alternatives()->at(i);
3400    alt.node()->Accept(this);
3401  }
3402}
3403
3404
3405void DotPrinter::VisitText(TextNode* that) {
3406  stream()->Add("  n%p [label=\"", that);
3407  for (int i = 0; i < that->elements()->length(); i++) {
3408    if (i > 0) stream()->Add(" ");
3409    TextElement elm = that->elements()->at(i);
3410    switch (elm.type) {
3411      case TextElement::ATOM: {
3412        stream()->Add("'%w'", elm.data.u_atom->data());
3413        break;
3414      }
3415      case TextElement::CHAR_CLASS: {
3416        RegExpCharacterClass* node = elm.data.u_char_class;
3417        stream()->Add("[");
3418        if (node->is_negated())
3419          stream()->Add("^");
3420        for (int j = 0; j < node->ranges()->length(); j++) {
3421          CharacterRange range = node->ranges()->at(j);
3422          stream()->Add("%k-%k", range.from(), range.to());
3423        }
3424        stream()->Add("]");
3425        break;
3426      }
3427      default:
3428        UNREACHABLE();
3429    }
3430  }
3431  stream()->Add("\", shape=box, peripheries=2];\n");
3432  PrintAttributes(that);
3433  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3434  Visit(that->on_success());
3435}
3436
3437
3438void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3439  stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3440                that,
3441                that->start_register(),
3442                that->end_register());
3443  PrintAttributes(that);
3444  stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3445  Visit(that->on_success());
3446}
3447
3448
3449void DotPrinter::VisitEnd(EndNode* that) {
3450  stream()->Add("  n%p [style=bold, shape=point];\n", that);
3451  PrintAttributes(that);
3452}
3453
3454
3455void DotPrinter::VisitAssertion(AssertionNode* that) {
3456  stream()->Add("  n%p [", that);
3457  switch (that->type()) {
3458    case AssertionNode::AT_END:
3459      stream()->Add("label=\"$\", shape=septagon");
3460      break;
3461    case AssertionNode::AT_START:
3462      stream()->Add("label=\"^\", shape=septagon");
3463      break;
3464    case AssertionNode::AT_BOUNDARY:
3465      stream()->Add("label=\"\\b\", shape=septagon");
3466      break;
3467    case AssertionNode::AT_NON_BOUNDARY:
3468      stream()->Add("label=\"\\B\", shape=septagon");
3469      break;
3470    case AssertionNode::AFTER_NEWLINE:
3471      stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3472      break;
3473    case AssertionNode::AFTER_WORD_CHARACTER:
3474      stream()->Add("label=\"(?<=\\w)\", shape=septagon");
3475      break;
3476    case AssertionNode::AFTER_NONWORD_CHARACTER:
3477      stream()->Add("label=\"(?<=\\W)\", shape=septagon");
3478      break;
3479  }
3480  stream()->Add("];\n");
3481  PrintAttributes(that);
3482  RegExpNode* successor = that->on_success();
3483  stream()->Add("  n%p -> n%p;\n", that, successor);
3484  Visit(successor);
3485}
3486
3487
3488void DotPrinter::VisitAction(ActionNode* that) {
3489  stream()->Add("  n%p [", that);
3490  switch (that->type_) {
3491    case ActionNode::SET_REGISTER:
3492      stream()->Add("label=\"$%i:=%i\", shape=octagon",
3493                    that->data_.u_store_register.reg,
3494                    that->data_.u_store_register.value);
3495      break;
3496    case ActionNode::INCREMENT_REGISTER:
3497      stream()->Add("label=\"$%i++\", shape=octagon",
3498                    that->data_.u_increment_register.reg);
3499      break;
3500    case ActionNode::STORE_POSITION:
3501      stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3502                    that->data_.u_position_register.reg);
3503      break;
3504    case ActionNode::BEGIN_SUBMATCH:
3505      stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3506                    that->data_.u_submatch.current_position_register);
3507      break;
3508    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3509      stream()->Add("label=\"escape\", shape=septagon");
3510      break;
3511    case ActionNode::EMPTY_MATCH_CHECK:
3512      stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3513                    that->data_.u_empty_match_check.start_register,
3514                    that->data_.u_empty_match_check.repetition_register,
3515                    that->data_.u_empty_match_check.repetition_limit);
3516      break;
3517    case ActionNode::CLEAR_CAPTURES: {
3518      stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3519                    that->data_.u_clear_captures.range_from,
3520                    that->data_.u_clear_captures.range_to);
3521      break;
3522    }
3523  }
3524  stream()->Add("];\n");
3525  PrintAttributes(that);
3526  RegExpNode* successor = that->on_success();
3527  stream()->Add("  n%p -> n%p;\n", that, successor);
3528  Visit(successor);
3529}
3530
3531
3532class DispatchTableDumper {
3533 public:
3534  explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3535  void Call(uc16 key, DispatchTable::Entry entry);
3536  StringStream* stream() { return stream_; }
3537 private:
3538  StringStream* stream_;
3539};
3540
3541
3542void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3543  stream()->Add("[%k-%k]: {", key, entry.to());
3544  OutSet* set = entry.out_set();
3545  bool first = true;
3546  for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3547    if (set->Get(i)) {
3548      if (first) {
3549        first = false;
3550      } else {
3551        stream()->Add(", ");
3552      }
3553      stream()->Add("%i", i);
3554    }
3555  }
3556  stream()->Add("}\n");
3557}
3558
3559
3560void DispatchTable::Dump() {
3561  HeapStringAllocator alloc;
3562  StringStream stream(&alloc);
3563  DispatchTableDumper dumper(&stream);
3564  tree()->ForEach(&dumper);
3565  OS::PrintError("%s", *stream.ToCString());
3566}
3567
3568
3569void RegExpEngine::DotPrint(const char* label,
3570                            RegExpNode* node,
3571                            bool ignore_case) {
3572  DotPrinter printer(ignore_case);
3573  printer.PrintNode(label, node);
3574}
3575
3576
3577#endif  // DEBUG
3578
3579
3580// -------------------------------------------------------------------
3581// Tree to graph conversion
3582
3583static const int kSpaceRangeCount = 20;
3584static const int kSpaceRangeAsciiCount = 4;
3585static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3586    0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3587    0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3588
3589static const int kWordRangeCount = 8;
3590static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3591    '_', 'a', 'z' };
3592
3593static const int kDigitRangeCount = 2;
3594static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3595
3596static const int kLineTerminatorRangeCount = 6;
3597static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3598    0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
3599
3600RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3601                               RegExpNode* on_success) {
3602  ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3603  elms->Add(TextElement::Atom(this));
3604  return new TextNode(elms, on_success);
3605}
3606
3607
3608RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3609                               RegExpNode* on_success) {
3610  return new TextNode(elements(), on_success);
3611}
3612
3613static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3614                                 const uc16* special_class,
3615                                 int length) {
3616  ASSERT(ranges->length() != 0);
3617  ASSERT(length != 0);
3618  ASSERT(special_class[0] != 0);
3619  if (ranges->length() != (length >> 1) + 1) {
3620    return false;
3621  }
3622  CharacterRange range = ranges->at(0);
3623  if (range.from() != 0) {
3624    return false;
3625  }
3626  for (int i = 0; i < length; i += 2) {
3627    if (special_class[i] != (range.to() + 1)) {
3628      return false;
3629    }
3630    range = ranges->at((i >> 1) + 1);
3631    if (special_class[i+1] != range.from() - 1) {
3632      return false;
3633    }
3634  }
3635  if (range.to() != 0xffff) {
3636    return false;
3637  }
3638  return true;
3639}
3640
3641
3642static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3643                          const uc16* special_class,
3644                          int length) {
3645  if (ranges->length() * 2 != length) {
3646    return false;
3647  }
3648  for (int i = 0; i < length; i += 2) {
3649    CharacterRange range = ranges->at(i >> 1);
3650    if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3651      return false;
3652    }
3653  }
3654  return true;
3655}
3656
3657
3658bool RegExpCharacterClass::is_standard() {
3659  // TODO(lrn): Remove need for this function, by not throwing away information
3660  // along the way.
3661  if (is_negated_) {
3662    return false;
3663  }
3664  if (set_.is_standard()) {
3665    return true;
3666  }
3667  if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3668    set_.set_standard_set_type('s');
3669    return true;
3670  }
3671  if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3672    set_.set_standard_set_type('S');
3673    return true;
3674  }
3675  if (CompareInverseRanges(set_.ranges(),
3676                           kLineTerminatorRanges,
3677                           kLineTerminatorRangeCount)) {
3678    set_.set_standard_set_type('.');
3679    return true;
3680  }
3681  if (CompareRanges(set_.ranges(),
3682                    kLineTerminatorRanges,
3683                    kLineTerminatorRangeCount)) {
3684    set_.set_standard_set_type('n');
3685    return true;
3686  }
3687  if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3688    set_.set_standard_set_type('w');
3689    return true;
3690  }
3691  if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3692    set_.set_standard_set_type('W');
3693    return true;
3694  }
3695  return false;
3696}
3697
3698
3699RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3700                                         RegExpNode* on_success) {
3701  return new TextNode(this, on_success);
3702}
3703
3704
3705RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3706                                      RegExpNode* on_success) {
3707  ZoneList<RegExpTree*>* alternatives = this->alternatives();
3708  int length = alternatives->length();
3709  ChoiceNode* result = new ChoiceNode(length);
3710  for (int i = 0; i < length; i++) {
3711    GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
3712                                                               on_success));
3713    result->AddAlternative(alternative);
3714  }
3715  return result;
3716}
3717
3718
3719RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
3720                                     RegExpNode* on_success) {
3721  return ToNode(min(),
3722                max(),
3723                is_greedy(),
3724                body(),
3725                compiler,
3726                on_success);
3727}
3728
3729
3730RegExpNode* RegExpQuantifier::ToNode(int min,
3731                                     int max,
3732                                     bool is_greedy,
3733                                     RegExpTree* body,
3734                                     RegExpCompiler* compiler,
3735                                     RegExpNode* on_success,
3736                                     bool not_at_start) {
3737  // x{f, t} becomes this:
3738  //
3739  //             (r++)<-.
3740  //               |     `
3741  //               |     (x)
3742  //               v     ^
3743  //      (r=0)-->(?)---/ [if r < t]
3744  //               |
3745  //   [if r >= f] \----> ...
3746  //
3747
3748  // 15.10.2.5 RepeatMatcher algorithm.
3749  // The parser has already eliminated the case where max is 0.  In the case
3750  // where max_match is zero the parser has removed the quantifier if min was
3751  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
3752
3753  // If we know that we cannot match zero length then things are a little
3754  // simpler since we don't need to make the special zero length match check
3755  // from step 2.1.  If the min and max are small we can unroll a little in
3756  // this case.
3757  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
3758  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
3759  if (max == 0) return on_success;  // This can happen due to recursion.
3760  bool body_can_be_empty = (body->min_match() == 0);
3761  int body_start_reg = RegExpCompiler::kNoRegister;
3762  Interval capture_registers = body->CaptureRegisters();
3763  bool needs_capture_clearing = !capture_registers.is_empty();
3764  if (body_can_be_empty) {
3765    body_start_reg = compiler->AllocateRegister();
3766  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
3767    // Only unroll if there are no captures and the body can't be
3768    // empty.
3769    if (min > 0 && min <= kMaxUnrolledMinMatches) {
3770      int new_max = (max == kInfinity) ? max : max - min;
3771      // Recurse once to get the loop or optional matches after the fixed ones.
3772      RegExpNode* answer = ToNode(
3773          0, new_max, is_greedy, body, compiler, on_success, true);
3774      // Unroll the forced matches from 0 to min.  This can cause chains of
3775      // TextNodes (which the parser does not generate).  These should be
3776      // combined if it turns out they hinder good code generation.
3777      for (int i = 0; i < min; i++) {
3778        answer = body->ToNode(compiler, answer);
3779      }
3780      return answer;
3781    }
3782    if (max <= kMaxUnrolledMaxMatches) {
3783      ASSERT(min == 0);
3784      // Unroll the optional matches up to max.
3785      RegExpNode* answer = on_success;
3786      for (int i = 0; i < max; i++) {
3787        ChoiceNode* alternation = new ChoiceNode(2);
3788        if (is_greedy) {
3789          alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3790                                                                      answer)));
3791          alternation->AddAlternative(GuardedAlternative(on_success));
3792        } else {
3793          alternation->AddAlternative(GuardedAlternative(on_success));
3794          alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3795                                                                      answer)));
3796        }
3797        answer = alternation;
3798        if (not_at_start) alternation->set_not_at_start();
3799      }
3800      return answer;
3801    }
3802  }
3803  bool has_min = min > 0;
3804  bool has_max = max < RegExpTree::kInfinity;
3805  bool needs_counter = has_min || has_max;
3806  int reg_ctr = needs_counter
3807      ? compiler->AllocateRegister()
3808      : RegExpCompiler::kNoRegister;
3809  LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
3810  if (not_at_start) center->set_not_at_start();
3811  RegExpNode* loop_return = needs_counter
3812      ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3813      : static_cast<RegExpNode*>(center);
3814  if (body_can_be_empty) {
3815    // If the body can be empty we need to check if it was and then
3816    // backtrack.
3817    loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3818                                              reg_ctr,
3819                                              min,
3820                                              loop_return);
3821  }
3822  RegExpNode* body_node = body->ToNode(compiler, loop_return);
3823  if (body_can_be_empty) {
3824    // If the body can be empty we need to store the start position
3825    // so we can bail out if it was empty.
3826    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
3827  }
3828  if (needs_capture_clearing) {
3829    // Before entering the body of this loop we need to clear captures.
3830    body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3831  }
3832  GuardedAlternative body_alt(body_node);
3833  if (has_max) {
3834    Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3835    body_alt.AddGuard(body_guard);
3836  }
3837  GuardedAlternative rest_alt(on_success);
3838  if (has_min) {
3839    Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3840    rest_alt.AddGuard(rest_guard);
3841  }
3842  if (is_greedy) {
3843    center->AddLoopAlternative(body_alt);
3844    center->AddContinueAlternative(rest_alt);
3845  } else {
3846    center->AddContinueAlternative(rest_alt);
3847    center->AddLoopAlternative(body_alt);
3848  }
3849  if (needs_counter) {
3850    return ActionNode::SetRegister(reg_ctr, 0, center);
3851  } else {
3852    return center;
3853  }
3854}
3855
3856
3857RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
3858                                    RegExpNode* on_success) {
3859  NodeInfo info;
3860  switch (type()) {
3861    case START_OF_LINE:
3862      return AssertionNode::AfterNewline(on_success);
3863    case START_OF_INPUT:
3864      return AssertionNode::AtStart(on_success);
3865    case BOUNDARY:
3866      return AssertionNode::AtBoundary(on_success);
3867    case NON_BOUNDARY:
3868      return AssertionNode::AtNonBoundary(on_success);
3869    case END_OF_INPUT:
3870      return AssertionNode::AtEnd(on_success);
3871    case END_OF_LINE: {
3872      // Compile $ in multiline regexps as an alternation with a positive
3873      // lookahead in one side and an end-of-input on the other side.
3874      // We need two registers for the lookahead.
3875      int stack_pointer_register = compiler->AllocateRegister();
3876      int position_register = compiler->AllocateRegister();
3877      // The ChoiceNode to distinguish between a newline and end-of-input.
3878      ChoiceNode* result = new ChoiceNode(2);
3879      // Create a newline atom.
3880      ZoneList<CharacterRange>* newline_ranges =
3881          new ZoneList<CharacterRange>(3);
3882      CharacterRange::AddClassEscape('n', newline_ranges);
3883      RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3884      TextNode* newline_matcher = new TextNode(
3885         newline_atom,
3886         ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3887                                             position_register,
3888                                             0,  // No captures inside.
3889                                             -1,  // Ignored if no captures.
3890                                             on_success));
3891      // Create an end-of-input matcher.
3892      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3893          stack_pointer_register,
3894          position_register,
3895          newline_matcher);
3896      // Add the two alternatives to the ChoiceNode.
3897      GuardedAlternative eol_alternative(end_of_line);
3898      result->AddAlternative(eol_alternative);
3899      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3900      result->AddAlternative(end_alternative);
3901      return result;
3902    }
3903    default:
3904      UNREACHABLE();
3905  }
3906  return on_success;
3907}
3908
3909
3910RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
3911                                        RegExpNode* on_success) {
3912  return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3913                               RegExpCapture::EndRegister(index()),
3914                               on_success);
3915}
3916
3917
3918RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
3919                                RegExpNode* on_success) {
3920  return on_success;
3921}
3922
3923
3924RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
3925                                    RegExpNode* on_success) {
3926  int stack_pointer_register = compiler->AllocateRegister();
3927  int position_register = compiler->AllocateRegister();
3928
3929  const int registers_per_capture = 2;
3930  const int register_of_first_capture = 2;
3931  int register_count = capture_count_ * registers_per_capture;
3932  int register_start =
3933    register_of_first_capture + capture_from_ * registers_per_capture;
3934
3935  RegExpNode* success;
3936  if (is_positive()) {
3937    RegExpNode* node = ActionNode::BeginSubmatch(
3938        stack_pointer_register,
3939        position_register,
3940        body()->ToNode(
3941            compiler,
3942            ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3943                                                position_register,
3944                                                register_count,
3945                                                register_start,
3946                                                on_success)));
3947    return node;
3948  } else {
3949    // We use a ChoiceNode for a negative lookahead because it has most of
3950    // the characteristics we need.  It has the body of the lookahead as its
3951    // first alternative and the expression after the lookahead of the second
3952    // alternative.  If the first alternative succeeds then the
3953    // NegativeSubmatchSuccess will unwind the stack including everything the
3954    // choice node set up and backtrack.  If the first alternative fails then
3955    // the second alternative is tried, which is exactly the desired result
3956    // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
3957    // ChoiceNode that knows to ignore the first exit when calculating quick
3958    // checks.
3959    GuardedAlternative body_alt(
3960        body()->ToNode(
3961            compiler,
3962            success = new NegativeSubmatchSuccess(stack_pointer_register,
3963                                                  position_register,
3964                                                  register_count,
3965                                                  register_start)));
3966    ChoiceNode* choice_node =
3967        new NegativeLookaheadChoiceNode(body_alt,
3968                                        GuardedAlternative(on_success));
3969    return ActionNode::BeginSubmatch(stack_pointer_register,
3970                                     position_register,
3971                                     choice_node);
3972  }
3973}
3974
3975
3976RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
3977                                  RegExpNode* on_success) {
3978  return ToNode(body(), index(), compiler, on_success);
3979}
3980
3981
3982RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3983                                  int index,
3984                                  RegExpCompiler* compiler,
3985                                  RegExpNode* on_success) {
3986  int start_reg = RegExpCapture::StartRegister(index);
3987  int end_reg = RegExpCapture::EndRegister(index);
3988  RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
3989  RegExpNode* body_node = body->ToNode(compiler, store_end);
3990  return ActionNode::StorePosition(start_reg, true, body_node);
3991}
3992
3993
3994RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
3995                                      RegExpNode* on_success) {
3996  ZoneList<RegExpTree*>* children = nodes();
3997  RegExpNode* current = on_success;
3998  for (int i = children->length() - 1; i >= 0; i--) {
3999    current = children->at(i)->ToNode(compiler, current);
4000  }
4001  return current;
4002}
4003
4004
4005static void AddClass(const uc16* elmv,
4006                     int elmc,
4007                     ZoneList<CharacterRange>* ranges) {
4008  for (int i = 0; i < elmc; i += 2) {
4009    ASSERT(elmv[i] <= elmv[i + 1]);
4010    ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
4011  }
4012}
4013
4014
4015static void AddClassNegated(const uc16 *elmv,
4016                            int elmc,
4017                            ZoneList<CharacterRange>* ranges) {
4018  ASSERT(elmv[0] != 0x0000);
4019  ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
4020  uc16 last = 0x0000;
4021  for (int i = 0; i < elmc; i += 2) {
4022    ASSERT(last <= elmv[i] - 1);
4023    ASSERT(elmv[i] <= elmv[i + 1]);
4024    ranges->Add(CharacterRange(last, elmv[i] - 1));
4025    last = elmv[i + 1] + 1;
4026  }
4027  ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
4028}
4029
4030
4031void CharacterRange::AddClassEscape(uc16 type,
4032                                    ZoneList<CharacterRange>* ranges) {
4033  switch (type) {
4034    case 's':
4035      AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
4036      break;
4037    case 'S':
4038      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
4039      break;
4040    case 'w':
4041      AddClass(kWordRanges, kWordRangeCount, ranges);
4042      break;
4043    case 'W':
4044      AddClassNegated(kWordRanges, kWordRangeCount, ranges);
4045      break;
4046    case 'd':
4047      AddClass(kDigitRanges, kDigitRangeCount, ranges);
4048      break;
4049    case 'D':
4050      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
4051      break;
4052    case '.':
4053      AddClassNegated(kLineTerminatorRanges,
4054                      kLineTerminatorRangeCount,
4055                      ranges);
4056      break;
4057    // This is not a character range as defined by the spec but a
4058    // convenient shorthand for a character class that matches any
4059    // character.
4060    case '*':
4061      ranges->Add(CharacterRange::Everything());
4062      break;
4063    // This is the set of characters matched by the $ and ^ symbols
4064    // in multiline mode.
4065    case 'n':
4066      AddClass(kLineTerminatorRanges,
4067               kLineTerminatorRangeCount,
4068               ranges);
4069      break;
4070    default:
4071      UNREACHABLE();
4072  }
4073}
4074
4075
4076Vector<const uc16> CharacterRange::GetWordBounds() {
4077  return Vector<const uc16>(kWordRanges, kWordRangeCount);
4078}
4079
4080
4081class CharacterRangeSplitter {
4082 public:
4083  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
4084                          ZoneList<CharacterRange>** excluded)
4085      : included_(included),
4086        excluded_(excluded) { }
4087  void Call(uc16 from, DispatchTable::Entry entry);
4088
4089  static const int kInBase = 0;
4090  static const int kInOverlay = 1;
4091
4092 private:
4093  ZoneList<CharacterRange>** included_;
4094  ZoneList<CharacterRange>** excluded_;
4095};
4096
4097
4098void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
4099  if (!entry.out_set()->Get(kInBase)) return;
4100  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
4101    ? included_
4102    : excluded_;
4103  if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
4104  (*target)->Add(CharacterRange(entry.from(), entry.to()));
4105}
4106
4107
4108void CharacterRange::Split(ZoneList<CharacterRange>* base,
4109                           Vector<const uc16> overlay,
4110                           ZoneList<CharacterRange>** included,
4111                           ZoneList<CharacterRange>** excluded) {
4112  ASSERT_EQ(NULL, *included);
4113  ASSERT_EQ(NULL, *excluded);
4114  DispatchTable table;
4115  for (int i = 0; i < base->length(); i++)
4116    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
4117  for (int i = 0; i < overlay.length(); i += 2) {
4118    table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
4119                   CharacterRangeSplitter::kInOverlay);
4120  }
4121  CharacterRangeSplitter callback(included, excluded);
4122  table.ForEach(&callback);
4123}
4124
4125
4126static void AddUncanonicals(Isolate* isolate,
4127                            ZoneList<CharacterRange>* ranges,
4128                            int bottom,
4129                            int top);
4130
4131
4132void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
4133                                        bool is_ascii) {
4134  Isolate* isolate = Isolate::Current();
4135  uc16 bottom = from();
4136  uc16 top = to();
4137  if (is_ascii) {
4138    if (bottom > String::kMaxAsciiCharCode) return;
4139    if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
4140  }
4141  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4142  if (top == bottom) {
4143    // If this is a singleton we just expand the one character.
4144    int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
4145    for (int i = 0; i < length; i++) {
4146      uc32 chr = chars[i];
4147      if (chr != bottom) {
4148        ranges->Add(CharacterRange::Singleton(chars[i]));
4149      }
4150    }
4151  } else {
4152    // If this is a range we expand the characters block by block,
4153    // expanding contiguous subranges (blocks) one at a time.
4154    // The approach is as follows.  For a given start character we
4155    // look up the remainder of the block that contains it (represented
4156    // by the end point), for instance we find 'z' if the character
4157    // is 'c'.  A block is characterized by the property
4158    // that all characters uncanonicalize in the same way, except that
4159    // each entry in the result is incremented by the distance from the first
4160    // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
4161    // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4162    // Once we've found the end point we look up its uncanonicalization
4163    // and produce a range for each element.  For instance for [c-f]
4164    // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
4165    // add a range if it is not already contained in the input, so [c-f]
4166    // will be skipped but [C-F] will be added.  If this range is not
4167    // completely contained in a block we do this for all the blocks
4168    // covered by the range (handling characters that is not in a block
4169    // as a "singleton block").
4170    unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4171    int pos = bottom;
4172    while (pos < top) {
4173      int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
4174      uc16 block_end;
4175      if (length == 0) {
4176        block_end = pos;
4177      } else {
4178        ASSERT_EQ(1, length);
4179        block_end = range[0];
4180      }
4181      int end = (block_end > top) ? top : block_end;
4182      length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
4183      for (int i = 0; i < length; i++) {
4184        uc32 c = range[i];
4185        uc16 range_from = c - (block_end - pos);
4186        uc16 range_to = c - (block_end - end);
4187        if (!(bottom <= range_from && range_to <= top)) {
4188          ranges->Add(CharacterRange(range_from, range_to));
4189        }
4190      }
4191      pos = end + 1;
4192    }
4193  }
4194}
4195
4196
4197bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
4198  ASSERT_NOT_NULL(ranges);
4199  int n = ranges->length();
4200  if (n <= 1) return true;
4201  int max = ranges->at(0).to();
4202  for (int i = 1; i < n; i++) {
4203    CharacterRange next_range = ranges->at(i);
4204    if (next_range.from() <= max + 1) return false;
4205    max = next_range.to();
4206  }
4207  return true;
4208}
4209
4210SetRelation CharacterRange::WordCharacterRelation(
4211    ZoneList<CharacterRange>* range) {
4212  ASSERT(IsCanonical(range));
4213  int i = 0;  // Word character range index.
4214  int j = 0;  // Argument range index.
4215  ASSERT_NE(0, kWordRangeCount);
4216  SetRelation result;
4217  if (range->length() == 0) {
4218    result.SetElementsInSecondSet();
4219    return result;
4220  }
4221  CharacterRange argument_range = range->at(0);
4222  CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4223  while (i < kWordRangeCount && j < range->length()) {
4224    // Check the two ranges for the five cases:
4225    // - no overlap.
4226    // - partial overlap (there are elements in both ranges that isn't
4227    //   in the other, and there are also elements that are in both).
4228    // - argument range entirely inside word range.
4229    // - word range entirely inside argument range.
4230    // - ranges are completely equal.
4231
4232    // First check for no overlap. The earlier range is not in the other set.
4233    if (argument_range.from() > word_range.to()) {
4234      // Ranges are disjoint. The earlier word range contains elements that
4235      // cannot be in the argument set.
4236      result.SetElementsInSecondSet();
4237    } else if (word_range.from() > argument_range.to()) {
4238      // Ranges are disjoint. The earlier argument range contains elements that
4239      // cannot be in the word set.
4240      result.SetElementsInFirstSet();
4241    } else if (word_range.from() <= argument_range.from() &&
4242               word_range.to() >= argument_range.from()) {
4243      result.SetElementsInBothSets();
4244      // argument range completely inside word range.
4245      if (word_range.from() < argument_range.from() ||
4246          word_range.to() > argument_range.from()) {
4247        result.SetElementsInSecondSet();
4248      }
4249    } else if (word_range.from() >= argument_range.from() &&
4250               word_range.to() <= argument_range.from()) {
4251      result.SetElementsInBothSets();
4252      result.SetElementsInFirstSet();
4253    } else {
4254      // There is overlap, and neither is a subrange of the other
4255      result.SetElementsInFirstSet();
4256      result.SetElementsInSecondSet();
4257      result.SetElementsInBothSets();
4258    }
4259    if (result.NonTrivialIntersection()) {
4260      // The result is as (im)precise as we can possibly make it.
4261      return result;
4262    }
4263    // Progress the range(s) with minimal to-character.
4264    uc16 word_to = word_range.to();
4265    uc16 argument_to = argument_range.to();
4266    if (argument_to <= word_to) {
4267      j++;
4268      if (j < range->length()) {
4269        argument_range = range->at(j);
4270      }
4271    }
4272    if (word_to <= argument_to) {
4273      i += 2;
4274      if (i < kWordRangeCount) {
4275        word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
4276      }
4277    }
4278  }
4279  // Check if anything wasn't compared in the loop.
4280  if (i < kWordRangeCount) {
4281    // word range contains something not in argument range.
4282    result.SetElementsInSecondSet();
4283  } else if (j < range->length()) {
4284    // Argument range contains something not in word range.
4285    result.SetElementsInFirstSet();
4286  }
4287
4288  return result;
4289}
4290
4291
4292static void AddUncanonicals(Isolate* isolate,
4293                            ZoneList<CharacterRange>* ranges,
4294                            int bottom,
4295                            int top) {
4296  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4297  // Zones with no case mappings.  There is a DEBUG-mode loop to assert that
4298  // this table is correct.
4299  // 0x0600 - 0x0fff
4300  // 0x1100 - 0x1cff
4301  // 0x2000 - 0x20ff
4302  // 0x2200 - 0x23ff
4303  // 0x2500 - 0x2bff
4304  // 0x2e00 - 0xa5ff
4305  // 0xa800 - 0xfaff
4306  // 0xfc00 - 0xfeff
4307  const int boundary_count = 18;
4308  int boundaries[] = {
4309      0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
4310      0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
4311
4312  // Special ASCII rule from spec can save us some work here.
4313  if (bottom == 0x80 && top == 0xffff) return;
4314
4315  if (top <= boundaries[0]) {
4316    CharacterRange range(bottom, top);
4317    range.AddCaseEquivalents(ranges, false);
4318    return;
4319  }
4320
4321  // Split up very large ranges.  This helps remove ranges where there are no
4322  // case mappings.
4323  for (int i = 0; i < boundary_count; i++) {
4324    if (bottom < boundaries[i] && top >= boundaries[i]) {
4325      AddUncanonicals(isolate, ranges, bottom, boundaries[i] - 1);
4326      AddUncanonicals(isolate, ranges, boundaries[i], top);
4327      return;
4328    }
4329  }
4330
4331  // If we are completely in a zone with no case mappings then we are done.
4332  for (int i = 0; i < boundary_count; i += 2) {
4333    if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
4334#ifdef DEBUG
4335      for (int j = bottom; j <= top; j++) {
4336        unsigned current_char = j;
4337        int length = isolate->jsregexp_uncanonicalize()->get(current_char,
4338                                                             '\0', chars);
4339        for (int k = 0; k < length; k++) {
4340          ASSERT(chars[k] == current_char);
4341        }
4342      }
4343#endif
4344      return;
4345    }
4346  }
4347
4348  // Step through the range finding equivalent characters.
4349  ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
4350  for (int i = bottom; i <= top; i++) {
4351    int length = isolate->jsregexp_uncanonicalize()->get(i, '\0', chars);
4352    for (int j = 0; j < length; j++) {
4353      uc32 chr = chars[j];
4354      if (chr != i && (chr < bottom || chr > top)) {
4355        characters->Add(chr);
4356      }
4357    }
4358  }
4359
4360  // Step through the equivalent characters finding simple ranges and
4361  // adding ranges to the character class.
4362  if (characters->length() > 0) {
4363    int new_from = characters->at(0);
4364    int new_to = new_from;
4365    for (int i = 1; i < characters->length(); i++) {
4366      int chr = characters->at(i);
4367      if (chr == new_to + 1) {
4368        new_to++;
4369      } else {
4370        if (new_to == new_from) {
4371          ranges->Add(CharacterRange::Singleton(new_from));
4372        } else {
4373          ranges->Add(CharacterRange(new_from, new_to));
4374        }
4375        new_from = new_to = chr;
4376      }
4377    }
4378    if (new_to == new_from) {
4379      ranges->Add(CharacterRange::Singleton(new_from));
4380    } else {
4381      ranges->Add(CharacterRange(new_from, new_to));
4382    }
4383  }
4384}
4385
4386
4387ZoneList<CharacterRange>* CharacterSet::ranges() {
4388  if (ranges_ == NULL) {
4389    ranges_ = new ZoneList<CharacterRange>(2);
4390    CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4391  }
4392  return ranges_;
4393}
4394
4395
4396// Move a number of elements in a zonelist to another position
4397// in the same list. Handles overlapping source and target areas.
4398static void MoveRanges(ZoneList<CharacterRange>* list,
4399                       int from,
4400                       int to,
4401                       int count) {
4402  // Ranges are potentially overlapping.
4403  if (from < to) {
4404    for (int i = count - 1; i >= 0; i--) {
4405      list->at(to + i) = list->at(from + i);
4406    }
4407  } else {
4408    for (int i = 0; i < count; i++) {
4409      list->at(to + i) = list->at(from + i);
4410    }
4411  }
4412}
4413
4414
4415static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
4416                                      int count,
4417                                      CharacterRange insert) {
4418  // Inserts a range into list[0..count[, which must be sorted
4419  // by from value and non-overlapping and non-adjacent, using at most
4420  // list[0..count] for the result. Returns the number of resulting
4421  // canonicalized ranges. Inserting a range may collapse existing ranges into
4422  // fewer ranges, so the return value can be anything in the range 1..count+1.
4423  uc16 from = insert.from();
4424  uc16 to = insert.to();
4425  int start_pos = 0;
4426  int end_pos = count;
4427  for (int i = count - 1; i >= 0; i--) {
4428    CharacterRange current = list->at(i);
4429    if (current.from() > to + 1) {
4430      end_pos = i;
4431    } else if (current.to() + 1 < from) {
4432      start_pos = i + 1;
4433      break;
4434    }
4435  }
4436
4437  // Inserted range overlaps, or is adjacent to, ranges at positions
4438  // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4439  // not affected by the insertion.
4440  // If start_pos == end_pos, the range must be inserted before start_pos.
4441  // if start_pos < end_pos, the entire range from start_pos to end_pos
4442  // must be merged with the insert range.
4443
4444  if (start_pos == end_pos) {
4445    // Insert between existing ranges at position start_pos.
4446    if (start_pos < count) {
4447      MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4448    }
4449    list->at(start_pos) = insert;
4450    return count + 1;
4451  }
4452  if (start_pos + 1 == end_pos) {
4453    // Replace single existing range at position start_pos.
4454    CharacterRange to_replace = list->at(start_pos);
4455    int new_from = Min(to_replace.from(), from);
4456    int new_to = Max(to_replace.to(), to);
4457    list->at(start_pos) = CharacterRange(new_from, new_to);
4458    return count;
4459  }
4460  // Replace a number of existing ranges from start_pos to end_pos - 1.
4461  // Move the remaining ranges down.
4462
4463  int new_from = Min(list->at(start_pos).from(), from);
4464  int new_to = Max(list->at(end_pos - 1).to(), to);
4465  if (end_pos < count) {
4466    MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4467  }
4468  list->at(start_pos) = CharacterRange(new_from, new_to);
4469  return count - (end_pos - start_pos) + 1;
4470}
4471
4472
4473void CharacterSet::Canonicalize() {
4474  // Special/default classes are always considered canonical. The result
4475  // of calling ranges() will be sorted.
4476  if (ranges_ == NULL) return;
4477  CharacterRange::Canonicalize(ranges_);
4478}
4479
4480
4481void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
4482  if (character_ranges->length() <= 1) return;
4483  // Check whether ranges are already canonical (increasing, non-overlapping,
4484  // non-adjacent).
4485  int n = character_ranges->length();
4486  int max = character_ranges->at(0).to();
4487  int i = 1;
4488  while (i < n) {
4489    CharacterRange current = character_ranges->at(i);
4490    if (current.from() <= max + 1) {
4491      break;
4492    }
4493    max = current.to();
4494    i++;
4495  }
4496  // Canonical until the i'th range. If that's all of them, we are done.
4497  if (i == n) return;
4498
4499  // The ranges at index i and forward are not canonicalized. Make them so by
4500  // doing the equivalent of insertion sort (inserting each into the previous
4501  // list, in order).
4502  // Notice that inserting a range can reduce the number of ranges in the
4503  // result due to combining of adjacent and overlapping ranges.
4504  int read = i;  // Range to insert.
4505  int num_canonical = i;  // Length of canonicalized part of list.
4506  do {
4507    num_canonical = InsertRangeInCanonicalList(character_ranges,
4508                                               num_canonical,
4509                                               character_ranges->at(read));
4510    read++;
4511  } while (read < n);
4512  character_ranges->Rewind(num_canonical);
4513
4514  ASSERT(CharacterRange::IsCanonical(character_ranges));
4515}
4516
4517
4518// Utility function for CharacterRange::Merge. Adds a range at the end of
4519// a canonicalized range list, if necessary merging the range with the last
4520// range of the list.
4521static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
4522  if (set == NULL) return;
4523  ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
4524  int n = set->length();
4525  if (n > 0) {
4526    CharacterRange lastRange = set->at(n - 1);
4527    if (lastRange.to() == range.from() - 1) {
4528      set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
4529      return;
4530    }
4531  }
4532  set->Add(range);
4533}
4534
4535
4536static void AddRangeToSelectedSet(int selector,
4537                                  ZoneList<CharacterRange>* first_set,
4538                                  ZoneList<CharacterRange>* second_set,
4539                                  ZoneList<CharacterRange>* intersection_set,
4540                                  CharacterRange range) {
4541  switch (selector) {
4542    case kInsideFirst:
4543      AddRangeToSet(first_set, range);
4544      break;
4545    case kInsideSecond:
4546      AddRangeToSet(second_set, range);
4547      break;
4548    case kInsideBoth:
4549      AddRangeToSet(intersection_set, range);
4550      break;
4551  }
4552}
4553
4554
4555
4556void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
4557                           ZoneList<CharacterRange>* second_set,
4558                           ZoneList<CharacterRange>* first_set_only_out,
4559                           ZoneList<CharacterRange>* second_set_only_out,
4560                           ZoneList<CharacterRange>* both_sets_out) {
4561  // Inputs are canonicalized.
4562  ASSERT(CharacterRange::IsCanonical(first_set));
4563  ASSERT(CharacterRange::IsCanonical(second_set));
4564  // Outputs are empty, if applicable.
4565  ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
4566  ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
4567  ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
4568
4569  // Merge sets by iterating through the lists in order of lowest "from" value,
4570  // and putting intervals into one of three sets.
4571
4572  if (first_set->length() == 0) {
4573    second_set_only_out->AddAll(*second_set);
4574    return;
4575  }
4576  if (second_set->length() == 0) {
4577    first_set_only_out->AddAll(*first_set);
4578    return;
4579  }
4580  // Indices into input lists.
4581  int i1 = 0;
4582  int i2 = 0;
4583  // Cache length of input lists.
4584  int n1 = first_set->length();
4585  int n2 = second_set->length();
4586  // Current range. May be invalid if state is kInsideNone.
4587  int from = 0;
4588  int to = -1;
4589  // Where current range comes from.
4590  int state = kInsideNone;
4591
4592  while (i1 < n1 || i2 < n2) {
4593    CharacterRange next_range;
4594    int range_source;
4595    if (i2 == n2 ||
4596        (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
4597      // Next smallest element is in first set.
4598      next_range = first_set->at(i1++);
4599      range_source = kInsideFirst;
4600    } else {
4601      // Next smallest element is in second set.
4602      next_range = second_set->at(i2++);
4603      range_source = kInsideSecond;
4604    }
4605    if (to < next_range.from()) {
4606      // Ranges disjoint: |current|  |next|
4607      AddRangeToSelectedSet(state,
4608                            first_set_only_out,
4609                            second_set_only_out,
4610                            both_sets_out,
4611                            CharacterRange(from, to));
4612      from = next_range.from();
4613      to = next_range.to();
4614      state = range_source;
4615    } else {
4616      if (from < next_range.from()) {
4617        AddRangeToSelectedSet(state,
4618                              first_set_only_out,
4619                              second_set_only_out,
4620                              both_sets_out,
4621                              CharacterRange(from, next_range.from()-1));
4622      }
4623      if (to < next_range.to()) {
4624        // Ranges overlap:  |current|
4625        //                       |next|
4626        AddRangeToSelectedSet(state | range_source,
4627                              first_set_only_out,
4628                              second_set_only_out,
4629                              both_sets_out,
4630                              CharacterRange(next_range.from(), to));
4631        from = to + 1;
4632        to = next_range.to();
4633        state = range_source;
4634      } else {
4635        // Range included:    |current| , possibly ending at same character.
4636        //                      |next|
4637        AddRangeToSelectedSet(
4638            state | range_source,
4639            first_set_only_out,
4640            second_set_only_out,
4641            both_sets_out,
4642            CharacterRange(next_range.from(), next_range.to()));
4643        from = next_range.to() + 1;
4644        // If ranges end at same character, both ranges are consumed completely.
4645        if (next_range.to() == to) state = kInsideNone;
4646      }
4647    }
4648  }
4649  AddRangeToSelectedSet(state,
4650                        first_set_only_out,
4651                        second_set_only_out,
4652                        both_sets_out,
4653                        CharacterRange(from, to));
4654}
4655
4656
4657void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
4658                            ZoneList<CharacterRange>* negated_ranges) {
4659  ASSERT(CharacterRange::IsCanonical(ranges));
4660  ASSERT_EQ(0, negated_ranges->length());
4661  int range_count = ranges->length();
4662  uc16 from = 0;
4663  int i = 0;
4664  if (range_count > 0 && ranges->at(0).from() == 0) {
4665    from = ranges->at(0).to();
4666    i = 1;
4667  }
4668  while (i < range_count) {
4669    CharacterRange range = ranges->at(i);
4670    negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4671    from = range.to();
4672    i++;
4673  }
4674  if (from < String::kMaxUC16CharCode) {
4675    negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
4676  }
4677}
4678
4679
4680
4681// -------------------------------------------------------------------
4682// Interest propagation
4683
4684
4685RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4686  for (int i = 0; i < siblings_.length(); i++) {
4687    RegExpNode* sibling = siblings_.Get(i);
4688    if (sibling->info()->Matches(info))
4689      return sibling;
4690  }
4691  return NULL;
4692}
4693
4694
4695RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4696  ASSERT_EQ(false, *cloned);
4697  siblings_.Ensure(this);
4698  RegExpNode* result = TryGetSibling(info);
4699  if (result != NULL) return result;
4700  result = this->Clone();
4701  NodeInfo* new_info = result->info();
4702  new_info->ResetCompilationState();
4703  new_info->AddFromPreceding(info);
4704  AddSibling(result);
4705  *cloned = true;
4706  return result;
4707}
4708
4709
4710template <class C>
4711static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4712  NodeInfo full_info(*node->info());
4713  full_info.AddFromPreceding(info);
4714  bool cloned = false;
4715  return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4716}
4717
4718
4719// -------------------------------------------------------------------
4720// Splay tree
4721
4722
4723OutSet* OutSet::Extend(unsigned value) {
4724  if (Get(value))
4725    return this;
4726  if (successors() != NULL) {
4727    for (int i = 0; i < successors()->length(); i++) {
4728      OutSet* successor = successors()->at(i);
4729      if (successor->Get(value))
4730        return successor;
4731    }
4732  } else {
4733    successors_ = new ZoneList<OutSet*>(2);
4734  }
4735  OutSet* result = new OutSet(first_, remaining_);
4736  result->Set(value);
4737  successors()->Add(result);
4738  return result;
4739}
4740
4741
4742void OutSet::Set(unsigned value) {
4743  if (value < kFirstLimit) {
4744    first_ |= (1 << value);
4745  } else {
4746    if (remaining_ == NULL)
4747      remaining_ = new ZoneList<unsigned>(1);
4748    if (remaining_->is_empty() || !remaining_->Contains(value))
4749      remaining_->Add(value);
4750  }
4751}
4752
4753
4754bool OutSet::Get(unsigned value) {
4755  if (value < kFirstLimit) {
4756    return (first_ & (1 << value)) != 0;
4757  } else if (remaining_ == NULL) {
4758    return false;
4759  } else {
4760    return remaining_->Contains(value);
4761  }
4762}
4763
4764
4765const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4766const DispatchTable::Entry DispatchTable::Config::kNoValue;
4767
4768
4769void DispatchTable::AddRange(CharacterRange full_range, int value) {
4770  CharacterRange current = full_range;
4771  if (tree()->is_empty()) {
4772    // If this is the first range we just insert into the table.
4773    ZoneSplayTree<Config>::Locator loc;
4774    ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4775    loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4776    return;
4777  }
4778  // First see if there is a range to the left of this one that
4779  // overlaps.
4780  ZoneSplayTree<Config>::Locator loc;
4781  if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4782    Entry* entry = &loc.value();
4783    // If we've found a range that overlaps with this one, and it
4784    // starts strictly to the left of this one, we have to fix it
4785    // because the following code only handles ranges that start on
4786    // or after the start point of the range we're adding.
4787    if (entry->from() < current.from() && entry->to() >= current.from()) {
4788      // Snap the overlapping range in half around the start point of
4789      // the range we're adding.
4790      CharacterRange left(entry->from(), current.from() - 1);
4791      CharacterRange right(current.from(), entry->to());
4792      // The left part of the overlapping range doesn't overlap.
4793      // Truncate the whole entry to be just the left part.
4794      entry->set_to(left.to());
4795      // The right part is the one that overlaps.  We add this part
4796      // to the map and let the next step deal with merging it with
4797      // the range we're adding.
4798      ZoneSplayTree<Config>::Locator loc;
4799      ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4800      loc.set_value(Entry(right.from(),
4801                          right.to(),
4802                          entry->out_set()));
4803    }
4804  }
4805  while (current.is_valid()) {
4806    if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4807        (loc.value().from() <= current.to()) &&
4808        (loc.value().to() >= current.from())) {
4809      Entry* entry = &loc.value();
4810      // We have overlap.  If there is space between the start point of
4811      // the range we're adding and where the overlapping range starts
4812      // then we have to add a range covering just that space.
4813      if (current.from() < entry->from()) {
4814        ZoneSplayTree<Config>::Locator ins;
4815        ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4816        ins.set_value(Entry(current.from(),
4817                            entry->from() - 1,
4818                            empty()->Extend(value)));
4819        current.set_from(entry->from());
4820      }
4821      ASSERT_EQ(current.from(), entry->from());
4822      // If the overlapping range extends beyond the one we want to add
4823      // we have to snap the right part off and add it separately.
4824      if (entry->to() > current.to()) {
4825        ZoneSplayTree<Config>::Locator ins;
4826        ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4827        ins.set_value(Entry(current.to() + 1,
4828                            entry->to(),
4829                            entry->out_set()));
4830        entry->set_to(current.to());
4831      }
4832      ASSERT(entry->to() <= current.to());
4833      // The overlapping range is now completely contained by the range
4834      // we're adding so we can just update it and move the start point
4835      // of the range we're adding just past it.
4836      entry->AddValue(value);
4837      // Bail out if the last interval ended at 0xFFFF since otherwise
4838      // adding 1 will wrap around to 0.
4839      if (entry->to() == String::kMaxUC16CharCode)
4840        break;
4841      ASSERT(entry->to() + 1 > current.from());
4842      current.set_from(entry->to() + 1);
4843    } else {
4844      // There is no overlap so we can just add the range
4845      ZoneSplayTree<Config>::Locator ins;
4846      ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4847      ins.set_value(Entry(current.from(),
4848                          current.to(),
4849                          empty()->Extend(value)));
4850      break;
4851    }
4852  }
4853}
4854
4855
4856OutSet* DispatchTable::Get(uc16 value) {
4857  ZoneSplayTree<Config>::Locator loc;
4858  if (!tree()->FindGreatestLessThan(value, &loc))
4859    return empty();
4860  Entry* entry = &loc.value();
4861  if (value <= entry->to())
4862    return entry->out_set();
4863  else
4864    return empty();
4865}
4866
4867
4868// -------------------------------------------------------------------
4869// Analysis
4870
4871
4872void Analysis::EnsureAnalyzed(RegExpNode* that) {
4873  StackLimitCheck check(Isolate::Current());
4874  if (check.HasOverflowed()) {
4875    fail("Stack overflow");
4876    return;
4877  }
4878  if (that->info()->been_analyzed || that->info()->being_analyzed)
4879    return;
4880  that->info()->being_analyzed = true;
4881  that->Accept(this);
4882  that->info()->being_analyzed = false;
4883  that->info()->been_analyzed = true;
4884}
4885
4886
4887void Analysis::VisitEnd(EndNode* that) {
4888  // nothing to do
4889}
4890
4891
4892void TextNode::CalculateOffsets() {
4893  int element_count = elements()->length();
4894  // Set up the offsets of the elements relative to the start.  This is a fixed
4895  // quantity since a TextNode can only contain fixed-width things.
4896  int cp_offset = 0;
4897  for (int i = 0; i < element_count; i++) {
4898    TextElement& elm = elements()->at(i);
4899    elm.cp_offset = cp_offset;
4900    if (elm.type == TextElement::ATOM) {
4901      cp_offset += elm.data.u_atom->data().length();
4902    } else {
4903      cp_offset++;
4904      Vector<const uc16> quarks = elm.data.u_atom->data();
4905    }
4906  }
4907}
4908
4909
4910void Analysis::VisitText(TextNode* that) {
4911  if (ignore_case_) {
4912    that->MakeCaseIndependent(is_ascii_);
4913  }
4914  EnsureAnalyzed(that->on_success());
4915  if (!has_failed()) {
4916    that->CalculateOffsets();
4917  }
4918}
4919
4920
4921void Analysis::VisitAction(ActionNode* that) {
4922  RegExpNode* target = that->on_success();
4923  EnsureAnalyzed(target);
4924  if (!has_failed()) {
4925    // If the next node is interested in what it follows then this node
4926    // has to be interested too so it can pass the information on.
4927    that->info()->AddFromFollowing(target->info());
4928  }
4929}
4930
4931
4932void Analysis::VisitChoice(ChoiceNode* that) {
4933  NodeInfo* info = that->info();
4934  for (int i = 0; i < that->alternatives()->length(); i++) {
4935    RegExpNode* node = that->alternatives()->at(i).node();
4936    EnsureAnalyzed(node);
4937    if (has_failed()) return;
4938    // Anything the following nodes need to know has to be known by
4939    // this node also, so it can pass it on.
4940    info->AddFromFollowing(node->info());
4941  }
4942}
4943
4944
4945void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4946  NodeInfo* info = that->info();
4947  for (int i = 0; i < that->alternatives()->length(); i++) {
4948    RegExpNode* node = that->alternatives()->at(i).node();
4949    if (node != that->loop_node()) {
4950      EnsureAnalyzed(node);
4951      if (has_failed()) return;
4952      info->AddFromFollowing(node->info());
4953    }
4954  }
4955  // Check the loop last since it may need the value of this node
4956  // to get a correct result.
4957  EnsureAnalyzed(that->loop_node());
4958  if (!has_failed()) {
4959    info->AddFromFollowing(that->loop_node()->info());
4960  }
4961}
4962
4963
4964void Analysis::VisitBackReference(BackReferenceNode* that) {
4965  EnsureAnalyzed(that->on_success());
4966}
4967
4968
4969void Analysis::VisitAssertion(AssertionNode* that) {
4970  EnsureAnalyzed(that->on_success());
4971  AssertionNode::AssertionNodeType type = that->type();
4972  if (type == AssertionNode::AT_BOUNDARY ||
4973      type == AssertionNode::AT_NON_BOUNDARY) {
4974    // Check if the following character is known to be a word character
4975    // or known to not be a word character.
4976    ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
4977
4978    CharacterRange::Canonicalize(following_chars);
4979
4980    SetRelation word_relation =
4981        CharacterRange::WordCharacterRelation(following_chars);
4982    if (word_relation.Disjoint()) {
4983      // Includes the case where following_chars is empty (e.g., end-of-input).
4984      // Following character is definitely *not* a word character.
4985      type = (type == AssertionNode::AT_BOUNDARY) ?
4986                 AssertionNode::AFTER_WORD_CHARACTER :
4987                 AssertionNode::AFTER_NONWORD_CHARACTER;
4988      that->set_type(type);
4989    } else if (word_relation.ContainedIn()) {
4990      // Following character is definitely a word character.
4991      type = (type == AssertionNode::AT_BOUNDARY) ?
4992                 AssertionNode::AFTER_NONWORD_CHARACTER :
4993                 AssertionNode::AFTER_WORD_CHARACTER;
4994      that->set_type(type);
4995    }
4996  }
4997}
4998
4999
5000ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
5001  if (first_character_set_ == NULL) {
5002    if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
5003      // If we can't find an exact solution within the budget, we
5004      // set the value to the set of every character, i.e., all characters
5005      // are possible.
5006      ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
5007      all_set->Add(CharacterRange::Everything());
5008      first_character_set_ = all_set;
5009    }
5010  }
5011  return first_character_set_;
5012}
5013
5014
5015int RegExpNode::ComputeFirstCharacterSet(int budget) {
5016  // Default behavior is to not be able to determine the first character.
5017  return kComputeFirstCharacterSetFail;
5018}
5019
5020
5021int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
5022  budget--;
5023  if (budget >= 0) {
5024    // Find loop min-iteration. It's the value of the guarded choice node
5025    // with a GEQ guard, if any.
5026    int min_repetition = 0;
5027
5028    for (int i = 0; i <= 1; i++) {
5029      GuardedAlternative alternative = alternatives()->at(i);
5030      ZoneList<Guard*>* guards = alternative.guards();
5031      if (guards != NULL && guards->length() > 0) {
5032        Guard* guard = guards->at(0);
5033        if (guard->op() == Guard::GEQ) {
5034          min_repetition = guard->value();
5035          break;
5036        }
5037      }
5038    }
5039
5040    budget = loop_node()->ComputeFirstCharacterSet(budget);
5041    if (budget >= 0) {
5042      ZoneList<CharacterRange>* character_set =
5043          loop_node()->first_character_set();
5044      if (body_can_be_zero_length() || min_repetition == 0) {
5045        budget = continue_node()->ComputeFirstCharacterSet(budget);
5046        if (budget < 0) return budget;
5047        ZoneList<CharacterRange>* body_set =
5048            continue_node()->first_character_set();
5049        ZoneList<CharacterRange>* union_set =
5050          new ZoneList<CharacterRange>(Max(character_set->length(),
5051                                           body_set->length()));
5052        CharacterRange::Merge(character_set,
5053                              body_set,
5054                              union_set,
5055                              union_set,
5056                              union_set);
5057        character_set = union_set;
5058      }
5059      set_first_character_set(character_set);
5060    }
5061  }
5062  return budget;
5063}
5064
5065
5066int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
5067  budget--;
5068  if (budget >= 0) {
5069    GuardedAlternative successor = this->alternatives()->at(1);
5070    RegExpNode* successor_node = successor.node();
5071    budget = successor_node->ComputeFirstCharacterSet(budget);
5072    if (budget >= 0) {
5073      set_first_character_set(successor_node->first_character_set());
5074    }
5075  }
5076  return budget;
5077}
5078
5079
5080// The first character set of an EndNode is unknowable. Just use the
5081// default implementation that fails and returns all characters as possible.
5082
5083
5084int AssertionNode::ComputeFirstCharacterSet(int budget) {
5085  budget -= 1;
5086  if (budget >= 0) {
5087    switch (type_) {
5088      case AT_END: {
5089        set_first_character_set(new ZoneList<CharacterRange>(0));
5090        break;
5091      }
5092      case AT_START:
5093      case AT_BOUNDARY:
5094      case AT_NON_BOUNDARY:
5095      case AFTER_NEWLINE:
5096      case AFTER_NONWORD_CHARACTER:
5097      case AFTER_WORD_CHARACTER: {
5098        ASSERT_NOT_NULL(on_success());
5099        budget = on_success()->ComputeFirstCharacterSet(budget);
5100        if (budget >= 0) {
5101          set_first_character_set(on_success()->first_character_set());
5102        }
5103        break;
5104      }
5105    }
5106  }
5107  return budget;
5108}
5109
5110
5111int ActionNode::ComputeFirstCharacterSet(int budget) {
5112  if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
5113  budget--;
5114  if (budget >= 0) {
5115    ASSERT_NOT_NULL(on_success());
5116    budget = on_success()->ComputeFirstCharacterSet(budget);
5117    if (budget >= 0) {
5118      set_first_character_set(on_success()->first_character_set());
5119    }
5120  }
5121  return budget;
5122}
5123
5124
5125int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
5126  // We don't know anything about the first character of a backreference
5127  // at this point.
5128  // The potential first characters are the first characters of the capture,
5129  // and the first characters of the on_success node, depending on whether the
5130  // capture can be empty and whether it is known to be participating or known
5131  // not to be.
5132  return kComputeFirstCharacterSetFail;
5133}
5134
5135
5136int TextNode::ComputeFirstCharacterSet(int budget) {
5137  budget--;
5138  if (budget >= 0) {
5139    ASSERT_NE(0, elements()->length());
5140    TextElement text = elements()->at(0);
5141    if (text.type == TextElement::ATOM) {
5142      RegExpAtom* atom = text.data.u_atom;
5143      ASSERT_NE(0, atom->length());
5144      uc16 first_char = atom->data()[0];
5145      ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5146      range->Add(CharacterRange(first_char, first_char));
5147      set_first_character_set(range);
5148    } else {
5149      ASSERT(text.type == TextElement::CHAR_CLASS);
5150      RegExpCharacterClass* char_class = text.data.u_char_class;
5151      ZoneList<CharacterRange>* ranges = char_class->ranges();
5152      // TODO(lrn): Canonicalize ranges when they are created
5153      // instead of waiting until now.
5154      CharacterRange::Canonicalize(ranges);
5155      if (char_class->is_negated()) {
5156        int length = ranges->length();
5157        int new_length = length + 1;
5158        if (length > 0) {
5159          if (ranges->at(0).from() == 0) new_length--;
5160          if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
5161            new_length--;
5162          }
5163        }
5164        ZoneList<CharacterRange>* negated_ranges =
5165            new ZoneList<CharacterRange>(new_length);
5166        CharacterRange::Negate(ranges, negated_ranges);
5167        set_first_character_set(negated_ranges);
5168      } else {
5169        set_first_character_set(ranges);
5170      }
5171    }
5172  }
5173  return budget;
5174}
5175
5176
5177
5178// -------------------------------------------------------------------
5179// Dispatch table construction
5180
5181
5182void DispatchTableConstructor::VisitEnd(EndNode* that) {
5183  AddRange(CharacterRange::Everything());
5184}
5185
5186
5187void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5188  node->set_being_calculated(true);
5189  ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5190  for (int i = 0; i < alternatives->length(); i++) {
5191    set_choice_index(i);
5192    alternatives->at(i).node()->Accept(this);
5193  }
5194  node->set_being_calculated(false);
5195}
5196
5197
5198class AddDispatchRange {
5199 public:
5200  explicit AddDispatchRange(DispatchTableConstructor* constructor)
5201    : constructor_(constructor) { }
5202  void Call(uc32 from, DispatchTable::Entry entry);
5203 private:
5204  DispatchTableConstructor* constructor_;
5205};
5206
5207
5208void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5209  CharacterRange range(from, entry.to());
5210  constructor_->AddRange(range);
5211}
5212
5213
5214void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5215  if (node->being_calculated())
5216    return;
5217  DispatchTable* table = node->GetTable(ignore_case_);
5218  AddDispatchRange adder(this);
5219  table->ForEach(&adder);
5220}
5221
5222
5223void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5224  // TODO(160): Find the node that we refer back to and propagate its start
5225  // set back to here.  For now we just accept anything.
5226  AddRange(CharacterRange::Everything());
5227}
5228
5229
5230void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5231  RegExpNode* target = that->on_success();
5232  target->Accept(this);
5233}
5234
5235
5236static int CompareRangeByFrom(const CharacterRange* a,
5237                              const CharacterRange* b) {
5238  return Compare<uc16>(a->from(), b->from());
5239}
5240
5241
5242void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5243  ranges->Sort(CompareRangeByFrom);
5244  uc16 last = 0;
5245  for (int i = 0; i < ranges->length(); i++) {
5246    CharacterRange range = ranges->at(i);
5247    if (last < range.from())
5248      AddRange(CharacterRange(last, range.from() - 1));
5249    if (range.to() >= last) {
5250      if (range.to() == String::kMaxUC16CharCode) {
5251        return;
5252      } else {
5253        last = range.to() + 1;
5254      }
5255    }
5256  }
5257  AddRange(CharacterRange(last, String::kMaxUC16CharCode));
5258}
5259
5260
5261void DispatchTableConstructor::VisitText(TextNode* that) {
5262  TextElement elm = that->elements()->at(0);
5263  switch (elm.type) {
5264    case TextElement::ATOM: {
5265      uc16 c = elm.data.u_atom->data()[0];
5266      AddRange(CharacterRange(c, c));
5267      break;
5268    }
5269    case TextElement::CHAR_CLASS: {
5270      RegExpCharacterClass* tree = elm.data.u_char_class;
5271      ZoneList<CharacterRange>* ranges = tree->ranges();
5272      if (tree->is_negated()) {
5273        AddInverse(ranges);
5274      } else {
5275        for (int i = 0; i < ranges->length(); i++)
5276          AddRange(ranges->at(i));
5277      }
5278      break;
5279    }
5280    default: {
5281      UNIMPLEMENTED();
5282    }
5283  }
5284}
5285
5286
5287void DispatchTableConstructor::VisitAction(ActionNode* that) {
5288  RegExpNode* target = that->on_success();
5289  target->Accept(this);
5290}
5291
5292
5293RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
5294                                                      bool ignore_case,
5295                                                      bool is_multiline,
5296                                                      Handle<String> pattern,
5297                                                      bool is_ascii) {
5298  if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
5299    return IrregexpRegExpTooBig();
5300  }
5301  RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
5302  // Wrap the body of the regexp in capture #0.
5303  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
5304                                                    0,
5305                                                    &compiler,
5306                                                    compiler.accept());
5307  RegExpNode* node = captured_body;
5308  bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5309  bool is_start_anchored = data->tree->IsAnchoredAtStart();
5310  int max_length = data->tree->max_match();
5311  if (!is_start_anchored) {
5312    // Add a .*? at the beginning, outside the body capture, unless
5313    // this expression is anchored at the beginning.
5314    RegExpNode* loop_node =
5315        RegExpQuantifier::ToNode(0,
5316                                 RegExpTree::kInfinity,
5317                                 false,
5318                                 new RegExpCharacterClass('*'),
5319                                 &compiler,
5320                                 captured_body,
5321                                 data->contains_anchor);
5322
5323    if (data->contains_anchor) {
5324      // Unroll loop once, to take care of the case that might start
5325      // at the start of input.
5326      ChoiceNode* first_step_node = new ChoiceNode(2);
5327      first_step_node->AddAlternative(GuardedAlternative(captured_body));
5328      first_step_node->AddAlternative(GuardedAlternative(
5329          new TextNode(new RegExpCharacterClass('*'), loop_node)));
5330      node = first_step_node;
5331    } else {
5332      node = loop_node;
5333    }
5334  }
5335  data->node = node;
5336  Analysis analysis(ignore_case, is_ascii);
5337  analysis.EnsureAnalyzed(node);
5338  if (analysis.has_failed()) {
5339    const char* error_message = analysis.error_message();
5340    return CompilationResult(error_message);
5341  }
5342
5343  NodeInfo info = *node->info();
5344
5345  // Create the correct assembler for the architecture.
5346#ifndef V8_INTERPRETED_REGEXP
5347  // Native regexp implementation.
5348
5349  NativeRegExpMacroAssembler::Mode mode =
5350      is_ascii ? NativeRegExpMacroAssembler::ASCII
5351               : NativeRegExpMacroAssembler::UC16;
5352
5353#if V8_TARGET_ARCH_IA32
5354  RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
5355#elif V8_TARGET_ARCH_X64
5356  RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
5357#elif V8_TARGET_ARCH_ARM
5358  RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
5359#elif V8_TARGET_ARCH_MIPS
5360  RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2);
5361#endif
5362
5363#else  // V8_INTERPRETED_REGEXP
5364  // Interpreted regexp implementation.
5365  EmbeddedVector<byte, 1024> codes;
5366  RegExpMacroAssemblerIrregexp macro_assembler(codes);
5367#endif  // V8_INTERPRETED_REGEXP
5368
5369  // Inserted here, instead of in Assembler, because it depends on information
5370  // in the AST that isn't replicated in the Node structure.
5371  static const int kMaxBacksearchLimit = 1024;
5372  if (is_end_anchored &&
5373      !is_start_anchored &&
5374      max_length < kMaxBacksearchLimit) {
5375    macro_assembler.SetCurrentPositionFromEnd(max_length);
5376  }
5377
5378  return compiler.Assemble(&macro_assembler,
5379                           node,
5380                           data->capture_count,
5381                           pattern);
5382}
5383
5384
5385}}  // namespace v8::internal
5386