1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// A simple interpreter for the Irregexp byte code.
6
7
8#include "src/v8.h"
9
10#include "src/ast.h"
11#include "src/bytecodes-irregexp.h"
12#include "src/interpreter-irregexp.h"
13#include "src/jsregexp.h"
14#include "src/regexp-macro-assembler.h"
15#include "src/unicode.h"
16#include "src/utils.h"
17
18namespace v8 {
19namespace internal {
20
21
22typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
23
24static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
25                                 int from,
26                                 int current,
27                                 int len,
28                                 Vector<const uc16> subject) {
29  for (int i = 0; i < len; i++) {
30    unibrow::uchar old_char = subject[from++];
31    unibrow::uchar new_char = subject[current++];
32    if (old_char == new_char) continue;
33    unibrow::uchar old_string[1] = { old_char };
34    unibrow::uchar new_string[1] = { new_char };
35    interp_canonicalize->get(old_char, '\0', old_string);
36    interp_canonicalize->get(new_char, '\0', new_string);
37    if (old_string[0] != new_string[0]) {
38      return false;
39    }
40  }
41  return true;
42}
43
44
45static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
46                                 int from,
47                                 int current,
48                                 int len,
49                                 Vector<const uint8_t> subject) {
50  for (int i = 0; i < len; i++) {
51    unsigned int old_char = subject[from++];
52    unsigned int new_char = subject[current++];
53    if (old_char == new_char) continue;
54    // Convert both characters to lower case.
55    old_char |= 0x20;
56    new_char |= 0x20;
57    if (old_char != new_char) return false;
58    // Not letters in the ASCII range and Latin-1 range.
59    if (!(old_char - 'a' <= 'z' - 'a') &&
60        !(old_char - 224 <= 254 - 224 && old_char != 247)) {
61      return false;
62    }
63  }
64  return true;
65}
66
67
68#ifdef DEBUG
69static void TraceInterpreter(const byte* code_base,
70                             const byte* pc,
71                             int stack_depth,
72                             int current_position,
73                             uint32_t current_char,
74                             int bytecode_length,
75                             const char* bytecode_name) {
76  if (FLAG_trace_regexp_bytecodes) {
77    bool printable = (current_char < 127 && current_char >= 32);
78    const char* format =
79        printable ?
80        "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
81        "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
82    PrintF(format,
83           pc - code_base,
84           stack_depth,
85           current_position,
86           current_char,
87           printable ? current_char : '.',
88           bytecode_name);
89    for (int i = 0; i < bytecode_length; i++) {
90      printf(", %02x", pc[i]);
91    }
92    printf(" ");
93    for (int i = 1; i < bytecode_length; i++) {
94      unsigned char b = pc[i];
95      if (b < 127 && b >= 32) {
96        printf("%c", b);
97      } else {
98        printf(".");
99      }
100    }
101    printf("\n");
102  }
103}
104
105
106#define BYTECODE(name)                                                      \
107  case BC_##name:                                                           \
108    TraceInterpreter(code_base,                                             \
109                     pc,                                                    \
110                     static_cast<int>(backtrack_sp - backtrack_stack_base), \
111                     current,                                               \
112                     current_char,                                          \
113                     BC_##name##_LENGTH,                                    \
114                     #name);
115#else
116#define BYTECODE(name)                                                      \
117  case BC_##name:
118#endif
119
120
121static int32_t Load32Aligned(const byte* pc) {
122  DCHECK((reinterpret_cast<intptr_t>(pc) & 3) == 0);
123  return *reinterpret_cast<const int32_t *>(pc);
124}
125
126
127static int32_t Load16Aligned(const byte* pc) {
128  DCHECK((reinterpret_cast<intptr_t>(pc) & 1) == 0);
129  return *reinterpret_cast<const uint16_t *>(pc);
130}
131
132
133// A simple abstraction over the backtracking stack used by the interpreter.
134// This backtracking stack does not grow automatically, but it ensures that the
135// the memory held by the stack is released or remembered in a cache if the
136// matching terminates.
137class BacktrackStack {
138 public:
139  BacktrackStack() { data_ = NewArray<int>(kBacktrackStackSize); }
140
141  ~BacktrackStack() {
142    DeleteArray(data_);
143  }
144
145  int* data() const { return data_; }
146
147  int max_size() const { return kBacktrackStackSize; }
148
149 private:
150  static const int kBacktrackStackSize = 10000;
151
152  int* data_;
153
154  DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
155};
156
157
158template <typename Char>
159static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
160                                           const byte* code_base,
161                                           Vector<const Char> subject,
162                                           int* registers,
163                                           int current,
164                                           uint32_t current_char) {
165  const byte* pc = code_base;
166  // BacktrackStack ensures that the memory allocated for the backtracking stack
167  // is returned to the system or cached if there is no stack being cached at
168  // the moment.
169  BacktrackStack backtrack_stack;
170  int* backtrack_stack_base = backtrack_stack.data();
171  int* backtrack_sp = backtrack_stack_base;
172  int backtrack_stack_space = backtrack_stack.max_size();
173#ifdef DEBUG
174  if (FLAG_trace_regexp_bytecodes) {
175    PrintF("\n\nStart bytecode interpreter\n\n");
176  }
177#endif
178  while (true) {
179    int32_t insn = Load32Aligned(pc);
180    switch (insn & BYTECODE_MASK) {
181      BYTECODE(BREAK)
182        UNREACHABLE();
183        return RegExpImpl::RE_FAILURE;
184      BYTECODE(PUSH_CP)
185        if (--backtrack_stack_space < 0) {
186          return RegExpImpl::RE_EXCEPTION;
187        }
188        *backtrack_sp++ = current;
189        pc += BC_PUSH_CP_LENGTH;
190        break;
191      BYTECODE(PUSH_BT)
192        if (--backtrack_stack_space < 0) {
193          return RegExpImpl::RE_EXCEPTION;
194        }
195        *backtrack_sp++ = Load32Aligned(pc + 4);
196        pc += BC_PUSH_BT_LENGTH;
197        break;
198      BYTECODE(PUSH_REGISTER)
199        if (--backtrack_stack_space < 0) {
200          return RegExpImpl::RE_EXCEPTION;
201        }
202        *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
203        pc += BC_PUSH_REGISTER_LENGTH;
204        break;
205      BYTECODE(SET_REGISTER)
206        registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
207        pc += BC_SET_REGISTER_LENGTH;
208        break;
209      BYTECODE(ADVANCE_REGISTER)
210        registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
211        pc += BC_ADVANCE_REGISTER_LENGTH;
212        break;
213      BYTECODE(SET_REGISTER_TO_CP)
214        registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
215        pc += BC_SET_REGISTER_TO_CP_LENGTH;
216        break;
217      BYTECODE(SET_CP_TO_REGISTER)
218        current = registers[insn >> BYTECODE_SHIFT];
219        pc += BC_SET_CP_TO_REGISTER_LENGTH;
220        break;
221      BYTECODE(SET_REGISTER_TO_SP)
222        registers[insn >> BYTECODE_SHIFT] =
223            static_cast<int>(backtrack_sp - backtrack_stack_base);
224        pc += BC_SET_REGISTER_TO_SP_LENGTH;
225        break;
226      BYTECODE(SET_SP_TO_REGISTER)
227        backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
228        backtrack_stack_space = backtrack_stack.max_size() -
229            static_cast<int>(backtrack_sp - backtrack_stack_base);
230        pc += BC_SET_SP_TO_REGISTER_LENGTH;
231        break;
232      BYTECODE(POP_CP)
233        backtrack_stack_space++;
234        --backtrack_sp;
235        current = *backtrack_sp;
236        pc += BC_POP_CP_LENGTH;
237        break;
238      BYTECODE(POP_BT)
239        backtrack_stack_space++;
240        --backtrack_sp;
241        pc = code_base + *backtrack_sp;
242        break;
243      BYTECODE(POP_REGISTER)
244        backtrack_stack_space++;
245        --backtrack_sp;
246        registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
247        pc += BC_POP_REGISTER_LENGTH;
248        break;
249      BYTECODE(FAIL)
250        return RegExpImpl::RE_FAILURE;
251      BYTECODE(SUCCEED)
252        return RegExpImpl::RE_SUCCESS;
253      BYTECODE(ADVANCE_CP)
254        current += insn >> BYTECODE_SHIFT;
255        pc += BC_ADVANCE_CP_LENGTH;
256        break;
257      BYTECODE(GOTO)
258        pc = code_base + Load32Aligned(pc + 4);
259        break;
260      BYTECODE(ADVANCE_CP_AND_GOTO)
261        current += insn >> BYTECODE_SHIFT;
262        pc = code_base + Load32Aligned(pc + 4);
263        break;
264      BYTECODE(CHECK_GREEDY)
265        if (current == backtrack_sp[-1]) {
266          backtrack_sp--;
267          backtrack_stack_space++;
268          pc = code_base + Load32Aligned(pc + 4);
269        } else {
270          pc += BC_CHECK_GREEDY_LENGTH;
271        }
272        break;
273      BYTECODE(LOAD_CURRENT_CHAR) {
274        int pos = current + (insn >> BYTECODE_SHIFT);
275        if (pos >= subject.length()) {
276          pc = code_base + Load32Aligned(pc + 4);
277        } else {
278          current_char = subject[pos];
279          pc += BC_LOAD_CURRENT_CHAR_LENGTH;
280        }
281        break;
282      }
283      BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
284        int pos = current + (insn >> BYTECODE_SHIFT);
285        current_char = subject[pos];
286        pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
287        break;
288      }
289      BYTECODE(LOAD_2_CURRENT_CHARS) {
290        int pos = current + (insn >> BYTECODE_SHIFT);
291        if (pos + 2 > subject.length()) {
292          pc = code_base + Load32Aligned(pc + 4);
293        } else {
294          Char next = subject[pos + 1];
295          current_char =
296              (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
297          pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
298        }
299        break;
300      }
301      BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
302        int pos = current + (insn >> BYTECODE_SHIFT);
303        Char next = subject[pos + 1];
304        current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
305        pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
306        break;
307      }
308      BYTECODE(LOAD_4_CURRENT_CHARS) {
309        DCHECK(sizeof(Char) == 1);
310        int pos = current + (insn >> BYTECODE_SHIFT);
311        if (pos + 4 > subject.length()) {
312          pc = code_base + Load32Aligned(pc + 4);
313        } else {
314          Char next1 = subject[pos + 1];
315          Char next2 = subject[pos + 2];
316          Char next3 = subject[pos + 3];
317          current_char = (subject[pos] |
318                          (next1 << 8) |
319                          (next2 << 16) |
320                          (next3 << 24));
321          pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
322        }
323        break;
324      }
325      BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
326        DCHECK(sizeof(Char) == 1);
327        int pos = current + (insn >> BYTECODE_SHIFT);
328        Char next1 = subject[pos + 1];
329        Char next2 = subject[pos + 2];
330        Char next3 = subject[pos + 3];
331        current_char = (subject[pos] |
332                        (next1 << 8) |
333                        (next2 << 16) |
334                        (next3 << 24));
335        pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
336        break;
337      }
338      BYTECODE(CHECK_4_CHARS) {
339        uint32_t c = Load32Aligned(pc + 4);
340        if (c == current_char) {
341          pc = code_base + Load32Aligned(pc + 8);
342        } else {
343          pc += BC_CHECK_4_CHARS_LENGTH;
344        }
345        break;
346      }
347      BYTECODE(CHECK_CHAR) {
348        uint32_t c = (insn >> BYTECODE_SHIFT);
349        if (c == current_char) {
350          pc = code_base + Load32Aligned(pc + 4);
351        } else {
352          pc += BC_CHECK_CHAR_LENGTH;
353        }
354        break;
355      }
356      BYTECODE(CHECK_NOT_4_CHARS) {
357        uint32_t c = Load32Aligned(pc + 4);
358        if (c != current_char) {
359          pc = code_base + Load32Aligned(pc + 8);
360        } else {
361          pc += BC_CHECK_NOT_4_CHARS_LENGTH;
362        }
363        break;
364      }
365      BYTECODE(CHECK_NOT_CHAR) {
366        uint32_t c = (insn >> BYTECODE_SHIFT);
367        if (c != current_char) {
368          pc = code_base + Load32Aligned(pc + 4);
369        } else {
370          pc += BC_CHECK_NOT_CHAR_LENGTH;
371        }
372        break;
373      }
374      BYTECODE(AND_CHECK_4_CHARS) {
375        uint32_t c = Load32Aligned(pc + 4);
376        if (c == (current_char & Load32Aligned(pc + 8))) {
377          pc = code_base + Load32Aligned(pc + 12);
378        } else {
379          pc += BC_AND_CHECK_4_CHARS_LENGTH;
380        }
381        break;
382      }
383      BYTECODE(AND_CHECK_CHAR) {
384        uint32_t c = (insn >> BYTECODE_SHIFT);
385        if (c == (current_char & Load32Aligned(pc + 4))) {
386          pc = code_base + Load32Aligned(pc + 8);
387        } else {
388          pc += BC_AND_CHECK_CHAR_LENGTH;
389        }
390        break;
391      }
392      BYTECODE(AND_CHECK_NOT_4_CHARS) {
393        uint32_t c = Load32Aligned(pc + 4);
394        if (c != (current_char & Load32Aligned(pc + 8))) {
395          pc = code_base + Load32Aligned(pc + 12);
396        } else {
397          pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
398        }
399        break;
400      }
401      BYTECODE(AND_CHECK_NOT_CHAR) {
402        uint32_t c = (insn >> BYTECODE_SHIFT);
403        if (c != (current_char & Load32Aligned(pc + 4))) {
404          pc = code_base + Load32Aligned(pc + 8);
405        } else {
406          pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
407        }
408        break;
409      }
410      BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
411        uint32_t c = (insn >> BYTECODE_SHIFT);
412        uint32_t minus = Load16Aligned(pc + 4);
413        uint32_t mask = Load16Aligned(pc + 6);
414        if (c != ((current_char - minus) & mask)) {
415          pc = code_base + Load32Aligned(pc + 8);
416        } else {
417          pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
418        }
419        break;
420      }
421      BYTECODE(CHECK_CHAR_IN_RANGE) {
422        uint32_t from = Load16Aligned(pc + 4);
423        uint32_t to = Load16Aligned(pc + 6);
424        if (from <= current_char && current_char <= to) {
425          pc = code_base + Load32Aligned(pc + 8);
426        } else {
427          pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
428        }
429        break;
430      }
431      BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
432        uint32_t from = Load16Aligned(pc + 4);
433        uint32_t to = Load16Aligned(pc + 6);
434        if (from > current_char || current_char > to) {
435          pc = code_base + Load32Aligned(pc + 8);
436        } else {
437          pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
438        }
439        break;
440      }
441      BYTECODE(CHECK_BIT_IN_TABLE) {
442        int mask = RegExpMacroAssembler::kTableMask;
443        byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
444        int bit = (current_char & (kBitsPerByte - 1));
445        if ((b & (1 << bit)) != 0) {
446          pc = code_base + Load32Aligned(pc + 4);
447        } else {
448          pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
449        }
450        break;
451      }
452      BYTECODE(CHECK_LT) {
453        uint32_t limit = (insn >> BYTECODE_SHIFT);
454        if (current_char < limit) {
455          pc = code_base + Load32Aligned(pc + 4);
456        } else {
457          pc += BC_CHECK_LT_LENGTH;
458        }
459        break;
460      }
461      BYTECODE(CHECK_GT) {
462        uint32_t limit = (insn >> BYTECODE_SHIFT);
463        if (current_char > limit) {
464          pc = code_base + Load32Aligned(pc + 4);
465        } else {
466          pc += BC_CHECK_GT_LENGTH;
467        }
468        break;
469      }
470      BYTECODE(CHECK_REGISTER_LT)
471        if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
472          pc = code_base + Load32Aligned(pc + 8);
473        } else {
474          pc += BC_CHECK_REGISTER_LT_LENGTH;
475        }
476        break;
477      BYTECODE(CHECK_REGISTER_GE)
478        if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
479          pc = code_base + Load32Aligned(pc + 8);
480        } else {
481          pc += BC_CHECK_REGISTER_GE_LENGTH;
482        }
483        break;
484      BYTECODE(CHECK_REGISTER_EQ_POS)
485        if (registers[insn >> BYTECODE_SHIFT] == current) {
486          pc = code_base + Load32Aligned(pc + 4);
487        } else {
488          pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
489        }
490        break;
491      BYTECODE(CHECK_NOT_REGS_EQUAL)
492        if (registers[insn >> BYTECODE_SHIFT] ==
493            registers[Load32Aligned(pc + 4)]) {
494          pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
495        } else {
496          pc = code_base + Load32Aligned(pc + 8);
497        }
498        break;
499      BYTECODE(CHECK_NOT_BACK_REF) {
500        int from = registers[insn >> BYTECODE_SHIFT];
501        int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
502        if (from < 0 || len <= 0) {
503          pc += BC_CHECK_NOT_BACK_REF_LENGTH;
504          break;
505        }
506        if (current + len > subject.length()) {
507          pc = code_base + Load32Aligned(pc + 4);
508          break;
509        } else {
510          int i;
511          for (i = 0; i < len; i++) {
512            if (subject[from + i] != subject[current + i]) {
513              pc = code_base + Load32Aligned(pc + 4);
514              break;
515            }
516          }
517          if (i < len) break;
518          current += len;
519        }
520        pc += BC_CHECK_NOT_BACK_REF_LENGTH;
521        break;
522      }
523      BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
524        int from = registers[insn >> BYTECODE_SHIFT];
525        int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
526        if (from < 0 || len <= 0) {
527          pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
528          break;
529        }
530        if (current + len > subject.length()) {
531          pc = code_base + Load32Aligned(pc + 4);
532          break;
533        } else {
534          if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
535                                   from, current, len, subject)) {
536            current += len;
537            pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
538          } else {
539            pc = code_base + Load32Aligned(pc + 4);
540          }
541        }
542        break;
543      }
544      BYTECODE(CHECK_AT_START)
545        if (current == 0) {
546          pc = code_base + Load32Aligned(pc + 4);
547        } else {
548          pc += BC_CHECK_AT_START_LENGTH;
549        }
550        break;
551      BYTECODE(CHECK_NOT_AT_START)
552        if (current == 0) {
553          pc += BC_CHECK_NOT_AT_START_LENGTH;
554        } else {
555          pc = code_base + Load32Aligned(pc + 4);
556        }
557        break;
558      BYTECODE(SET_CURRENT_POSITION_FROM_END) {
559        int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
560        if (subject.length() - current > by) {
561          current = subject.length() - by;
562          current_char = subject[current - 1];
563        }
564        pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
565        break;
566      }
567      default:
568        UNREACHABLE();
569        break;
570    }
571  }
572}
573
574
575RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
576    Isolate* isolate,
577    Handle<ByteArray> code_array,
578    Handle<String> subject,
579    int* registers,
580    int start_position) {
581  DCHECK(subject->IsFlat());
582
583  DisallowHeapAllocation no_gc;
584  const byte* code_base = code_array->GetDataStartAddress();
585  uc16 previous_char = '\n';
586  String::FlatContent subject_content = subject->GetFlatContent();
587  if (subject_content.IsOneByte()) {
588    Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
589    if (start_position != 0) previous_char = subject_vector[start_position - 1];
590    return RawMatch(isolate,
591                    code_base,
592                    subject_vector,
593                    registers,
594                    start_position,
595                    previous_char);
596  } else {
597    DCHECK(subject_content.IsTwoByte());
598    Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
599    if (start_position != 0) previous_char = subject_vector[start_position - 1];
600    return RawMatch(isolate,
601                    code_base,
602                    subject_vector,
603                    registers,
604                    start_position,
605                    previous_char);
606  }
607}
608
609} }  // namespace v8::internal
610