1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-macro-assembler.h"
6
7#include "src/assembler.h"
8#include "src/isolate-inl.h"
9#include "src/regexp/regexp-stack.h"
10#include "src/simulator.h"
11
12#ifdef V8_I18N_SUPPORT
13#include "unicode/uchar.h"
14#endif  // V8_I18N_SUPPORT
15
16namespace v8 {
17namespace internal {
18
19RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20    : slow_safe_compiler_(false),
21      global_mode_(NOT_GLOBAL),
22      isolate_(isolate),
23      zone_(zone) {}
24
25
26RegExpMacroAssembler::~RegExpMacroAssembler() {
27}
28
29
30int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31                                                     Address byte_offset2,
32                                                     size_t byte_length,
33                                                     Isolate* isolate) {
34  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35      isolate->regexp_macro_assembler_canonicalize();
36  // This function is not allowed to cause a garbage collection.
37  // A GC might move the calling generated code and invalidate the
38  // return address on the stack.
39  DCHECK(byte_length % 2 == 0);
40  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42  size_t length = byte_length >> 1;
43
44#ifdef V8_I18N_SUPPORT
45  if (isolate == nullptr) {
46    for (size_t i = 0; i < length; i++) {
47      uc32 c1 = substring1[i];
48      uc32 c2 = substring2[i];
49      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50        // Non-BMP characters do not have case-equivalents in the BMP.
51        // Both have to be non-BMP for them to be able to match.
52        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53        if (i + 1 < length) {
54          uc16 c1t = substring1[i + 1];
55          uc16 c2t = substring2[i + 1];
56          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57              unibrow::Utf16::IsTrailSurrogate(c2t)) {
58            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60            i++;
61          }
62        }
63      }
64      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66      if (c1 != c2) return 0;
67    }
68    return 1;
69  }
70#endif  // V8_I18N_SUPPORT
71  DCHECK_NOT_NULL(isolate);
72  for (size_t i = 0; i < length; i++) {
73    unibrow::uchar c1 = substring1[i];
74    unibrow::uchar c2 = substring2[i];
75    if (c1 != c2) {
76      unibrow::uchar s1[1] = {c1};
77      canonicalize->get(c1, '\0', s1);
78      if (s1[0] != c2) {
79        unibrow::uchar s2[1] = {c2};
80        canonicalize->get(c2, '\0', s2);
81        if (s1[0] != s2[0]) {
82          return 0;
83        }
84      }
85    }
86  }
87  return 1;
88}
89
90
91void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92                                                   Label* on_failure) {
93  Label ok;
94  // Check that current character is not a trail surrogate.
95  LoadCurrentCharacter(cp_offset, &ok);
96  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97  // Check that previous character is not a lead surrogate.
98  LoadCurrentCharacter(cp_offset - 1, &ok);
99  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100  Bind(&ok);
101}
102
103void RegExpMacroAssembler::CheckPosition(int cp_offset,
104                                         Label* on_outside_input) {
105  LoadCurrentCharacter(cp_offset, on_outside_input, true);
106}
107
108bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109                                                      Label* on_no_match) {
110  return false;
111}
112
113#ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
114
115NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116                                                       Zone* zone)
117    : RegExpMacroAssembler(isolate, zone) {}
118
119
120NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121}
122
123
124bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125  return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
126}
127
128const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129    String* subject,
130    int start_index) {
131  if (subject->IsConsString()) {
132    subject = ConsString::cast(subject)->first();
133  } else if (subject->IsSlicedString()) {
134    start_index += SlicedString::cast(subject)->offset();
135    subject = SlicedString::cast(subject)->parent();
136  }
137  if (subject->IsThinString()) {
138    subject = ThinString::cast(subject)->actual();
139  }
140  DCHECK(start_index >= 0);
141  DCHECK(start_index <= subject->length());
142  if (subject->IsSeqOneByteString()) {
143    return reinterpret_cast<const byte*>(
144        SeqOneByteString::cast(subject)->GetChars() + start_index);
145  } else if (subject->IsSeqTwoByteString()) {
146    return reinterpret_cast<const byte*>(
147        SeqTwoByteString::cast(subject)->GetChars() + start_index);
148  } else if (subject->IsExternalOneByteString()) {
149    return reinterpret_cast<const byte*>(
150        ExternalOneByteString::cast(subject)->GetChars() + start_index);
151  } else {
152    DCHECK(subject->IsExternalTwoByteString());
153    return reinterpret_cast<const byte*>(
154        ExternalTwoByteString::cast(subject)->GetChars() + start_index);
155  }
156}
157
158
159int NativeRegExpMacroAssembler::CheckStackGuardState(
160    Isolate* isolate, int start_index, bool is_direct_call,
161    Address* return_address, Code* re_code, String** subject,
162    const byte** input_start, const byte** input_end) {
163  DCHECK(re_code->instruction_start() <= *return_address);
164  DCHECK(*return_address <= re_code->instruction_end());
165  int return_value = 0;
166  // Prepare for possible GC.
167  HandleScope handles(isolate);
168  Handle<Code> code_handle(re_code);
169  Handle<String> subject_handle(*subject);
170  bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
171
172  StackLimitCheck check(isolate);
173  if (check.JsHasOverflowed()) {
174    isolate->StackOverflow();
175    return_value = EXCEPTION;
176  } else if (is_direct_call) {
177    // If not real stack overflow the stack guard was used to interrupt
178    // execution for another purpose.  If this is a direct call from JavaScript
179    // retry the RegExp forcing the call through the runtime system.
180    // Currently the direct call cannot handle a GC.
181    return_value = RETRY;
182  } else {
183    Object* result = isolate->stack_guard()->HandleInterrupts();
184    if (result->IsException(isolate)) return_value = EXCEPTION;
185  }
186
187  DisallowHeapAllocation no_gc;
188
189  if (*code_handle != re_code) {  // Return address no longer valid
190    intptr_t delta = code_handle->address() - re_code->address();
191    // Overwrite the return address on the stack.
192    *return_address += delta;
193  }
194
195  // If we continue, we need to update the subject string addresses.
196  if (return_value == 0) {
197    // String encoding might have changed.
198    if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
199      // If we changed between an LATIN1 and an UC16 string, the specialized
200      // code cannot be used, and we need to restart regexp matching from
201      // scratch (including, potentially, compiling a new version of the code).
202      return_value = RETRY;
203    } else {
204      *subject = *subject_handle;
205      intptr_t byte_length = *input_end - *input_start;
206      *input_start = StringCharacterPosition(*subject, start_index);
207      *input_end = *input_start + byte_length;
208    }
209  }
210  return return_value;
211}
212
213
214NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
215    Handle<Code> regexp_code,
216    Handle<String> subject,
217    int* offsets_vector,
218    int offsets_vector_length,
219    int previous_index,
220    Isolate* isolate) {
221
222  DCHECK(subject->IsFlat());
223  DCHECK(previous_index >= 0);
224  DCHECK(previous_index <= subject->length());
225
226  // No allocations before calling the regexp, but we can't use
227  // DisallowHeapAllocation, since regexps might be preempted, and another
228  // thread might do allocation anyway.
229
230  String* subject_ptr = *subject;
231  // Character offsets into string.
232  int start_offset = previous_index;
233  int char_length = subject_ptr->length() - start_offset;
234  int slice_offset = 0;
235
236  // The string has been flattened, so if it is a cons string it contains the
237  // full string in the first part.
238  if (StringShape(subject_ptr).IsCons()) {
239    DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
240    subject_ptr = ConsString::cast(subject_ptr)->first();
241  } else if (StringShape(subject_ptr).IsSliced()) {
242    SlicedString* slice = SlicedString::cast(subject_ptr);
243    subject_ptr = slice->parent();
244    slice_offset = slice->offset();
245  }
246  if (StringShape(subject_ptr).IsThin()) {
247    subject_ptr = ThinString::cast(subject_ptr)->actual();
248  }
249  // Ensure that an underlying string has the same representation.
250  bool is_one_byte = subject_ptr->IsOneByteRepresentation();
251  DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
252  // String is now either Sequential or External
253  int char_size_shift = is_one_byte ? 0 : 1;
254
255  const byte* input_start =
256      StringCharacterPosition(subject_ptr, start_offset + slice_offset);
257  int byte_length = char_length << char_size_shift;
258  const byte* input_end = input_start + byte_length;
259  Result res = Execute(*regexp_code,
260                       *subject,
261                       start_offset,
262                       input_start,
263                       input_end,
264                       offsets_vector,
265                       offsets_vector_length,
266                       isolate);
267  return res;
268}
269
270
271NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
272    Code* code,
273    String* input,  // This needs to be the unpacked (sliced, cons) string.
274    int start_offset,
275    const byte* input_start,
276    const byte* input_end,
277    int* output,
278    int output_size,
279    Isolate* isolate) {
280  // Ensure that the minimum stack has been allocated.
281  RegExpStackScope stack_scope(isolate);
282  Address stack_base = stack_scope.stack()->stack_base();
283
284  int direct_call = 0;
285  int result = CALL_GENERATED_REGEXP_CODE(
286      isolate, code->entry(), input, start_offset, input_start, input_end,
287      output, output_size, stack_base, direct_call, isolate);
288  DCHECK(result >= RETRY);
289
290  if (result == EXCEPTION && !isolate->has_pending_exception()) {
291    // We detected a stack overflow (on the backtrack stack) in RegExp code,
292    // but haven't created the exception yet.
293    isolate->StackOverflow();
294  }
295  return static_cast<Result>(result);
296}
297
298
299const byte NativeRegExpMacroAssembler::word_character_map[] = {
300    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
301    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304
305    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
308    0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
309
310    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
311    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
312    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
313    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
314
315    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
316    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
317    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
318    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
319    // Latin-1 range
320    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324
325    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329
330    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334
335    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339};
340
341
342Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
343                                              Address* stack_base,
344                                              Isolate* isolate) {
345  RegExpStack* regexp_stack = isolate->regexp_stack();
346  size_t size = regexp_stack->stack_capacity();
347  Address old_stack_base = regexp_stack->stack_base();
348  DCHECK(old_stack_base == *stack_base);
349  DCHECK(stack_pointer <= old_stack_base);
350  DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
351  Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
352  if (new_stack_base == NULL) {
353    return NULL;
354  }
355  *stack_base = new_stack_base;
356  intptr_t stack_content_size = old_stack_base - stack_pointer;
357  return new_stack_base - stack_content_size;
358}
359
360#endif  // V8_INTERPRETED_REGEXP
361
362}  // namespace internal
363}  // namespace v8
364