1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-macro-assembler.h"
6
7#include "src/assembler.h"
8#include "src/isolate-inl.h"
9#include "src/regexp/regexp-stack.h"
10#include "src/simulator.h"
11
12#ifdef V8_I18N_SUPPORT
13#include "unicode/uchar.h"
14#endif  // V8_I18N_SUPPORT
15
16namespace v8 {
17namespace internal {
18
19RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20    : slow_safe_compiler_(false),
21      global_mode_(NOT_GLOBAL),
22      isolate_(isolate),
23      zone_(zone) {}
24
25
26RegExpMacroAssembler::~RegExpMacroAssembler() {
27}
28
29
30int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31                                                     Address byte_offset2,
32                                                     size_t byte_length,
33                                                     Isolate* isolate) {
34  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35      isolate->regexp_macro_assembler_canonicalize();
36  // This function is not allowed to cause a garbage collection.
37  // A GC might move the calling generated code and invalidate the
38  // return address on the stack.
39  DCHECK(byte_length % 2 == 0);
40  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42  size_t length = byte_length >> 1;
43
44#ifdef V8_I18N_SUPPORT
45  if (isolate == nullptr) {
46    for (size_t i = 0; i < length; i++) {
47      uc32 c1 = substring1[i];
48      uc32 c2 = substring2[i];
49      if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50        // Non-BMP characters do not have case-equivalents in the BMP.
51        // Both have to be non-BMP for them to be able to match.
52        if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53        if (i + 1 < length) {
54          uc16 c1t = substring1[i + 1];
55          uc16 c2t = substring2[i + 1];
56          if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57              unibrow::Utf16::IsTrailSurrogate(c2t)) {
58            c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59            c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60            i++;
61          }
62        }
63      }
64      c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65      c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66      if (c1 != c2) return 0;
67    }
68    return 1;
69  }
70#endif  // V8_I18N_SUPPORT
71  DCHECK_NOT_NULL(isolate);
72  for (size_t i = 0; i < length; i++) {
73    unibrow::uchar c1 = substring1[i];
74    unibrow::uchar c2 = substring2[i];
75    if (c1 != c2) {
76      unibrow::uchar s1[1] = {c1};
77      canonicalize->get(c1, '\0', s1);
78      if (s1[0] != c2) {
79        unibrow::uchar s2[1] = {c2};
80        canonicalize->get(c2, '\0', s2);
81        if (s1[0] != s2[0]) {
82          return 0;
83        }
84      }
85    }
86  }
87  return 1;
88}
89
90
91void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92                                                   Label* on_failure) {
93  Label ok;
94  // Check that current character is not a trail surrogate.
95  LoadCurrentCharacter(cp_offset, &ok);
96  CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97  // Check that previous character is not a lead surrogate.
98  LoadCurrentCharacter(cp_offset - 1, &ok);
99  CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100  Bind(&ok);
101}
102
103void RegExpMacroAssembler::CheckPosition(int cp_offset,
104                                         Label* on_outside_input) {
105  LoadCurrentCharacter(cp_offset, on_outside_input, true);
106}
107
108bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109                                                      Label* on_no_match) {
110  return false;
111}
112
113#ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
114
115NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116                                                       Zone* zone)
117    : RegExpMacroAssembler(isolate, zone) {}
118
119
120NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121}
122
123
124bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125  return FLAG_enable_unaligned_accesses && !slow_safe();
126}
127
128const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129    String* subject,
130    int start_index) {
131  if (subject->IsConsString()) {
132    subject = ConsString::cast(subject)->first();
133  } else if (subject->IsSlicedString()) {
134    start_index += SlicedString::cast(subject)->offset();
135    subject = SlicedString::cast(subject)->parent();
136  }
137  DCHECK(start_index >= 0);
138  DCHECK(start_index <= subject->length());
139  if (subject->IsSeqOneByteString()) {
140    return reinterpret_cast<const byte*>(
141        SeqOneByteString::cast(subject)->GetChars() + start_index);
142  } else if (subject->IsSeqTwoByteString()) {
143    return reinterpret_cast<const byte*>(
144        SeqTwoByteString::cast(subject)->GetChars() + start_index);
145  } else if (subject->IsExternalOneByteString()) {
146    return reinterpret_cast<const byte*>(
147        ExternalOneByteString::cast(subject)->GetChars() + start_index);
148  } else {
149    return reinterpret_cast<const byte*>(
150        ExternalTwoByteString::cast(subject)->GetChars() + start_index);
151  }
152}
153
154
155int NativeRegExpMacroAssembler::CheckStackGuardState(
156    Isolate* isolate, int start_index, bool is_direct_call,
157    Address* return_address, Code* re_code, String** subject,
158    const byte** input_start, const byte** input_end) {
159  DCHECK(re_code->instruction_start() <= *return_address);
160  DCHECK(*return_address <= re_code->instruction_end());
161  int return_value = 0;
162  // Prepare for possible GC.
163  HandleScope handles(isolate);
164  Handle<Code> code_handle(re_code);
165  Handle<String> subject_handle(*subject);
166  bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
167
168  StackLimitCheck check(isolate);
169  if (check.JsHasOverflowed()) {
170    isolate->StackOverflow();
171    return_value = EXCEPTION;
172  } else if (is_direct_call) {
173    // If not real stack overflow the stack guard was used to interrupt
174    // execution for another purpose.  If this is a direct call from JavaScript
175    // retry the RegExp forcing the call through the runtime system.
176    // Currently the direct call cannot handle a GC.
177    return_value = RETRY;
178  } else {
179    Object* result = isolate->stack_guard()->HandleInterrupts();
180    if (result->IsException(isolate)) return_value = EXCEPTION;
181  }
182
183  DisallowHeapAllocation no_gc;
184
185  if (*code_handle != re_code) {  // Return address no longer valid
186    intptr_t delta = code_handle->address() - re_code->address();
187    // Overwrite the return address on the stack.
188    *return_address += delta;
189  }
190
191  // If we continue, we need to update the subject string addresses.
192  if (return_value == 0) {
193    // String encoding might have changed.
194    if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
195      // If we changed between an LATIN1 and an UC16 string, the specialized
196      // code cannot be used, and we need to restart regexp matching from
197      // scratch (including, potentially, compiling a new version of the code).
198      return_value = RETRY;
199    } else {
200      *subject = *subject_handle;
201      intptr_t byte_length = *input_end - *input_start;
202      *input_start = StringCharacterPosition(*subject, start_index);
203      *input_end = *input_start + byte_length;
204    }
205  }
206  return return_value;
207}
208
209
210NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
211    Handle<Code> regexp_code,
212    Handle<String> subject,
213    int* offsets_vector,
214    int offsets_vector_length,
215    int previous_index,
216    Isolate* isolate) {
217
218  DCHECK(subject->IsFlat());
219  DCHECK(previous_index >= 0);
220  DCHECK(previous_index <= subject->length());
221
222  // No allocations before calling the regexp, but we can't use
223  // DisallowHeapAllocation, since regexps might be preempted, and another
224  // thread might do allocation anyway.
225
226  String* subject_ptr = *subject;
227  // Character offsets into string.
228  int start_offset = previous_index;
229  int char_length = subject_ptr->length() - start_offset;
230  int slice_offset = 0;
231
232  // The string has been flattened, so if it is a cons string it contains the
233  // full string in the first part.
234  if (StringShape(subject_ptr).IsCons()) {
235    DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
236    subject_ptr = ConsString::cast(subject_ptr)->first();
237  } else if (StringShape(subject_ptr).IsSliced()) {
238    SlicedString* slice = SlicedString::cast(subject_ptr);
239    subject_ptr = slice->parent();
240    slice_offset = slice->offset();
241  }
242  // Ensure that an underlying string has the same representation.
243  bool is_one_byte = subject_ptr->IsOneByteRepresentation();
244  DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
245  // String is now either Sequential or External
246  int char_size_shift = is_one_byte ? 0 : 1;
247
248  const byte* input_start =
249      StringCharacterPosition(subject_ptr, start_offset + slice_offset);
250  int byte_length = char_length << char_size_shift;
251  const byte* input_end = input_start + byte_length;
252  Result res = Execute(*regexp_code,
253                       *subject,
254                       start_offset,
255                       input_start,
256                       input_end,
257                       offsets_vector,
258                       offsets_vector_length,
259                       isolate);
260  return res;
261}
262
263
264NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
265    Code* code,
266    String* input,  // This needs to be the unpacked (sliced, cons) string.
267    int start_offset,
268    const byte* input_start,
269    const byte* input_end,
270    int* output,
271    int output_size,
272    Isolate* isolate) {
273  // Ensure that the minimum stack has been allocated.
274  RegExpStackScope stack_scope(isolate);
275  Address stack_base = stack_scope.stack()->stack_base();
276
277  int direct_call = 0;
278  int result = CALL_GENERATED_REGEXP_CODE(
279      isolate, code->entry(), input, start_offset, input_start, input_end,
280      output, output_size, stack_base, direct_call, isolate);
281  DCHECK(result >= RETRY);
282
283  if (result == EXCEPTION && !isolate->has_pending_exception()) {
284    // We detected a stack overflow (on the backtrack stack) in RegExp code,
285    // but haven't created the exception yet.
286    isolate->StackOverflow();
287  }
288  return static_cast<Result>(result);
289}
290
291
292const byte NativeRegExpMacroAssembler::word_character_map[] = {
293    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
294    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
295    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
296    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
297
298    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
299    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
300    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
301    0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
302
303    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
304    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
305    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
306    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
307
308    0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
309    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
310    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
311    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
312    // Latin-1 range
313    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
314    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
315    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
316    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317
318    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
319    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
320    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322
323    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327
328    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332};
333
334
335Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
336                                              Address* stack_base,
337                                              Isolate* isolate) {
338  RegExpStack* regexp_stack = isolate->regexp_stack();
339  size_t size = regexp_stack->stack_capacity();
340  Address old_stack_base = regexp_stack->stack_base();
341  DCHECK(old_stack_base == *stack_base);
342  DCHECK(stack_pointer <= old_stack_base);
343  DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
344  Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
345  if (new_stack_base == NULL) {
346    return NULL;
347  }
348  *stack_base = new_stack_base;
349  intptr_t stack_content_size = old_stack_base - stack_pointer;
350  return new_stack_base - stack_content_size;
351}
352
353#endif  // V8_INTERPRETED_REGEXP
354
355}  // namespace internal
356}  // namespace v8
357