1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/v8.h"
6
7#include "src/scanner-character-streams.h"
8
9#include "include/v8.h"
10#include "src/handles.h"
11#include "src/unicode-inl.h"
12
13namespace v8 {
14namespace internal {
15
16namespace {
17
18unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
19                         unsigned* src_pos, unsigned src_length,
20                         ScriptCompiler::StreamedSource::Encoding encoding) {
21  if (encoding == ScriptCompiler::StreamedSource::UTF8) {
22    return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
23        dest, length, src, src_pos, src_length);
24  }
25
26  unsigned to_fill = length;
27  if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
28
29  if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
30    v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
31  } else {
32    DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
33    v8::internal::CopyChars<uint16_t, uint16_t>(
34        dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
35  }
36  *src_pos += to_fill;
37  return to_fill;
38}
39
40}  // namespace
41
42
43// ----------------------------------------------------------------------------
44// BufferedUtf16CharacterStreams
45
46BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
47    : Utf16CharacterStream(),
48      pushback_limit_(NULL) {
49  // Initialize buffer as being empty. First read will fill the buffer.
50  buffer_cursor_ = buffer_;
51  buffer_end_ = buffer_;
52}
53
54
55BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
56
57void BufferedUtf16CharacterStream::PushBack(uc32 character) {
58  if (character == kEndOfInput) {
59    pos_--;
60    return;
61  }
62  if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
63    // buffer_ is writable, buffer_cursor_ is const pointer.
64    buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
65    pos_--;
66    return;
67  }
68  SlowPushBack(static_cast<uc16>(character));
69}
70
71
72void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
73  // In pushback mode, the end of the buffer contains pushback,
74  // and the start of the buffer (from buffer start to pushback_limit_)
75  // contains valid data that comes just after the pushback.
76  // We NULL the pushback_limit_ if pushing all the way back to the
77  // start of the buffer.
78
79  if (pushback_limit_ == NULL) {
80    // Enter pushback mode.
81    pushback_limit_ = buffer_end_;
82    buffer_end_ = buffer_ + kBufferSize;
83    buffer_cursor_ = buffer_end_;
84  }
85  // Ensure that there is room for at least one pushback.
86  DCHECK(buffer_cursor_ > buffer_);
87  DCHECK(pos_ > 0);
88  buffer_[--buffer_cursor_ - buffer_] = character;
89  if (buffer_cursor_ == buffer_) {
90    pushback_limit_ = NULL;
91  } else if (buffer_cursor_ < pushback_limit_) {
92    pushback_limit_ = buffer_cursor_;
93  }
94  pos_--;
95}
96
97
98bool BufferedUtf16CharacterStream::ReadBlock() {
99  buffer_cursor_ = buffer_;
100  if (pushback_limit_ != NULL) {
101    // Leave pushback mode.
102    buffer_end_ = pushback_limit_;
103    pushback_limit_ = NULL;
104    // If there were any valid characters left at the
105    // start of the buffer, use those.
106    if (buffer_cursor_ < buffer_end_) return true;
107    // Otherwise read a new block.
108  }
109  unsigned length = FillBuffer(pos_);
110  buffer_end_ = buffer_ + length;
111  return length > 0;
112}
113
114
115unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
116  // Leave pushback mode (i.e., ignore that there might be valid data
117  // in the buffer before the pushback_limit_ point).
118  pushback_limit_ = NULL;
119  return BufferSeekForward(delta);
120}
121
122
123// ----------------------------------------------------------------------------
124// GenericStringUtf16CharacterStream
125
126
127GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
128    Handle<String> data,
129    unsigned start_position,
130    unsigned end_position)
131    : string_(data),
132      length_(end_position) {
133  DCHECK(end_position >= start_position);
134  pos_ = start_position;
135}
136
137
138GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
139
140
141unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
142  unsigned old_pos = pos_;
143  pos_ = Min(pos_ + delta, length_);
144  ReadBlock();
145  return pos_ - old_pos;
146}
147
148
149unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
150  if (from_pos >= length_) return 0;
151  unsigned length = kBufferSize;
152  if (from_pos + length > length_) {
153    length = length_ - from_pos;
154  }
155  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
156  return length;
157}
158
159
160// ----------------------------------------------------------------------------
161// Utf8ToUtf16CharacterStream
162Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
163                                                       unsigned length)
164    : BufferedUtf16CharacterStream(),
165      raw_data_(data),
166      raw_data_length_(length),
167      raw_data_pos_(0),
168      raw_character_position_(0) {
169  ReadBlock();
170}
171
172
173Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
174
175
176unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
177                                               const byte* src,
178                                               unsigned* src_pos,
179                                               unsigned src_length) {
180  static const unibrow::uchar kMaxUtf16Character = 0xffff;
181  unsigned i = 0;
182  // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
183  // one character early (in the normal case), because we need to have at least
184  // two free spaces in the buffer to be sure that the next character will fit.
185  while (i < length - 1) {
186    if (*src_pos == src_length) break;
187    unibrow::uchar c = src[*src_pos];
188    if (c <= unibrow::Utf8::kMaxOneByteChar) {
189      *src_pos = *src_pos + 1;
190    } else {
191      c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
192                                        src_pos);
193    }
194    if (c > kMaxUtf16Character) {
195      dest[i++] = unibrow::Utf16::LeadSurrogate(c);
196      dest[i++] = unibrow::Utf16::TrailSurrogate(c);
197    } else {
198      dest[i++] = static_cast<uc16>(c);
199    }
200  }
201  return i;
202}
203
204
205unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
206  unsigned old_pos = pos_;
207  unsigned target_pos = pos_ + delta;
208  SetRawPosition(target_pos);
209  pos_ = raw_character_position_;
210  ReadBlock();
211  return pos_ - old_pos;
212}
213
214
215unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
216  SetRawPosition(char_position);
217  if (raw_character_position_ != char_position) {
218    // char_position was not a valid position in the stream (hit the end
219    // while spooling to it).
220    return 0u;
221  }
222  unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
223                         raw_data_length_);
224  raw_character_position_ = char_position + i;
225  return i;
226}
227
228
229static const byte kUtf8MultiByteMask = 0xC0;
230static const byte kUtf8MultiByteCharFollower = 0x80;
231
232
233#ifdef DEBUG
234static const byte kUtf8MultiByteCharStart = 0xC0;
235static bool IsUtf8MultiCharacterStart(byte first_byte) {
236  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
237}
238#endif
239
240
241static bool IsUtf8MultiCharacterFollower(byte later_byte) {
242  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
243}
244
245
246// Move the cursor back to point at the preceding UTF-8 character start
247// in the buffer.
248static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
249  byte character = buffer[--*cursor];
250  if (character > unibrow::Utf8::kMaxOneByteChar) {
251    DCHECK(IsUtf8MultiCharacterFollower(character));
252    // Last byte of a multi-byte character encoding. Step backwards until
253    // pointing to the first byte of the encoding, recognized by having the
254    // top two bits set.
255    while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
256    DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
257  }
258}
259
260
261// Move the cursor forward to point at the next following UTF-8 character start
262// in the buffer.
263static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
264  byte character = buffer[(*cursor)++];
265  if (character > unibrow::Utf8::kMaxOneByteChar) {
266    // First character of a multi-byte character encoding.
267    // The number of most-significant one-bits determines the length of the
268    // encoding:
269    //  110..... - (0xCx, 0xDx) one additional byte (minimum).
270    //  1110.... - (0xEx) two additional bytes.
271    //  11110... - (0xFx) three additional bytes (maximum).
272    DCHECK(IsUtf8MultiCharacterStart(character));
273    // Additional bytes is:
274    // 1 if value in range 0xC0 .. 0xDF.
275    // 2 if value in range 0xE0 .. 0xEF.
276    // 3 if value in range 0xF0 .. 0xF7.
277    // Encode that in a single value.
278    unsigned additional_bytes =
279        ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
280    *cursor += additional_bytes;
281    DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
282  }
283}
284
285
286// This can't set a raw position between two surrogate pairs, since there
287// is no position in the UTF8 stream that corresponds to that.  This assumes
288// that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
289// it is illegally coded as two 3 byte sequences then there is no problem here.
290void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
291  if (raw_character_position_ > target_position) {
292    // Spool backwards in utf8 buffer.
293    do {
294      int old_pos = raw_data_pos_;
295      Utf8CharacterBack(raw_data_, &raw_data_pos_);
296      raw_character_position_--;
297      DCHECK(old_pos - raw_data_pos_ <= 4);
298      // Step back over both code units for surrogate pairs.
299      if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
300    } while (raw_character_position_ > target_position);
301    // No surrogate pair splitting.
302    DCHECK(raw_character_position_ == target_position);
303    return;
304  }
305  // Spool forwards in the utf8 buffer.
306  while (raw_character_position_ < target_position) {
307    if (raw_data_pos_ == raw_data_length_) return;
308    int old_pos = raw_data_pos_;
309    Utf8CharacterForward(raw_data_, &raw_data_pos_);
310    raw_character_position_++;
311    DCHECK(raw_data_pos_ - old_pos <= 4);
312    if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
313  }
314  // No surrogate pair splitting.
315  DCHECK(raw_character_position_ == target_position);
316}
317
318
319unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
320  // Ignore "position" which is the position in the decoded data. Instead,
321  // ExternalStreamingStream keeps track of the position in the raw data.
322  unsigned data_in_buffer = 0;
323  // Note that the UTF-8 decoder might not be able to fill the buffer
324  // completely; it will typically leave the last character empty (see
325  // Utf8ToUtf16CharacterStream::CopyChars).
326  while (data_in_buffer < kBufferSize - 1) {
327    if (current_data_ == NULL) {
328      // GetSomeData will wait until the embedder has enough data. Here's an
329      // interface between the API which uses size_t (which is the correct type
330      // here) and the internal parts which use unsigned. TODO(marja): make the
331      // internal parts use size_t too.
332      current_data_length_ =
333          static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
334      current_data_offset_ = 0;
335      bool data_ends = current_data_length_ == 0;
336
337      // A caveat: a data chunk might end with bytes from an incomplete UTF-8
338      // character (the rest of the bytes will be in the next chunk).
339      if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
340        HandleUtf8SplitCharacters(&data_in_buffer);
341        if (!data_ends && current_data_offset_ == current_data_length_) {
342          // The data stream didn't end, but we used all the data in the
343          // chunk. This will only happen when the chunk was really small. We
344          // don't handle the case where a UTF-8 character is split over several
345          // chunks; in that case V8 won't crash, but it will be a parse error.
346          delete[] current_data_;
347          current_data_ = NULL;
348          current_data_length_ = 0;
349          current_data_offset_ = 0;
350          continue;  // Request a new chunk.
351        }
352      }
353
354      // Did the data stream end?
355      if (data_ends) {
356        DCHECK(utf8_split_char_buffer_length_ == 0);
357        return data_in_buffer;
358      }
359    }
360
361    // Fill the buffer from current_data_.
362    unsigned new_offset = 0;
363    unsigned new_chars_in_buffer =
364        CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
365                        current_data_ + current_data_offset_, &new_offset,
366                        current_data_length_ - current_data_offset_, encoding_);
367    data_in_buffer += new_chars_in_buffer;
368    current_data_offset_ += new_offset;
369    DCHECK(data_in_buffer <= kBufferSize);
370
371    // Did we use all the data in the data chunk?
372    if (current_data_offset_ == current_data_length_) {
373      delete[] current_data_;
374      current_data_ = NULL;
375      current_data_length_ = 0;
376      current_data_offset_ = 0;
377    }
378  }
379  return data_in_buffer;
380}
381
382void ExternalStreamingStream::HandleUtf8SplitCharacters(
383    unsigned* data_in_buffer) {
384  // First check if we have leftover data from the last chunk.
385  unibrow::uchar c;
386  if (utf8_split_char_buffer_length_ > 0) {
387    // Move the bytes which are part of the split character (which started in
388    // the previous chunk) into utf8_split_char_buffer_.
389    while (current_data_offset_ < current_data_length_ &&
390           utf8_split_char_buffer_length_ < 4 &&
391           (c = current_data_[current_data_offset_]) >
392               unibrow::Utf8::kMaxOneByteChar) {
393      utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
394      ++utf8_split_char_buffer_length_;
395      ++current_data_offset_;
396    }
397
398    // Convert the data in utf8_split_char_buffer_.
399    unsigned new_offset = 0;
400    unsigned new_chars_in_buffer =
401        CopyCharsHelper(buffer_ + *data_in_buffer,
402                        kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
403                        &new_offset, utf8_split_char_buffer_length_, encoding_);
404    *data_in_buffer += new_chars_in_buffer;
405    // Make sure we used all the data.
406    DCHECK(new_offset == utf8_split_char_buffer_length_);
407    DCHECK(*data_in_buffer <= kBufferSize);
408
409    utf8_split_char_buffer_length_ = 0;
410  }
411
412  // Move bytes which are part of an incomplete character from the end of the
413  // current chunk to utf8_split_char_buffer_. They will be converted when the
414  // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
415  // bytes long, but if the data is invalid, we can have character values bigger
416  // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
417  while (current_data_length_ > current_data_offset_ &&
418         (c = current_data_[current_data_length_ - 1]) >
419             unibrow::Utf8::kMaxOneByteChar &&
420         utf8_split_char_buffer_length_ < 4) {
421    --current_data_length_;
422    ++utf8_split_char_buffer_length_;
423  }
424  CHECK(utf8_split_char_buffer_length_ <= 4);
425  for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
426    utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
427  }
428}
429
430
431// ----------------------------------------------------------------------------
432// ExternalTwoByteStringUtf16CharacterStream
433
434ExternalTwoByteStringUtf16CharacterStream::
435    ~ExternalTwoByteStringUtf16CharacterStream() { }
436
437
438ExternalTwoByteStringUtf16CharacterStream
439    ::ExternalTwoByteStringUtf16CharacterStream(
440        Handle<ExternalTwoByteString> data,
441        int start_position,
442        int end_position)
443    : Utf16CharacterStream(),
444      source_(data),
445      raw_data_(data->GetTwoByteData(start_position)) {
446  buffer_cursor_ = raw_data_,
447  buffer_end_ = raw_data_ + (end_position - start_position);
448  pos_ = start_position;
449}
450
451} }  // namespace v8::internal
452