1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/v8.h"
6 
7 #include "src/scanner-character-streams.h"
8 
9 #include "include/v8.h"
10 #include "src/handles.h"
11 #include "src/unicode-inl.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 namespace {
17 
CopyCharsHelper(uint16_t * dest,unsigned length,const uint8_t * src,unsigned * src_pos,unsigned src_length,ScriptCompiler::StreamedSource::Encoding encoding)18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
19                          unsigned* src_pos, unsigned src_length,
20                          ScriptCompiler::StreamedSource::Encoding encoding) {
21   if (encoding == ScriptCompiler::StreamedSource::UTF8) {
22     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
23         dest, length, src, src_pos, src_length);
24   }
25 
26   unsigned to_fill = length;
27   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
28 
29   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
30     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
31   } else {
32     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
33     v8::internal::CopyChars<uint16_t, uint16_t>(
34         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
35   }
36   *src_pos += to_fill;
37   return to_fill;
38 }
39 
40 }  // namespace
41 
42 
43 // ----------------------------------------------------------------------------
44 // BufferedUtf16CharacterStreams
45 
BufferedUtf16CharacterStream()46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
47     : Utf16CharacterStream(),
48       pushback_limit_(NULL) {
49   // Initialize buffer as being empty. First read will fill the buffer.
50   buffer_cursor_ = buffer_;
51   buffer_end_ = buffer_;
52 }
53 
54 
~BufferedUtf16CharacterStream()55 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
56 
PushBack(uc32 character)57 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
58   if (character == kEndOfInput) {
59     pos_--;
60     return;
61   }
62   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
63     // buffer_ is writable, buffer_cursor_ is const pointer.
64     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
65     pos_--;
66     return;
67   }
68   SlowPushBack(static_cast<uc16>(character));
69 }
70 
71 
SlowPushBack(uc16 character)72 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
73   // In pushback mode, the end of the buffer contains pushback,
74   // and the start of the buffer (from buffer start to pushback_limit_)
75   // contains valid data that comes just after the pushback.
76   // We NULL the pushback_limit_ if pushing all the way back to the
77   // start of the buffer.
78 
79   if (pushback_limit_ == NULL) {
80     // Enter pushback mode.
81     pushback_limit_ = buffer_end_;
82     buffer_end_ = buffer_ + kBufferSize;
83     buffer_cursor_ = buffer_end_;
84   }
85   // Ensure that there is room for at least one pushback.
86   DCHECK(buffer_cursor_ > buffer_);
87   DCHECK(pos_ > 0);
88   buffer_[--buffer_cursor_ - buffer_] = character;
89   if (buffer_cursor_ == buffer_) {
90     pushback_limit_ = NULL;
91   } else if (buffer_cursor_ < pushback_limit_) {
92     pushback_limit_ = buffer_cursor_;
93   }
94   pos_--;
95 }
96 
97 
ReadBlock()98 bool BufferedUtf16CharacterStream::ReadBlock() {
99   buffer_cursor_ = buffer_;
100   if (pushback_limit_ != NULL) {
101     // Leave pushback mode.
102     buffer_end_ = pushback_limit_;
103     pushback_limit_ = NULL;
104     // If there were any valid characters left at the
105     // start of the buffer, use those.
106     if (buffer_cursor_ < buffer_end_) return true;
107     // Otherwise read a new block.
108   }
109   unsigned length = FillBuffer(pos_);
110   buffer_end_ = buffer_ + length;
111   return length > 0;
112 }
113 
114 
SlowSeekForward(unsigned delta)115 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
116   // Leave pushback mode (i.e., ignore that there might be valid data
117   // in the buffer before the pushback_limit_ point).
118   pushback_limit_ = NULL;
119   return BufferSeekForward(delta);
120 }
121 
122 
123 // ----------------------------------------------------------------------------
124 // GenericStringUtf16CharacterStream
125 
126 
GenericStringUtf16CharacterStream(Handle<String> data,unsigned start_position,unsigned end_position)127 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
128     Handle<String> data,
129     unsigned start_position,
130     unsigned end_position)
131     : string_(data),
132       length_(end_position) {
133   DCHECK(end_position >= start_position);
134   pos_ = start_position;
135 }
136 
137 
~GenericStringUtf16CharacterStream()138 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
139 
140 
BufferSeekForward(unsigned delta)141 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
142   unsigned old_pos = pos_;
143   pos_ = Min(pos_ + delta, length_);
144   ReadBlock();
145   return pos_ - old_pos;
146 }
147 
148 
FillBuffer(unsigned from_pos)149 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
150   if (from_pos >= length_) return 0;
151   unsigned length = kBufferSize;
152   if (from_pos + length > length_) {
153     length = length_ - from_pos;
154   }
155   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
156   return length;
157 }
158 
159 
160 // ----------------------------------------------------------------------------
161 // Utf8ToUtf16CharacterStream
Utf8ToUtf16CharacterStream(const byte * data,unsigned length)162 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
163                                                        unsigned length)
164     : BufferedUtf16CharacterStream(),
165       raw_data_(data),
166       raw_data_length_(length),
167       raw_data_pos_(0),
168       raw_character_position_(0) {
169   ReadBlock();
170 }
171 
172 
~Utf8ToUtf16CharacterStream()173 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
174 
175 
CopyChars(uint16_t * dest,unsigned length,const byte * src,unsigned * src_pos,unsigned src_length)176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
177                                                const byte* src,
178                                                unsigned* src_pos,
179                                                unsigned src_length) {
180   static const unibrow::uchar kMaxUtf16Character = 0xffff;
181   unsigned i = 0;
182   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
183   // one character early (in the normal case), because we need to have at least
184   // two free spaces in the buffer to be sure that the next character will fit.
185   while (i < length - 1) {
186     if (*src_pos == src_length) break;
187     unibrow::uchar c = src[*src_pos];
188     if (c <= unibrow::Utf8::kMaxOneByteChar) {
189       *src_pos = *src_pos + 1;
190     } else {
191       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
192                                         src_pos);
193     }
194     if (c > kMaxUtf16Character) {
195       dest[i++] = unibrow::Utf16::LeadSurrogate(c);
196       dest[i++] = unibrow::Utf16::TrailSurrogate(c);
197     } else {
198       dest[i++] = static_cast<uc16>(c);
199     }
200   }
201   return i;
202 }
203 
204 
BufferSeekForward(unsigned delta)205 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
206   unsigned old_pos = pos_;
207   unsigned target_pos = pos_ + delta;
208   SetRawPosition(target_pos);
209   pos_ = raw_character_position_;
210   ReadBlock();
211   return pos_ - old_pos;
212 }
213 
214 
FillBuffer(unsigned char_position)215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
216   SetRawPosition(char_position);
217   if (raw_character_position_ != char_position) {
218     // char_position was not a valid position in the stream (hit the end
219     // while spooling to it).
220     return 0u;
221   }
222   unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
223                          raw_data_length_);
224   raw_character_position_ = char_position + i;
225   return i;
226 }
227 
228 
229 static const byte kUtf8MultiByteMask = 0xC0;
230 static const byte kUtf8MultiByteCharFollower = 0x80;
231 
232 
#ifdef DEBUG
// Value of the two most significant bits (0b11......) in the first byte of a
// multi-byte character.
static const byte kUtf8MultiByteCharStart = 0xC0;

// Returns true if |first_byte| has both of its top two bits set. Only
// compiled in DEBUG builds.
static bool IsUtf8MultiCharacterStart(byte first_byte) {
  return kUtf8MultiByteCharStart == (first_byte & kUtf8MultiByteMask);
}
#endif
239 
240 
IsUtf8MultiCharacterFollower(byte later_byte)241 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
242   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
243 }
244 
245 
246 // Move the cursor back to point at the preceding UTF-8 character start
247 // in the buffer.
Utf8CharacterBack(const byte * buffer,unsigned * cursor)248 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
249   byte character = buffer[--*cursor];
250   if (character > unibrow::Utf8::kMaxOneByteChar) {
251     DCHECK(IsUtf8MultiCharacterFollower(character));
252     // Last byte of a multi-byte character encoding. Step backwards until
253     // pointing to the first byte of the encoding, recognized by having the
254     // top two bits set.
255     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
256     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
257   }
258 }
259 
260 
261 // Move the cursor forward to point at the next following UTF-8 character start
262 // in the buffer.
Utf8CharacterForward(const byte * buffer,unsigned * cursor)263 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
264   byte character = buffer[(*cursor)++];
265   if (character > unibrow::Utf8::kMaxOneByteChar) {
266     // First character of a multi-byte character encoding.
267     // The number of most-significant one-bits determines the length of the
268     // encoding:
269     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
270     //  1110.... - (0xEx) two additional bytes.
271     //  11110... - (0xFx) three additional bytes (maximum).
272     DCHECK(IsUtf8MultiCharacterStart(character));
273     // Additional bytes is:
274     // 1 if value in range 0xC0 .. 0xDF.
275     // 2 if value in range 0xE0 .. 0xEF.
276     // 3 if value in range 0xF0 .. 0xF7.
277     // Encode that in a single value.
278     unsigned additional_bytes =
279         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
280     *cursor += additional_bytes;
281     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
282   }
283 }
284 
285 
286 // This can't set a raw position between two surrogate pairs, since there
287 // is no position in the UTF8 stream that corresponds to that.  This assumes
288 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
289 // it is illegally coded as two 3 byte sequences then there is no problem here.
SetRawPosition(unsigned target_position)290 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
291   if (raw_character_position_ > target_position) {
292     // Spool backwards in utf8 buffer.
293     do {
294       int old_pos = raw_data_pos_;
295       Utf8CharacterBack(raw_data_, &raw_data_pos_);
296       raw_character_position_--;
297       DCHECK(old_pos - raw_data_pos_ <= 4);
298       // Step back over both code units for surrogate pairs.
299       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
300     } while (raw_character_position_ > target_position);
301     // No surrogate pair splitting.
302     DCHECK(raw_character_position_ == target_position);
303     return;
304   }
305   // Spool forwards in the utf8 buffer.
306   while (raw_character_position_ < target_position) {
307     if (raw_data_pos_ == raw_data_length_) return;
308     int old_pos = raw_data_pos_;
309     Utf8CharacterForward(raw_data_, &raw_data_pos_);
310     raw_character_position_++;
311     DCHECK(raw_data_pos_ - old_pos <= 4);
312     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
313   }
314   // No surrogate pair splitting.
315   DCHECK(raw_character_position_ == target_position);
316 }
317 
318 
FillBuffer(unsigned position)319 unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
320   // Ignore "position" which is the position in the decoded data. Instead,
321   // ExternalStreamingStream keeps track of the position in the raw data.
322   unsigned data_in_buffer = 0;
323   // Note that the UTF-8 decoder might not be able to fill the buffer
324   // completely; it will typically leave the last character empty (see
325   // Utf8ToUtf16CharacterStream::CopyChars).
326   while (data_in_buffer < kBufferSize - 1) {
327     if (current_data_ == NULL) {
328       // GetSomeData will wait until the embedder has enough data. Here's an
329       // interface between the API which uses size_t (which is the correct type
330       // here) and the internal parts which use unsigned. TODO(marja): make the
331       // internal parts use size_t too.
332       current_data_length_ =
333           static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
334       current_data_offset_ = 0;
335       bool data_ends = current_data_length_ == 0;
336 
337       // A caveat: a data chunk might end with bytes from an incomplete UTF-8
338       // character (the rest of the bytes will be in the next chunk).
339       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
340         HandleUtf8SplitCharacters(&data_in_buffer);
341         if (!data_ends && current_data_offset_ == current_data_length_) {
342           // The data stream didn't end, but we used all the data in the
343           // chunk. This will only happen when the chunk was really small. We
344           // don't handle the case where a UTF-8 character is split over several
345           // chunks; in that case V8 won't crash, but it will be a parse error.
346           delete[] current_data_;
347           current_data_ = NULL;
348           current_data_length_ = 0;
349           current_data_offset_ = 0;
350           continue;  // Request a new chunk.
351         }
352       }
353 
354       // Did the data stream end?
355       if (data_ends) {
356         DCHECK(utf8_split_char_buffer_length_ == 0);
357         return data_in_buffer;
358       }
359     }
360 
361     // Fill the buffer from current_data_.
362     unsigned new_offset = 0;
363     unsigned new_chars_in_buffer =
364         CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
365                         current_data_ + current_data_offset_, &new_offset,
366                         current_data_length_ - current_data_offset_, encoding_);
367     data_in_buffer += new_chars_in_buffer;
368     current_data_offset_ += new_offset;
369     DCHECK(data_in_buffer <= kBufferSize);
370 
371     // Did we use all the data in the data chunk?
372     if (current_data_offset_ == current_data_length_) {
373       delete[] current_data_;
374       current_data_ = NULL;
375       current_data_length_ = 0;
376       current_data_offset_ = 0;
377     }
378   }
379   return data_in_buffer;
380 }
381 
HandleUtf8SplitCharacters(unsigned * data_in_buffer)382 void ExternalStreamingStream::HandleUtf8SplitCharacters(
383     unsigned* data_in_buffer) {
384   // First check if we have leftover data from the last chunk.
385   unibrow::uchar c;
386   if (utf8_split_char_buffer_length_ > 0) {
387     // Move the bytes which are part of the split character (which started in
388     // the previous chunk) into utf8_split_char_buffer_.
389     while (current_data_offset_ < current_data_length_ &&
390            utf8_split_char_buffer_length_ < 4 &&
391            (c = current_data_[current_data_offset_]) >
392                unibrow::Utf8::kMaxOneByteChar) {
393       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
394       ++utf8_split_char_buffer_length_;
395       ++current_data_offset_;
396     }
397 
398     // Convert the data in utf8_split_char_buffer_.
399     unsigned new_offset = 0;
400     unsigned new_chars_in_buffer =
401         CopyCharsHelper(buffer_ + *data_in_buffer,
402                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
403                         &new_offset, utf8_split_char_buffer_length_, encoding_);
404     *data_in_buffer += new_chars_in_buffer;
405     // Make sure we used all the data.
406     DCHECK(new_offset == utf8_split_char_buffer_length_);
407     DCHECK(*data_in_buffer <= kBufferSize);
408 
409     utf8_split_char_buffer_length_ = 0;
410   }
411 
412   // Move bytes which are part of an incomplete character from the end of the
413   // current chunk to utf8_split_char_buffer_. They will be converted when the
414   // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
415   // bytes long, but if the data is invalid, we can have character values bigger
416   // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
417   while (current_data_length_ > current_data_offset_ &&
418          (c = current_data_[current_data_length_ - 1]) >
419              unibrow::Utf8::kMaxOneByteChar &&
420          utf8_split_char_buffer_length_ < 4) {
421     --current_data_length_;
422     ++utf8_split_char_buffer_length_;
423   }
424   CHECK(utf8_split_char_buffer_length_ <= 4);
425   for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
426     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
427   }
428 }
429 
430 
431 // ----------------------------------------------------------------------------
432 // ExternalTwoByteStringUtf16CharacterStream
433 
434 ExternalTwoByteStringUtf16CharacterStream::
~ExternalTwoByteStringUtf16CharacterStream()435     ~ExternalTwoByteStringUtf16CharacterStream() { }
436 
437 
438 ExternalTwoByteStringUtf16CharacterStream
ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data,int start_position,int end_position)439     ::ExternalTwoByteStringUtf16CharacterStream(
440         Handle<ExternalTwoByteString> data,
441         int start_position,
442         int end_position)
443     : Utf16CharacterStream(),
444       source_(data),
445       raw_data_(data->GetTwoByteData(start_position)) {
446   buffer_cursor_ = raw_data_,
447   buffer_end_ = raw_data_ + (end_position - start_position);
448   pos_ = start_position;
449 }
450 
451 } }  // namespace v8::internal
452