1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_OBJECTS_STRING_H_
6 #define V8_OBJECTS_STRING_H_
7 
8 #include "src/base/bits.h"
9 #include "src/objects/name.h"
10 #include "src/unicode-decoder.h"
11 
12 // Has to be the last include (doesn't have include guards):
13 #include "src/objects/object-macros.h"
14 
15 namespace v8 {
16 namespace internal {
17 
18 class BigInt;
19 
// Passed to String::ToCString to select whether the produced C string may
// contain embedded NUL characters.
// NOTE(review): exact handling lives in the ToCString implementation — confirm.
enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };
// Passed to String::ToCString to select how defensively the string is walked;
// the comment on String::ToCString describes what ROBUST_STRING_TRAVERSAL
// provides.
enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };
22 
23 // The characteristics of a string are stored in its map.  Retrieving these
24 // few bits of information is moderately expensive, involving two memory
25 // loads where the second is dependent on the first.  To improve efficiency
26 // the shape of the string is given its own class so that it can be retrieved
27 // once and used for several string operations.  A StringShape is small enough
28 // to be passed by value and is immutable, but be aware that flattening a
29 // string can potentially alter its shape.  Also be aware that a GC caused by
30 // something else can alter the shape of a string due to ConsString
31 // shortcutting.  Keeping these restrictions in mind has proven to be error-
32 // prone and so we no longer put StringShapes in variables unless there is a
33 // concrete performance benefit at that particular point in the code.
class StringShape BASE_EMBEDDED {
 public:
  // A shape can be constructed from a string, from a map, or directly from an
  // instance type.
  inline explicit StringShape(const String* s);
  inline explicit StringShape(Map* s);
  inline explicit StringShape(InstanceType t);

  // Representation / encoding predicates.  All of them are defined out of
  // line (only declared here).
  inline bool IsSequential();
  inline bool IsExternal();
  inline bool IsCons();
  inline bool IsSliced();
  inline bool IsThin();
  inline bool IsIndirect();
  inline bool IsExternalOneByte();
  inline bool IsExternalTwoByte();
  inline bool IsSequentialOneByte();
  inline bool IsSequentialTwoByte();
  inline bool IsInternalized();

  // Tag accessors.
  inline StringRepresentationTag representation_tag();
  inline uint32_t encoding_tag();
  inline uint32_t full_representation_tag();
  inline bool HasOnlyOneByteChars();
#ifdef DEBUG
  // Debug builds expose the raw type word and carry a validity flag that
  // invalidate() clears and valid() reports.
  inline uint32_t type() { return type_; }
  inline void invalidate() { valid_ = false; }
  inline bool valid() { return valid_; }
#else
  // Release builds keep invalidate() callable, but it does nothing.
  inline void invalidate() {}
#endif

 private:
  uint32_t type_;
#ifdef DEBUG
  // Marks the shape as valid; the flag only exists in debug builds.
  inline void set_valid() { valid_ = true; }
  bool valid_;
#else
  inline void set_valid() {}
#endif
};
71 
72 // The String abstract class captures JavaScript string values:
73 //
74 // Ecma-262:
75 //  4.3.16 String Value
76 //    A string value is a member of the type String and is a finite
77 //    ordered sequence of zero or more 16-bit unsigned integer values.
78 //
79 // All string values have a length field.
80 class String : public Name {
81  public:
  // The two character encodings distinguished for strings.
  enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
83 
  // Range object over (part of) a string that exposes begin()/end() iterators.
  // NOTE(review): presumably covers {length} characters starting at {first},
  // with the default length of -1 meaning "up to the end of the string" —
  // confirm against the inline implementation.
  class SubStringRange {
   public:
    explicit inline SubStringRange(String* string, int first = 0,
                                   int length = -1);
    class iterator;
    inline iterator begin();
    inline iterator end();

   private:
    String* string_;  // String the range refers to.
    int first_;       // Value of the {first} constructor argument.
    int length_;      // Value of the {length} constructor argument.
  };
97 
98   // Representation of the flat content of a String.
99   // A non-flat string doesn't have flat content.
100   // A flat string has content that's encoded as a sequence of either
101   // one-byte chars or two-byte UC16.
102   // Returned by String::GetFlatContent().
103   class FlatContent {
104    public:
105     // Returns true if the string is flat and this structure contains content.
IsFlat()106     bool IsFlat() const { return state_ != NON_FLAT; }
107     // Returns true if the structure contains one-byte content.
IsOneByte()108     bool IsOneByte() const { return state_ == ONE_BYTE; }
109     // Returns true if the structure contains two-byte content.
IsTwoByte()110     bool IsTwoByte() const { return state_ == TWO_BYTE; }
111 
112     // Return the one byte content of the string. Only use if IsOneByte()
113     // returns true.
ToOneByteVector()114     Vector<const uint8_t> ToOneByteVector() const {
115       DCHECK_EQ(ONE_BYTE, state_);
116       return Vector<const uint8_t>(onebyte_start, length_);
117     }
118     // Return the two-byte content of the string. Only use if IsTwoByte()
119     // returns true.
ToUC16Vector()120     Vector<const uc16> ToUC16Vector() const {
121       DCHECK_EQ(TWO_BYTE, state_);
122       return Vector<const uc16>(twobyte_start, length_);
123     }
124 
Get(int i)125     uc16 Get(int i) const {
126       DCHECK(i < length_);
127       DCHECK(state_ != NON_FLAT);
128       if (state_ == ONE_BYTE) return onebyte_start[i];
129       return twobyte_start[i];
130     }
131 
UsesSameString(const FlatContent & other)132     bool UsesSameString(const FlatContent& other) const {
133       return onebyte_start == other.onebyte_start;
134     }
135 
136    private:
137     enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };
138 
139     // Constructors only used by String::GetFlatContent().
FlatContent(const uint8_t * start,int length)140     explicit FlatContent(const uint8_t* start, int length)
141         : onebyte_start(start), length_(length), state_(ONE_BYTE) {}
FlatContent(const uc16 * start,int length)142     explicit FlatContent(const uc16* start, int length)
143         : twobyte_start(start), length_(length), state_(TWO_BYTE) {}
FlatContent()144     FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {}
145 
146     union {
147       const uint8_t* onebyte_start;
148       const uc16* twobyte_start;
149     };
150     int length_;
151     State state_;
152 
153     friend class String;
154     friend class IterableSubString;
155   };
156 
  // Returns the characters of this string as a vector of {Char}.
  // NOTE(review): presumably requires a flat string whose encoding matches
  // {Char} — confirm against the inline implementation.
  template <typename Char>
  V8_INLINE Vector<const Char> GetCharVector();

  // Get and set the length of the string.
  inline int length() const;
  inline void set_length(int value);

  // Get and set the length of the string using acquire loads and release
  // stores.
  inline int synchronized_length() const;
  inline void synchronized_set_length(int value);

  // Returns whether this string has only one-byte chars, i.e. all of them can
  // be one-byte encoded.  This might be the case even if the string is
  // two-byte.  Such strings may appear when the embedder prefers
  // two-byte external representations even for one-byte data.
  inline bool IsOneByteRepresentation() const;
  inline bool IsTwoByteRepresentation() const;

  // Cons and slices have an encoding flag that may not represent the actual
  // encoding of the underlying string.  This is taken into account here.
  // Requires: this->IsFlat()
  inline bool IsOneByteRepresentationUnderneath();
  inline bool IsTwoByteRepresentationUnderneath();

  // NOTE: this should be considered only a hint.  False negatives are
  // possible.
  inline bool HasOnlyOneByteChars();

  // Set an individual two byte char in the string.
  inline void Set(int index, uint16_t value);
  // Get individual two byte char in the string.  Repeated calls
  // to this method are not efficient unless the string is flat.
  V8_INLINE uint16_t Get(int index);

  // ES6 section 7.1.3.1 ToNumber Applied to the String Type
  static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);

  // Flattens the string.  Checks first inline to see if it is
  // necessary.  Does nothing if the string is not a cons string.
  // Flattening allocates a sequential string with the same data as
  // the given string and mutates the cons string to a degenerate
  // form, where the first component is the new sequential string and
  // the second component is the empty string.  If allocation fails,
  // this function returns a failure.  If flattening succeeds, this
  // function returns the sequential string that is now the first
  // component of the cons string.
  // NOTE(review): the return type is Handle<String>, so the "returns a
  // failure" wording above looks stale — confirm against the implementation.
  //
  // Degenerate cons strings are handled specially by the garbage
  // collector (see IsShortcutCandidate).
  static inline Handle<String> Flatten(Isolate* isolate, Handle<String> string,
                                       PretenureFlag pretenure = NOT_TENURED);

  // Tries to return the content of a flat string as a structure holding either
  // a flat vector of char or of uc16.
  // If the string isn't flat, and therefore doesn't have flat content, the
  // returned structure will report so, and can't provide a vector of either
  // kind.
  FlatContent GetFlatContent();

  // Returns the parent of a sliced string or first part of a flat cons string.
  // Requires: StringShape(this).IsIndirect() && this->IsFlat()
  inline String* GetUnderlying();

  // String relational comparison, implemented according to ES6 section 7.2.11
  // Abstract Relational Comparison (step 5): The comparison of Strings uses a
  // simple lexicographic ordering on sequences of code unit values. There is no
  // attempt to use the more complex, semantically oriented definitions of
  // character or string equality and collating order defined in the Unicode
  // specification. Therefore String values that are canonically equal according
  // to the Unicode standard could test as unequal. In effect this algorithm
  // assumes that both Strings are already in normalized form. Also, note that
  // for strings containing supplementary characters, lexicographic ordering on
  // sequences of UTF-16 code unit values differs from that on sequences of code
  // point values.
  V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
                                                        Handle<String> x,
                                                        Handle<String> y);

  // Perform ES6 21.1.3.8, including checking arguments.
  static Object* IndexOf(Isolate* isolate, Handle<Object> receiver,
                         Handle<Object> search, Handle<Object> position);
  // Perform string match of {search} on {receiver}, starting at {start_index}.
  // Caller must ensure that 0 <= start_index <= receiver->length(), as this
  // does not check any arguments.
  static int IndexOf(Isolate* isolate, Handle<String> receiver,
                     Handle<String> search, int start_index);

  // Counterpart of the argument-checking IndexOf overload above that takes the
  // same kinds of arguments.
  // NOTE(review): presumably implements String.prototype.lastIndexOf — confirm.
  static Object* LastIndexOf(Isolate* isolate, Handle<Object> receiver,
                             Handle<Object> search, Handle<Object> position);
250   // by GetSubstitution.
251   class Match {
252    public:
253     virtual Handle<String> GetMatch() = 0;
254     virtual Handle<String> GetPrefix() = 0;
255     virtual Handle<String> GetSuffix() = 0;
256 
257     // A named capture can be invalid (if it is not specified in the pattern),
258     // unmatched (specified but not matched in the current string), and matched.
259     enum CaptureState { INVALID, UNMATCHED, MATCHED };
260 
261     virtual int CaptureCount() = 0;
262     virtual bool HasNamedCaptures() = 0;
263     virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;
264     virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
265                                                 CaptureState* state) = 0;
266 
~Match()267     virtual ~Match() {}
268   };
269 
  // ES#sec-getsubstitution
  // GetSubstitution(matched, str, position, captures, replacement)
  // Expand the $-expressions in the string and return a new string with
  // the result.
  // A {start_index} can be passed to specify where to start scanning the
  // replacement string.
  V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
      Isolate* isolate, Match* match, Handle<String> replacement,
      int start_index = 0);

  // String equality operations.
  inline bool Equals(String* other);
  inline static bool Equals(Isolate* isolate, Handle<String> one,
                            Handle<String> two);
  // Compares this string against UTF-8 encoded data.
  // NOTE(review): {allow_prefix_match} presumably accepts {str} being only a
  // prefix of this string — confirm against the implementation.
  bool IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match = false);

  // Dispatches to Is{One,Two}ByteEqualTo.
  template <typename Char>
  bool IsEqualTo(Vector<const Char> str);

  bool IsOneByteEqualTo(Vector<const uint8_t> str);
  bool IsTwoByteEqualTo(Vector<const uc16> str);
292 
293   // Return a UTF8 representation of the string.  The string is null
294   // terminated but may optionally contain nulls.  Length is returned
295   // in length_output if length_output is not a null pointer  The string
296   // should be nearly flat, otherwise the performance of this method may
297   // be very slow (quadratic in the length).  Setting robustness_flag to
298   // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust  This means it
299   // handles unexpected data without causing assert failures and it does not
300   // do any heap allocations.  This is useful when printing stack traces.
301   std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
302                                     RobustnessFlag robustness_flag, int offset,
303                                     int length, int* length_output = 0);
304   std::unique_ptr<char[]> ToCString(
305       AllowNullsFlag allow_nulls = DISALLOW_NULLS,
306       RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
307       int* length_output = 0);
308 
  // NOTE(review): presumably stores the array index this string denotes in
  // {index} and returns whether it denotes one — confirm.
  bool ComputeArrayIndex(uint32_t* index);

  // Externalization.
  bool MakeExternal(v8::String::ExternalStringResource* resource);
  bool MakeExternal(v8::String::ExternalOneByteStringResource* resource);
  bool SupportsExternalization();

  // Conversion.
  inline bool AsArrayIndex(uint32_t* index);
  uint32_t inline ToValidIndex(Object* number);

  // Trimming.
  enum TrimMode { kTrim, kTrimStart, kTrimEnd };
  static Handle<String> Trim(Isolate* isolate, Handle<String> string,
                             TrimMode mode);

  DECL_CAST(String)

  void PrintOn(FILE* out);

  // For use during stack traces.  Performs rudimentary sanity check.
  bool LooksValid();

  // Dispatched behavior.
  void StringShortPrint(StringStream* accumulator, bool show_details = true);
  void PrintUC16(std::ostream& os, int start = 0, int end = -1);  // NOLINT
#if defined(DEBUG) || defined(OBJECT_PRINT)
  // Only available in debug or object-printing builds.
  char* ToAsciiArray();
#endif
  DECL_PRINTER(String)
  DECL_VERIFIER(String)

  inline bool IsFlat();

  // Layout description.  The length field directly follows the Name header
  // and is accounted for with one pointer-sized slot.
  static const int kLengthOffset = Name::kSize;
  static const int kSize = kLengthOffset + kPointerSize;

  // Max char codes.  The *U variants are the same limits as unsigned values.
  static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
  static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
  static const int kMaxUtf16CodeUnit = 0xffff;
  static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
  static const uc32 kMaxCodePoint = 0x10ffff;

  // Maximal string length.
  // The max length is different on 32 and 64 bit platforms. Max length for a
  // 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
  // ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
  // can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
  // each char needs two bytes, subtract 24 bytes for the string header size.

  // See include/v8.h for the definition.
  static const int kMaxLength = v8::String::kMaxLength;
  static_assert(kMaxLength <= (Smi::kMaxValue / 2 - kSize),
                "Unexpected max String length");

  // Max length for computing hash. For strings longer than this limit the
  // string length is used as the hash value.
  static const int kMaxHashCalcLength = 16383;

  // Limit for truncation in short printing.
  static const int kMaxShortPrintLength = 1024;

  // Support for regular expressions.
  const uc16* GetTwoByteData(unsigned start);

  // Helper function for flattening strings.
  // NOTE(review): presumably copies the characters [from, to) of {source}
  // into {sink} — confirm against the implementation.
  template <typename sinkchar>
  static void WriteToFlat(String* source, sinkchar* sink, int from, int to);
379 
380   // The return value may point to the first aligned word containing the first
381   // non-one-byte character, rather than directly to the non-one-byte character.
382   // If the return value is >= the passed length, the entire string was
383   // one-byte.
NonAsciiStart(const char * chars,int length)384   static inline int NonAsciiStart(const char* chars, int length) {
385     const char* start = chars;
386     const char* limit = chars + length;
387 
388     if (length >= kIntptrSize) {
389       // Check unaligned bytes.
390       while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
391         if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
392           return static_cast<int>(chars - start);
393         }
394         ++chars;
395       }
396       // Check aligned words.
397       DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
398       const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
399       while (chars + sizeof(uintptr_t) <= limit) {
400         if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
401           return static_cast<int>(chars - start);
402         }
403         chars += sizeof(uintptr_t);
404       }
405     }
406     // Check remaining unaligned bytes.
407     while (chars < limit) {
408       if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
409         return static_cast<int>(chars - start);
410       }
411       ++chars;
412     }
413 
414     return static_cast<int>(chars - start);
415   }
416 
IsAscii(const char * chars,int length)417   static inline bool IsAscii(const char* chars, int length) {
418     return NonAsciiStart(chars, length) >= length;
419   }
420 
IsAscii(const uint8_t * chars,int length)421   static inline bool IsAscii(const uint8_t* chars, int length) {
422     return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
423            length;
424   }
425 
NonOneByteStart(const uc16 * chars,int length)426   static inline int NonOneByteStart(const uc16* chars, int length) {
427     const uc16* limit = chars + length;
428     const uc16* start = chars;
429     while (chars < limit) {
430       if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start);
431       ++chars;
432     }
433     return static_cast<int>(chars - start);
434   }
435 
IsOneByte(const uc16 * chars,int length)436   static inline bool IsOneByte(const uc16* chars, int length) {
437     return NonOneByteStart(chars, length) >= length;
438   }
439 
  // Hands the content of {string}, starting at {offset}, to {visitor}.
  // NOTE(review): presumably returns the ConsString at which visiting had to
  // stop because the content was not flat (and nullptr otherwise) — confirm
  // against the inline implementation.
  template <class Visitor>
  static inline ConsString* VisitFlat(Visitor* visitor, String* string,
                                      int offset = 0);

  // Computes the line ends of {string} and returns them in a FixedArray.
  // NOTE(review): {include_ending_line} presumably controls whether the end of
  // a last line without a terminator is reported — confirm.
  static Handle<FixedArray> CalculateLineEnds(Isolate* isolate,
                                              Handle<String> string,
                                              bool include_ending_line);
447 
 private:
  // These classes need access to the private helpers below.
  friend class Name;
  friend class StringTableInsertionKey;
  friend class InternalizedStringKey;

  // Out-of-line part of Flatten() for cons strings.
  static Handle<String> SlowFlatten(Isolate* isolate, Handle<ConsString> cons,
                                    PretenureFlag tenure);

  // Slow case of String::Equals.  This implementation works on any strings
  // but it is most efficient on strings that are almost flat.
  bool SlowEquals(String* other);

  // Handle-based variant of the slow equality check.
  static bool SlowEquals(Isolate* isolate, Handle<String> one,
                         Handle<String> two);

  // Slow case of AsArrayIndex.
  V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);

  // Compute and set the hash code.
  uint32_t ComputeAndSetHash(Isolate* isolate);

  DISALLOW_IMPLICIT_CONSTRUCTORS(String);
470 };
471 
// The SeqString abstract class captures sequential string values.
class SeqString : public String {
 public:
  DECL_CAST(SeqString)

  // Layout description.  Sequential strings add no fields of their own to the
  // String header.
  static const int kHeaderSize = String::kSize;

  // Truncate the string in-place if possible and return the result.
  // In case of new_length == 0, the empty string is returned without
  // truncating the original string.
  V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
                                                       int new_length);

 private:
  DISALLOW_IMPLICIT_CONSTRUCTORS(SeqString);
};
489 
// Type for internalized strings.  It currently only provides the cast and
// adds no members of its own on top of String.
class InternalizedString : public String {
 public:
  DECL_CAST(InternalizedString)
  // TODO(neis): Possibly move some stuff from String here.

 private:
  DISALLOW_IMPLICIT_CONSTRUCTORS(InternalizedString);
};
498 
499 // The OneByteString class captures sequential one-byte string objects.
500 // Each character in the OneByteString is an one-byte character.
501 class SeqOneByteString : public SeqString {
502  public:
503   static const bool kHasOneByteEncoding = true;
504 
505   // Dispatched behavior.
506   inline uint16_t SeqOneByteStringGet(int index);
507   inline void SeqOneByteStringSet(int index, uint16_t value);
508 
509   // Get the address of the characters in this string.
510   inline Address GetCharsAddress();
511 
512   inline uint8_t* GetChars();
513 
514   // Clear uninitialized padding space. This ensures that the snapshot content
515   // is deterministic.
516   void clear_padding();
517 
518   DECL_CAST(SeqOneByteString)
519 
520   // Garbage collection support.  This method is called by the
521   // garbage collector to compute the actual size of an OneByteString
522   // instance.
523   inline int SeqOneByteStringSize(InstanceType instance_type);
524 
525   // Computes the size for an OneByteString instance of a given length.
SizeFor(int length)526   static int SizeFor(int length) {
527     return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize);
528   }
529 
530   // Maximal memory usage for a single sequential one-byte string.
531   static const int kMaxCharsSize = kMaxLength;
532   static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
533   STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);
534 
535   class BodyDescriptor;
536   // No weak fields.
537   typedef BodyDescriptor BodyDescriptorWeak;
538 
539  private:
540   DISALLOW_IMPLICIT_CONSTRUCTORS(SeqOneByteString);
541 };
542 
543 // The TwoByteString class captures sequential unicode string objects.
544 // Each character in the TwoByteString is a two-byte uint16_t.
545 class SeqTwoByteString : public SeqString {
546  public:
547   static const bool kHasOneByteEncoding = false;
548 
549   // Dispatched behavior.
550   inline uint16_t SeqTwoByteStringGet(int index);
551   inline void SeqTwoByteStringSet(int index, uint16_t value);
552 
553   // Get the address of the characters in this string.
554   inline Address GetCharsAddress();
555 
556   inline uc16* GetChars();
557 
558   // Clear uninitialized padding space. This ensures that the snapshot content
559   // is deterministic.
560   void clear_padding();
561 
562   // For regexp code.
563   const uint16_t* SeqTwoByteStringGetData(unsigned start);
564 
565   DECL_CAST(SeqTwoByteString)
566 
567   // Garbage collection support.  This method is called by the
568   // garbage collector to compute the actual size of a TwoByteString
569   // instance.
570   inline int SeqTwoByteStringSize(InstanceType instance_type);
571 
572   // Computes the size for a TwoByteString instance of a given length.
SizeFor(int length)573   static int SizeFor(int length) {
574     return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize);
575   }
576 
577   // Maximal memory usage for a single sequential two-byte string.
578   static const int kMaxCharsSize = kMaxLength * 2;
579   static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
580   STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
581                 String::kMaxLength);
582 
583   class BodyDescriptor;
584   // No weak fields.
585   typedef BodyDescriptor BodyDescriptorWeak;
586 
587  private:
588   DISALLOW_IMPLICIT_CONSTRUCTORS(SeqTwoByteString);
589 };
590 
591 // The ConsString class describes string values built by using the
592 // addition operator on strings.  A ConsString is a pair where the
593 // first and second components are pointers to other string values.
594 // One or both components of a ConsString can be pointers to other
595 // ConsStrings, creating a binary tree of ConsStrings where the leaves
596 // are non-ConsString string values.  The string value represented by
597 // a ConsString can be obtained by concatenating the leaf string
598 // values in a left-to-right depth-first traversal of the tree.
599 class ConsString : public String {
600  public:
601   // First string of the cons cell.
602   inline String* first();
603   // Doesn't check that the result is a string, even in debug mode.  This is
604   // useful during GC where the mark bits confuse the checks.
605   inline Object* unchecked_first();
606   inline void set_first(Isolate* isolate, String* first,
607                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
608 
609   // Second string of the cons cell.
610   inline String* second();
611   // Doesn't check that the result is a string, even in debug mode.  This is
612   // useful during GC where the mark bits confuse the checks.
613   inline Object* unchecked_second();
614   inline void set_second(Isolate* isolate, String* second,
615                          WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
616 
617   // Dispatched behavior.
618   V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);
619 
620   DECL_CAST(ConsString)
621 
622   // Layout description.
623   static const int kFirstOffset = POINTER_SIZE_ALIGN(String::kSize);
624   static const int kSecondOffset = kFirstOffset + kPointerSize;
625   static const int kSize = kSecondOffset + kPointerSize;
626 
627   // Minimum length for a cons string.
628   static const int kMinLength = 13;
629 
630   typedef FixedBodyDescriptor<kFirstOffset, kSecondOffset + kPointerSize, kSize>
631       BodyDescriptor;
632   // No weak fields.
633   typedef BodyDescriptor BodyDescriptorWeak;
634 
635   DECL_VERIFIER(ConsString)
636 
637  private:
638   DISALLOW_IMPLICIT_CONSTRUCTORS(ConsString);
639 };
640 
641 // The ThinString class describes string objects that are just references
642 // to another string object. They are used for in-place internalization when
643 // the original string cannot actually be internalized in-place: in these
644 // cases, the original string is converted to a ThinString pointing at its
645 // internalized version (which is allocated as a new object).
646 // In terms of memory layout and most algorithms operating on strings,
647 // ThinStrings can be thought of as "one-part cons strings".
648 class ThinString : public String {
649  public:
650   // Actual string that this ThinString refers to.
651   inline String* actual() const;
652   inline HeapObject* unchecked_actual() const;
653   inline void set_actual(String* s,
654                          WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
655 
656   V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);
657 
658   DECL_CAST(ThinString)
659   DECL_VERIFIER(ThinString)
660 
661   // Layout description.
662   static const int kActualOffset = String::kSize;
663   static const int kSize = kActualOffset + kPointerSize;
664 
665   typedef FixedBodyDescriptor<kActualOffset, kSize, kSize> BodyDescriptor;
666   // No weak fields.
667   typedef BodyDescriptor BodyDescriptorWeak;
668 
669  private:
670   DISALLOW_COPY_AND_ASSIGN(ThinString);
671 };
672 
673 // The Sliced String class describes strings that are substrings of another
674 // sequential string.  The motivation is to save time and memory when creating
675 // a substring.  A Sliced String is described as a pointer to the parent,
676 // the offset from the start of the parent string and the length.  Using
677 // a Sliced String therefore requires unpacking of the parent string and
678 // adding the offset to the start address.  A substring of a Sliced String
679 // are not nested since the double indirection is simplified when creating
680 // such a substring.
681 // Currently missing features are:
682 //  - handling externalized parent strings
683 //  - external strings as parent
684 //  - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
685 class SlicedString : public String {
686  public:
687   inline String* parent();
688   inline void set_parent(Isolate* isolate, String* parent,
689                          WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
690   inline int offset() const;
691   inline void set_offset(int offset);
692 
693   // Dispatched behavior.
694   V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);
695 
696   DECL_CAST(SlicedString)
697 
698   // Layout description.
699   static const int kParentOffset = POINTER_SIZE_ALIGN(String::kSize);
700   static const int kOffsetOffset = kParentOffset + kPointerSize;
701   static const int kSize = kOffsetOffset + kPointerSize;
702 
703   // Minimum length for a sliced string.
704   static const int kMinLength = 13;
705 
706   typedef FixedBodyDescriptor<kParentOffset, kOffsetOffset + kPointerSize,
707                               kSize>
708       BodyDescriptor;
709   // No weak fields.
710   typedef BodyDescriptor BodyDescriptorWeak;
711 
712   DECL_VERIFIER(SlicedString)
713 
714  private:
715   DISALLOW_IMPLICIT_CONSTRUCTORS(SlicedString);
716 };
717 
// The ExternalString class describes string values that are backed by
// a string resource that lies outside the V8 heap.  ExternalStrings
// consist of the length field common to all strings and a pointer to the
// external resource.  It is important to ensure (externally) that the
// resource is not deallocated while the ExternalString is live in the
// V8 heap.
//
// The API expects that all ExternalStrings are created through the
// API.  Therefore, ExternalStrings should not be used internally.
class ExternalString : public String {
 public:
  DECL_CAST(ExternalString)

  // Layout description.  kShortSize and kResourceDataOffset have the same
  // value: a short external string ends right after the resource pointer,
  // which is where the full layout continues with another pointer-sized
  // field.
  static const int kResourceOffset = POINTER_SIZE_ALIGN(String::kSize);
  static const int kShortSize = kResourceOffset + kPointerSize;
  static const int kResourceDataOffset = kResourceOffset + kPointerSize;
  static const int kSize = kResourceDataOffset + kPointerSize;

  // Return whether external string is short (data pointer is not cached).
  inline bool is_short() const;
  // Size in bytes of the external payload.
  int ExternalPayloadSize() const;

  // Used in the serializer/deserializer.
  inline Address resource_as_address();
  inline void set_address_as_resource(Address address);
  inline uint32_t resource_as_uint32();
  inline void set_uint32_as_resource(uint32_t value);

  // The resource offset must match the one declared in Internals.
  STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);

 private:
  DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalString);
};
753 
// The ExternalOneByteString class is an external string backed by a
// one-byte string.
756 class ExternalOneByteString : public ExternalString {
757  public:
758   static const bool kHasOneByteEncoding = true;
759 
760   typedef v8::String::ExternalOneByteStringResource Resource;
761 
762   // The underlying resource.
763   inline const Resource* resource();
764 
765   // It is assumed that the previous resource is null. If it is not null, then
766   // it is the responsability of the caller the handle the previous resource.
767   inline void SetResource(Isolate* isolate, const Resource* buffer);
768   // Used only during serialization.
769   inline void set_resource(const Resource* buffer);
770 
771   // Update the pointer cache to the external character array.
772   // The cached pointer is always valid, as the external character array does =
773   // not move during lifetime.  Deserialization is the only exception, after
774   // which the pointer cache has to be refreshed.
775   inline void update_data_cache();
776 
777   inline const uint8_t* GetChars();
778 
779   // Dispatched behavior.
780   inline uint16_t ExternalOneByteStringGet(int index);
781 
782   DECL_CAST(ExternalOneByteString)
783 
784   class BodyDescriptor;
785   // No weak fields.
786   typedef BodyDescriptor BodyDescriptorWeak;
787 
788  private:
789   DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalOneByteString);
790 };
791 
792 // The ExternalTwoByteString class is an external string backed by a UTF-16
793 // encoded string.
794 class ExternalTwoByteString : public ExternalString {
795  public:
796   static const bool kHasOneByteEncoding = false;
797 
798   typedef v8::String::ExternalStringResource Resource;
799 
800   // The underlying string resource.
801   inline const Resource* resource();
802 
803   // It is assumed that the previous resource is null. If it is not null, then
804   // it is the responsability of the caller the handle the previous resource.
805   inline void SetResource(Isolate* isolate, const Resource* buffer);
806   // Used only during serialization.
807   inline void set_resource(const Resource* buffer);
808 
809   // Update the pointer cache to the external character array.
810   // The cached pointer is always valid, as the external character array does =
811   // not move during lifetime.  Deserialization is the only exception, after
812   // which the pointer cache has to be refreshed.
813   inline void update_data_cache();
814 
815   inline const uint16_t* GetChars();
816 
817   // Dispatched behavior.
818   inline uint16_t ExternalTwoByteStringGet(int index);
819 
820   // For regexp code.
821   inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);
822 
823   DECL_CAST(ExternalTwoByteString)
824 
825   class BodyDescriptor;
826   // No weak fields.
827   typedef BodyDescriptor BodyDescriptorWeak;
828 
829  private:
830   DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalTwoByteString);
831 };
832 
833 // A flat string reader provides random access to the contents of a
834 // string independent of the character width of the string.  The handle
835 // must be valid as long as the reader is being used.
836 class FlatStringReader : public Relocatable {
837  public:
838   FlatStringReader(Isolate* isolate, Handle<String> str);
839   FlatStringReader(Isolate* isolate, Vector<const char> input);
840   void PostGarbageCollection();
841   inline uc32 Get(int index);
842   template <typename Char>
843   inline Char Get(int index);
length()844   int length() { return length_; }
845 
846  private:
847   String** str_;
848   bool is_one_byte_;
849   int length_;
850   const void* start_;
851 };
852 
// This maintains an off-stack representation of the stack frames required
// to traverse a ConsString, allowing an entirely iterative and restartable
// traversal of the entire string.
856 class ConsStringIterator {
857  public:
ConsStringIterator()858   inline ConsStringIterator() {}
859   inline explicit ConsStringIterator(ConsString* cons_string, int offset = 0) {
860     Reset(cons_string, offset);
861   }
862   inline void Reset(ConsString* cons_string, int offset = 0) {
863     depth_ = 0;
864     // Next will always return nullptr.
865     if (cons_string == nullptr) return;
866     Initialize(cons_string, offset);
867   }
868   // Returns nullptr when complete.
Next(int * offset_out)869   inline String* Next(int* offset_out) {
870     *offset_out = 0;
871     if (depth_ == 0) return nullptr;
872     return Continue(offset_out);
873   }
874 
875  private:
876   static const int kStackSize = 32;
877   // Use a mask instead of doing modulo operations for stack wrapping.
878   static const int kDepthMask = kStackSize - 1;
879   static_assert(base::bits::IsPowerOfTwo(kStackSize),
880                 "kStackSize must be power of two");
881   static inline int OffsetForDepth(int depth);
882 
883   inline void PushLeft(ConsString* string);
884   inline void PushRight(ConsString* string);
885   inline void AdjustMaximumDepth();
886   inline void Pop();
StackBlown()887   inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; }
888   void Initialize(ConsString* cons_string, int offset);
889   String* Continue(int* offset_out);
890   String* NextLeaf(bool* blew_stack);
891   String* Search(int* offset_out);
892 
893   // Stack must always contain only frames for which right traversal
894   // has not yet been performed.
895   ConsString* frames_[kStackSize];
896   ConsString* root_;
897   int depth_;
898   int maximum_depth_;
899   int consumed_;
900   DISALLOW_COPY_AND_ASSIGN(ConsStringIterator);
901 };
902 
903 class StringCharacterStream {
904  public:
905   inline explicit StringCharacterStream(String* string, int offset = 0);
906   inline uint16_t GetNext();
907   inline bool HasMore();
908   inline void Reset(String* string, int offset = 0);
909   inline void VisitOneByteString(const uint8_t* chars, int length);
910   inline void VisitTwoByteString(const uint16_t* chars, int length);
911 
912  private:
913   ConsStringIterator iter_;
914   bool is_one_byte_;
915   union {
916     const uint8_t* buffer8_;
917     const uint16_t* buffer16_;
918   };
919   const uint8_t* end_;
920   DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
921 };
922 
923 }  // namespace internal
924 }  // namespace v8
925 
926 #include "src/objects/object-macros-undef.h"
927 
928 #endif  // V8_OBJECTS_STRING_H_
929