// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef V8_UNICODE_H_
6 #define V8_UNICODE_H_
7 
8 #include <sys/types.h>
9 #include "src/globals.h"
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
14 
15 namespace unibrow {
16 
// Unsigned integer type used across this header for character values.
typedef unsigned int uchar;
// Unsigned octet type; Utf8::CalculateValue and Utf8::ValueOf take their
// input as an array of these.
typedef unsigned char byte;
19 
/**
 * Upper bound on the number of characters produced when the case of a
 * single character is converted.
 */
const int kMaxMappingSize = 4;
25 
26 template <class T, int size = 256>
27 class Predicate {
28  public:
Predicate()29   inline Predicate() { }
30   inline bool get(uchar c);
31  private:
32   friend class Test;
33   bool CalculateValue(uchar c);
34   struct CacheEntry {
CacheEntryCacheEntry35     inline CacheEntry() : code_point_(0), value_(0) { }
CacheEntryCacheEntry36     inline CacheEntry(uchar code_point, bool value)
37       : code_point_(code_point),
38         value_(value) { }
39     uchar code_point_ : 21;
40     bool value_ : 1;
41   };
42   static const int kSize = size;
43   static const int kMask = kSize - 1;
44   CacheEntry entries_[kSize];
45 };
46 
47 // A cache used in case conversion.  It caches the value for characters
48 // that either have no mapping or map to a single character independent
49 // of context.  Characters that map to more than one character or that
50 // map differently depending on context are always looked up.
51 template <class T, int size = 256>
52 class Mapping {
53  public:
Mapping()54   inline Mapping() { }
55   inline int get(uchar c, uchar n, uchar* result);
56  private:
57   friend class Test;
58   int CalculateValue(uchar c, uchar n, uchar* result);
59   struct CacheEntry {
CacheEntryCacheEntry60     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
CacheEntryCacheEntry61     inline CacheEntry(uchar code_point, signed offset)
62       : code_point_(code_point),
63         offset_(offset) { }
64     uchar code_point_;
65     signed offset_;
66     static const int kNoChar = (1 << 21) - 1;
67   };
68   static const int kSize = size;
69   static const int kMask = kSize - 1;
70   CacheEntry entries_[kSize];
71 };
72 
73 class UnicodeData {
74  private:
75   friend class Test;
76   static int GetByteCount();
77   static const uchar kMaxCodePoint;
78 };
79 
80 class Utf16 {
81  public:
IsSurrogatePair(int lead,int trail)82   static inline bool IsSurrogatePair(int lead, int trail) {
83     return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
84   }
IsLeadSurrogate(int code)85   static inline bool IsLeadSurrogate(int code) {
86     if (code == kNoPreviousCharacter) return false;
87     return (code & 0xfc00) == 0xd800;
88   }
IsTrailSurrogate(int code)89   static inline bool IsTrailSurrogate(int code) {
90     if (code == kNoPreviousCharacter) return false;
91     return (code & 0xfc00) == 0xdc00;
92   }
93 
CombineSurrogatePair(uchar lead,uchar trail)94   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
95     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
96   }
97   static const int kNoPreviousCharacter = -1;
98   static const uchar kMaxNonSurrogateCharCode = 0xffff;
99   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
100   // of UTF-8 data.  The special case where the unit is a surrogate
101   // trail produces 1 byte net, because the encoding of the pair is
102   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
103   // can be reclaimed.
104   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
105   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
106   // The illegality stems from the surrogate not being part of a pair.
107   static const int kUtf8BytesToCodeASurrogate = 3;
LeadSurrogate(uint32_t char_code)108   static inline uint16_t LeadSurrogate(uint32_t char_code) {
109     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
110   }
TrailSurrogate(uint32_t char_code)111   static inline uint16_t TrailSurrogate(uint32_t char_code) {
112     return 0xdc00 + (char_code & 0x3ff);
113   }
114 };
115 
// Latin-1 constants and conversion helper.
class Latin1 {
 public:
  // Largest character value in the Latin-1 range.
  static const unsigned kMaxChar = 0xff;
  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if the character does not convert back to Latin-1 via the
  // inverse operation (upper to lower, etc).  Defined elsewhere.
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
124 
125 class Utf8 {
126  public:
127   static inline uchar Length(uchar chr, int previous);
128   static inline unsigned EncodeOneByte(char* out, uint8_t c);
129   static inline unsigned Encode(char* out,
130                                 uchar c,
131                                 int previous,
132                                 bool replace_invalid = false);
133   static uchar CalculateValue(const byte* str,
134                               unsigned length,
135                               unsigned* cursor);
136 
137   // The unicode replacement character, used to signal invalid unicode
138   // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
139   static const uchar kBadChar = 0xFFFD;
140   static const unsigned kMaxEncodedSize   = 4;
141   static const unsigned kMaxOneByteChar   = 0x7f;
142   static const unsigned kMaxTwoByteChar   = 0x7ff;
143   static const unsigned kMaxThreeByteChar = 0xffff;
144   static const unsigned kMaxFourByteChar  = 0x1fffff;
145 
146   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
147   // that match are coded as a 4 byte UTF-8 sequence.
148   static const unsigned kBytesSavedByCombiningSurrogates = 2;
149   static const unsigned kSizeOfUnmatchedSurrogate = 3;
150   // The maximum size a single UTF-16 code unit may take up when encoded as
151   // UTF-8.
152   static const unsigned kMax16BitCodeUnitSize  = 3;
153   static inline uchar ValueOf(const byte* str,
154                               unsigned length,
155                               unsigned* cursor);
156 };
157 
158 
159 class Utf8DecoderBase {
160  public:
161   // Initialization done in subclass.
162   inline Utf8DecoderBase();
163   inline Utf8DecoderBase(uint16_t* buffer,
164                          unsigned buffer_length,
165                          const uint8_t* stream,
166                          unsigned stream_length);
Utf16Length()167   inline unsigned Utf16Length() const { return utf16_length_; }
168  protected:
169   // This reads all characters and sets the utf16_length_.
170   // The first buffer_length utf16 chars are cached in the buffer.
171   void Reset(uint16_t* buffer,
172              unsigned buffer_length,
173              const uint8_t* stream,
174              unsigned stream_length);
175   static void WriteUtf16Slow(const uint8_t* stream,
176                              uint16_t* data,
177                              unsigned length);
178   const uint8_t* unbuffered_start_;
179   unsigned utf16_length_;
180   bool last_byte_of_buffer_unused_;
181  private:
182   DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
183 };
184 
185 template <unsigned kBufferSize>
186 class Utf8Decoder : public Utf8DecoderBase {
187  public:
Utf8Decoder()188   inline Utf8Decoder() {}
189   inline Utf8Decoder(const char* stream, unsigned length);
190   inline void Reset(const char* stream, unsigned length);
191   inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
192  private:
193   uint16_t buffer_[kBufferSize];
194 };
195 
196 
197 struct Uppercase {
198   static bool Is(uchar c);
199 };
200 struct Lowercase {
201   static bool Is(uchar c);
202 };
203 struct Letter {
204   static bool Is(uchar c);
205 };
206 struct Number {
207   static bool Is(uchar c);
208 };
209 struct WhiteSpace {
210   static bool Is(uchar c);
211 };
212 struct LineTerminator {
213   static bool Is(uchar c);
214 };
215 struct CombiningMark {
216   static bool Is(uchar c);
217 };
218 struct ConnectorPunctuation {
219   static bool Is(uchar c);
220 };
221 struct ToLowercase {
222   static const int kMaxWidth = 3;
223   static const bool kIsToLower = true;
224   static int Convert(uchar c,
225                      uchar n,
226                      uchar* result,
227                      bool* allow_caching_ptr);
228 };
229 struct ToUppercase {
230   static const int kMaxWidth = 3;
231   static const bool kIsToLower = false;
232   static int Convert(uchar c,
233                      uchar n,
234                      uchar* result,
235                      bool* allow_caching_ptr);
236 };
237 struct Ecma262Canonicalize {
238   static const int kMaxWidth = 1;
239   static int Convert(uchar c,
240                      uchar n,
241                      uchar* result,
242                      bool* allow_caching_ptr);
243 };
244 struct Ecma262UnCanonicalize {
245   static const int kMaxWidth = 4;
246   static int Convert(uchar c,
247                      uchar n,
248                      uchar* result,
249                      bool* allow_caching_ptr);
250 };
251 struct CanonicalizationRange {
252   static const int kMaxWidth = 1;
253   static int Convert(uchar c,
254                      uchar n,
255                      uchar* result,
256                      bool* allow_caching_ptr);
257 };
258 
259 }  // namespace unibrow
260 
261 #endif  // V8_UNICODE_H_
262