1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_UNICODE_DECODER_H_
6 #define V8_UNICODE_DECODER_H_
7
8 #include <sys/types.h>
9 #include "src/globals.h"
10 #include "src/utils.h"
11
12 namespace unibrow {
13
14 class V8_EXPORT_PRIVATE Utf8DecoderBase {
15 public:
16 // Initialization done in subclass.
17 inline Utf8DecoderBase();
18 inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
19 const uint8_t* stream, size_t stream_length);
Utf16Length()20 inline size_t Utf16Length() const { return utf16_length_; }
21
22 protected:
23 // This reads all characters and sets the utf16_length_.
24 // The first buffer_length utf16 chars are cached in the buffer.
25 void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
26 size_t stream_length);
27 static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
28 uint16_t* data, size_t length);
29 const uint8_t* unbuffered_start_;
30 size_t unbuffered_length_;
31 size_t utf16_length_;
32 bool last_byte_of_buffer_unused_;
33
34 private:
35 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
36 };
37
38 template <size_t kBufferSize>
39 class Utf8Decoder : public Utf8DecoderBase {
40 public:
Utf8Decoder()41 inline Utf8Decoder() {}
42 inline Utf8Decoder(const char* stream, size_t length);
43 inline void Reset(const char* stream, size_t length);
44 inline size_t WriteUtf16(uint16_t* data, size_t length) const;
45
46 private:
47 uint16_t buffer_[kBufferSize];
48 };
49
50
Utf8DecoderBase()51 Utf8DecoderBase::Utf8DecoderBase()
52 : unbuffered_start_(NULL),
53 unbuffered_length_(0),
54 utf16_length_(0),
55 last_byte_of_buffer_unused_(false) {}
56
57
Utf8DecoderBase(uint16_t * buffer,size_t buffer_length,const uint8_t * stream,size_t stream_length)58 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
59 const uint8_t* stream, size_t stream_length) {
60 Reset(buffer, buffer_length, stream, stream_length);
61 }
62
63
64 template <size_t kBufferSize>
Utf8Decoder(const char * stream,size_t length)65 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
66 : Utf8DecoderBase(buffer_, kBufferSize,
67 reinterpret_cast<const uint8_t*>(stream), length) {}
68
69
70 template <size_t kBufferSize>
Reset(const char * stream,size_t length)71 void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
72 Utf8DecoderBase::Reset(buffer_, kBufferSize,
73 reinterpret_cast<const uint8_t*>(stream), length);
74 }
75
76
77 template <size_t kBufferSize>
WriteUtf16(uint16_t * data,size_t length)78 size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
79 size_t length) const {
80 DCHECK(length > 0);
81 if (length > utf16_length_) length = utf16_length_;
82 // memcpy everything in buffer.
83 size_t buffer_length =
84 last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
85 size_t memcpy_length = length <= buffer_length ? length : buffer_length;
86 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
87 if (length <= buffer_length) return length;
88 DCHECK(unbuffered_start_ != NULL);
89 // Copy the rest the slow way.
90 WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
91 length - buffer_length);
92 return length;
93 }
94
// Helpers for the Latin-1 range.
class Latin1 {
 public:
  // Largest code unit that is still Latin-1.
  static const unsigned kMaxChar = 0xff;

  // Returns 0 if the character does not convert to a single Latin-1
  // character, or if it does not convert back to Latin-1 via the inverse
  // operation (upper to lower, etc.).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
103
104
ConvertNonLatin1ToLatin1(uint16_t c)105 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
106 DCHECK(c > Latin1::kMaxChar);
107 switch (c) {
108 // This are equivalent characters in unicode.
109 case 0x39c:
110 case 0x3bc:
111 return 0xb5;
112 // This is an uppercase of a Latin-1 character
113 // outside of Latin-1.
114 case 0x178:
115 return 0xff;
116 }
117 return 0;
118 }
119
120
121 } // namespace unibrow
122
123 #endif // V8_UNICODE_DECODER_H_
124