1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_escaping.h>
32 
33 #include <google/protobuf/stubs/logging.h>
34 #include <google/protobuf/stubs/common.h>
35 
36 namespace google {
37 namespace protobuf {
38 namespace util {
39 namespace converter {
40 
41 namespace {
42 
// Lookup table mapping a nibble value (0-15) to its lowercase hexadecimal
// digit. The trailing NUL keeps the array a valid C string.
static const char kHex[] = {'0', '1', '2', '3', '4', '5', '6', '7',
                            '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', '\0'};
45 
// Escape table for the most frequently seen code points, 0x00 through 0x9f.
//
// For a unicode code point ch < 0xa0, kCommonEscapes[ch] holds:
//   - the escaped spelling of ch, when ch has to be escaped; or
//   - an empty string, when ch can be emitted unchanged.
//
// Each entry is at most 6 characters plus the terminating NUL.
static const char kCommonEscapes[160][7] = {
  // 0x00 - 0x1f: C0 (ASCII and derivatives) control characters. All of them
  // are escaped, using the short JSON form where one exists.
  "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00 - 0x03
  "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x04 - 0x07
  "\\b", "\\t", "\\n", "\\u000b",              // 0x08 - 0x0b
  "\\f", "\\r", "\\u000e", "\\u000f",          // 0x0c - 0x0f
  "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10 - 0x13
  "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x14 - 0x17
  "\\u0018", "\\u0019", "\\u001a", "\\u001b",  // 0x18 - 0x1b
  "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1c - 0x1f

  // 0x20 - 0x7f: one row per 16 code points. The double quote and the
  // backslash are escaped because the www.json.org string definition
  // requires it; the angle brackets are escaped for HTML security.
  //
  // 0x20 - 0x2f: the double quote (0x22) is escaped.
  "", "", "\\\"", "", "", "", "", "", "", "", "", "", "", "", "", "",
  // 0x30 - 0x3f: the angle brackets (0x3c, 0x3e) are escaped.
  "", "", "", "", "", "", "", "", "", "", "", "", "\\u003c", "", "\\u003e", "",
  // 0x40 - 0x4f: nothing is escaped.
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
  // 0x50 - 0x5f: the backslash (0x5c) is escaped.
  "", "", "", "", "", "", "", "", "", "", "", "", "\\\\", "", "", "",
  // 0x60 - 0x6f: nothing is escaped.
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
  // 0x70 - 0x7f: DEL (0x7f) is escaped.
  "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "\\u007f",

  // 0x80 - 0x9f: C1 (ISO 8859 and Unicode) extended control characters. All
  // of them are escaped.
  "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80 - 0x83
  "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x84 - 0x87
  "\\u0088", "\\u0089", "\\u008a", "\\u008b",  // 0x88 - 0x8b
  "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x8c - 0x8f
  "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90 - 0x93
  "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x94 - 0x97
  "\\u0098", "\\u0099", "\\u009a", "\\u009b",  // 0x98 - 0x9b
  "\\u009c", "\\u009d", "\\u009e", "\\u009f"   // 0x9c - 0x9f
};
86 
87 // Determines if the given char value is a unicode high-surrogate code unit.
88 // Such values do not represent characters by themselves, but are used in the
89 // representation of supplementary characters in the utf-16 encoding.
IsHighSurrogate(uint16 c)90 inline bool IsHighSurrogate(uint16 c) {
91   // Optimized form of:
92   // return c >= kMinHighSurrogate && c <= kMaxHighSurrogate;
93   // (Reduced from 3 ALU instructions to 2 ALU instructions)
94   return (c & ~(JsonEscaping::kMaxHighSurrogate -
95                 JsonEscaping::kMinHighSurrogate))
96       == JsonEscaping::kMinHighSurrogate;
97 }
98 
99 // Determines if the given char value is a unicode low-surrogate code unit.
100 // Such values do not represent characters by themselves, but are used in the
101 // representation of supplementary characters in the utf-16 encoding.
IsLowSurrogate(uint16 c)102 inline bool IsLowSurrogate(uint16 c) {
103   // Optimized form of:
104   // return c >= kMinLowSurrogate && c <= kMaxLowSurrogate;
105   // (Reduced from 3 ALU instructions to 2 ALU instructions)
106   return (c & ~(JsonEscaping::kMaxLowSurrogate -
107                 JsonEscaping::kMinLowSurrogate))
108       == JsonEscaping::kMinLowSurrogate;
109 }
110 
111 // Determines if the given char value is a unicode surrogate code unit (either
112 // high-surrogate or low-surrogate).
IsSurrogate(uint32 c)113 inline bool IsSurrogate(uint32 c) {
114   // Optimized form of:
115   // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
116   // (Reduced from 3 ALU instructions to 2 ALU instructions)
117   return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
118 }
119 
120 // Returns true if the given unicode code point cp is
121 // in the supplementary character range.
IsSupplementalCodePoint(uint32 cp)122 inline bool IsSupplementalCodePoint(uint32 cp) {
123   // Optimized form of:
124   // return kMinSupplementaryCodePoint <= cp && cp <= kMaxCodePoint;
125   // (Reduced from 3 ALU instructions to 2 ALU instructions)
126   return (cp & ~(JsonEscaping::kMinSupplementaryCodePoint - 1))
127       < JsonEscaping::kMaxCodePoint;
128 }
129 
130 // Returns true if the given unicode code point cp is a valid
131 // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
IsValidCodePoint(uint32 cp)132 inline bool IsValidCodePoint(uint32 cp) {
133   return cp <= JsonEscaping::kMaxCodePoint;
134 }
135 
136 // Converts the specified surrogate pair to its supplementary code point value.
137 // It is the callers' responsibility to validate the specified surrogate pair.
ToCodePoint(uint16 high,uint16 low)138 inline uint32 ToCodePoint(uint16 high, uint16 low) {
139   // Optimized form of:
140   // return ((high - kMinHighSurrogate) << 10)
141   //     + (low - kMinLowSurrogate)
142   //     + kMinSupplementaryCodePoint;
143   // (Reduced from 5 ALU instructions to 3 ALU instructions)
144   return (high << 10) + low +
145       (JsonEscaping::kMinSupplementaryCodePoint
146        - (static_cast<unsigned>(JsonEscaping::kMinHighSurrogate) << 10)
147        - JsonEscaping::kMinLowSurrogate);
148 }
149 
150 // Returns the low surrogate for the given unicode code point. The result is
151 // meaningless if the given code point is not a supplementary character.
ToLowSurrogate(uint32 cp)152 inline uint16 ToLowSurrogate(uint32 cp) {
153   return (cp & (JsonEscaping::kMaxLowSurrogate
154                 - JsonEscaping::kMinLowSurrogate))
155       + JsonEscaping::kMinLowSurrogate;
156 }
157 
158 // Returns the high surrogate for the given unicode code point. The result is
159 // meaningless if the given code point is not a supplementary character.
ToHighSurrogate(uint32 cp)160 inline uint16 ToHighSurrogate(uint32 cp) {
161   return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
162                        (JsonEscaping::kMinSupplementaryCodePoint >> 10));
163 }
164 
165 // Input str is encoded in UTF-8. A unicode code point could be encoded in
166 // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
167 // reads of the ByteSource.
168 //
169 // This function reads the next unicode code point from the input (str) at
170 // the given position (index), taking into account any left-over partial
171 // code point from the previous iteration (cp), together with the number
172 // of characters left to read to complete this code point (num_left).
173 //
174 // This function assumes that the input (str) is valid at the given position
175 // (index). In order words, at least one character could be read successfully.
176 //
177 // The code point read (partial or complete) is stored in (cp). Upon return,
178 // (num_left) stores the number of characters that has yet to be read in
179 // order to complete the current unicode code point. If the read is complete,
180 // then (num_left) is 0. Also, (num_read) is the number of characters read.
181 //
182 // Returns false if we encounter an invalid UTF-8 string. Returns true
183 // otherwise, including the case when we reach the end of the input (str)
184 // before a complete unicode code point is read.
ReadCodePoint(StringPiece str,int index,uint32 * cp,int * num_left,int * num_read)185 bool ReadCodePoint(StringPiece str, int index,
186                    uint32 *cp, int* num_left, int *num_read) {
187   if (*num_left == 0) {
188     // Last read was complete. Start reading a new unicode code point.
189     *cp = static_cast<uint8>(str[index++]);
190     *num_read = 1;
191     // The length of the code point is determined from reading the first byte.
192     //
193     // If the first byte is between:
194     //    0..0x7f: that's the value of the code point.
195     // 0x80..0xbf: <invalid>
196     // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
197     //                                   bit 10-6, bit 5-0
198     // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
199     //                        bit 15-12, bit 11-6, bit 5-0
200     // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
201     //             bit 20-18, bit 17-12, bit 11-6, bit 5-0
202     // 0xf8..0xff: <invalid>
203     //
204     // Meaning of each bit:
205     // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
206     //              1 - multibyte code point
207     //       bit 6: 0 - subsequent bytes of multibyte code point:
208     //                  bits 5-0 are values.
209     //              1 - first byte of multibyte code point
210     //       bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
211     //              1 - first byte of code point with >= 3 bytes.
212     //       bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
213     //              1 - first byte of code point with >= 4 bytes.
214     //       bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
215     //              1 - reserved for future expansion.
216     if (*cp <= 0x7f) {
217       return true;
218     } else if (*cp <= 0xbf) {
219       return false;
220     } else if (*cp <= 0xdf) {
221       *cp &= 0x1f;
222       *num_left = 1;
223     } else if (*cp <= 0xef) {
224       *cp &= 0x0f;
225       *num_left = 2;
226     } else if (*cp <= 0xf7) {
227       *cp &= 0x07;
228       *num_left = 3;
229     } else {
230       return false;
231     }
232   } else {
233     // Last read was partial. Initialize num_read to 0 and continue reading
234     // the last unicode code point.
235     *num_read = 0;
236   }
237   while (*num_left > 0 && index < str.size()) {
238     uint32 ch = static_cast<uint8>(str[index++]);
239     --(*num_left);
240     ++(*num_read);
241     *cp = (*cp << 6) | (ch & 0x3f);
242     if (ch < 0x80 || ch > 0xbf) return false;
243   }
244   return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
245 }
246 
247 // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
248 // and returns a StringPiece that points to this buffer. The input buffer needs
249 // to be at least 6 bytes long.
ToHex(uint16 cp,char * buffer)250 StringPiece ToHex(uint16 cp, char* buffer) {
251   buffer[5] = kHex[cp & 0x0f];
252   cp >>= 4;
253   buffer[4] = kHex[cp & 0x0f];
254   cp >>= 4;
255   buffer[3] = kHex[cp & 0x0f];
256   cp >>= 4;
257   buffer[2] = kHex[cp & 0x0f];
258   return StringPiece(buffer).substr(0, 6);
259 }
260 
261 // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
262 // and returns a StringPiece that points to this buffer. The input buffer needs
263 // to be at least 12 bytes long.
ToSurrogateHex(uint32 cp,char * buffer)264 StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
265   uint16 low = ToLowSurrogate(cp);
266   uint16 high = ToHighSurrogate(cp);
267 
268   buffer[11] = kHex[low & 0x0f];
269   low >>= 4;
270   buffer[10] = kHex[low & 0x0f];
271   low >>= 4;
272   buffer[9] = kHex[low & 0x0f];
273   low >>= 4;
274   buffer[8] = kHex[low & 0x0f];
275 
276   buffer[5] = kHex[high & 0x0f];
277   high >>= 4;
278   buffer[4] = kHex[high & 0x0f];
279   high >>= 4;
280   buffer[3] = kHex[high & 0x0f];
281   high >>= 4;
282   buffer[2] = kHex[high & 0x0f];
283 
284   return StringPiece(buffer, 12);
285 }
286 
287 // If the given unicode code point needs escaping, then returns the
288 // escaped form. The returned StringPiece either points to statically
289 // pre-allocated char[] or to the given buffer. The input buffer needs
290 // to be at least 12 bytes long.
291 //
292 // If the given unicode code point does not need escaping, an empty
293 // StringPiece is returned.
EscapeCodePoint(uint32 cp,char * buffer)294 StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
295   if (cp < 0xa0) return kCommonEscapes[cp];
296   switch (cp) {
297     // These are not required by json spec
298     // but used to prevent security bugs in javascript.
299     case 0xfeff:  // Zero width no-break space
300     case 0xfff9:  // Interlinear annotation anchor
301     case 0xfffa:  // Interlinear annotation separator
302     case 0xfffb:  // Interlinear annotation terminator
303 
304     case 0x00ad:  // Soft-hyphen
305     case 0x06dd:  // Arabic end of ayah
306     case 0x070f:  // Syriac abbreviation mark
307     case 0x17b4:  // Khmer vowel inherent Aq
308     case 0x17b5:  // Khmer vowel inherent Aa
309       return ToHex(cp, buffer);
310 
311     default:
312       if ((cp >= 0x0600 && cp <= 0x0603) ||  // Arabic signs
313           (cp >= 0x200b && cp <= 0x200f) ||  // Zero width etc.
314           (cp >= 0x2028 && cp <= 0x202e) ||  // Separators etc.
315           (cp >= 0x2060 && cp <= 0x2064) ||  // Invisible etc.
316           (cp >= 0x206a && cp <= 0x206f)) {  // Shaping etc.
317         return ToHex(cp, buffer);
318       }
319 
320       if (cp == 0x000e0001 ||                        // Language tag
321           (cp >= 0x0001d173 && cp <= 0x0001d17a) ||  // Music formatting
322           (cp >= 0x000e0020 && cp <= 0x000e007f)) {  // TAG symbols
323         return ToSurrogateHex(cp, buffer);
324       }
325   }
326   return StringPiece();
327 }
328 
329 // Tries to escape the given code point first. If the given code point
330 // does not need to be escaped, but force_output is true, then render
331 // the given multi-byte code point in UTF8 in the buffer and returns it.
EscapeCodePoint(uint32 cp,char * buffer,bool force_output)332 StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
333   StringPiece sp = EscapeCodePoint(cp, buffer);
334   if (force_output && sp.empty()) {
335     buffer[5] = (cp & 0x3f) | 0x80;
336     cp >>= 6;
337     if (cp <= 0x1f) {
338       buffer[4] = cp | 0xc0;
339       sp.set(buffer + 4, 2);
340       return sp;
341     }
342     buffer[4] = (cp & 0x3f) | 0x80;
343     cp >>= 6;
344     if (cp <= 0x0f) {
345       buffer[3] = cp | 0xe0;
346       sp.set(buffer + 3, 3);
347       return sp;
348     }
349     buffer[3] = (cp & 0x3f) | 0x80;
350     buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
351     sp.set(buffer + 2, 4);
352   }
353   return sp;
354 }
355 
356 }  // namespace
357 
Escape(strings::ByteSource * input,strings::ByteSink * output)358 void JsonEscaping::Escape(strings::ByteSource* input,
359                           strings::ByteSink* output) {
360   char buffer[12] = "\\udead\\ubee";
361   uint32 cp = 0;     // Current unicode code point.
362   int num_left = 0;  // Num of chars to read to complete the code point.
363   while (input->Available() > 0) {
364     StringPiece str = input->Peek();
365     StringPiece escaped;
366     int i = 0;
367     int num_read;
368     bool ok;
369     bool cp_was_split = num_left > 0;
370     // Loop until we encounter either
371     //   i) a code point that needs to be escaped; or
372     //  ii) a split code point is completely read; or
373     // iii) a character that is not a valid utf8; or
374     //  iv) end of the StringPiece str is reached.
375     do {
376       ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
377       if (num_left > 0 || !ok) break;  // case iii or iv
378       escaped = EscapeCodePoint(cp, buffer, cp_was_split);
379       if (!escaped.empty()) break;     // case i or ii
380       i += num_read;
381       num_read = 0;
382     } while (i < str.length());        // case iv
383     // First copy the un-escaped prefix, if any, to the output ByteSink.
384     if (i > 0) input->CopyTo(output, i);
385     if (num_read > 0) input->Skip(num_read);
386     if (!ok) {
387       // Case iii: Report error.
388       // TODO(wpoon): Add error reporting.
389       num_left = 0;
390     } else if (num_left == 0 && !escaped.empty()) {
391       // Case i or ii: Append the escaped code point to the output ByteSink.
392       output->Append(escaped.data(), escaped.size());
393     }
394   }
395   if (num_left > 0) {
396     // Treat as case iii: report error.
397     // TODO(wpoon): Add error reporting.
398   }
399 }
400 
401 }  // namespace converter
402 }  // namespace util
403 }  // namespace protobuf
404 }  // namespace google
405