1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/uri.h"
6 
7 #include <vector>
8 
9 #include "src/char-predicates-inl.h"
10 #include "src/isolate-inl.h"
11 #include "src/string-search.h"
12 #include "src/unicode-inl.h"
13 
14 namespace v8 {
15 namespace internal {
16 
17 namespace {  // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(uc16 c)18 bool IsReservedPredicate(uc16 c) {
19   switch (c) {
20     case '#':
21     case '$':
22     case '&':
23     case '+':
24     case ',':
25     case '/':
26     case ':':
27     case ';':
28     case '=':
29     case '?':
30     case '@':
31       return true;
32     default:
33       return false;
34   }
35 }
36 
// Reports whether the |length| bytes at |octets| are exactly 0xEF 0xBF 0xBD,
// the UTF-8 encoding of the replacement character U+FFFD.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  // The length test comes first so that inputs of any other size are never
  // indexed.
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46 
DecodeOctets(const uint8_t * octets,int length,std::vector<uc16> * buffer)47 bool DecodeOctets(const uint8_t* octets, int length,
48                   std::vector<uc16>* buffer) {
49   size_t cursor = 0;
50   uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51   if (value == unibrow::Utf8::kBadChar &&
52       !IsReplacementCharacter(octets, length)) {
53     return false;
54   }
55 
56   if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
57     buffer->push_back(value);
58   } else {
59     buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
60     buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
61   }
62   return true;
63 }
64 
TwoDigitHex(uc16 character1,uc16 character2)65 int TwoDigitHex(uc16 character1, uc16 character2) {
66   if (character1 > 'f') return -1;
67   int high = HexValue(character1);
68   if (high == -1) return -1;
69   if (character2 > 'f') return -1;
70   int low = HexValue(character2);
71   if (low == -1) return -1;
72   return (high << 4) + low;
73 }
74 
75 template <typename T>
AddToBuffer(uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,std::vector<T> * buffer)76 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
77                  bool is_uri, std::vector<T>* buffer) {
78   if (is_uri && IsReservedPredicate(decoded)) {
79     buffer->push_back('%');
80     uc16 first = uri_content->Get(index + 1);
81     uc16 second = uri_content->Get(index + 2);
82     DCHECK_GT(std::numeric_limits<T>::max(), first);
83     DCHECK_GT(std::numeric_limits<T>::max(), second);
84 
85     buffer->push_back(first);
86     buffer->push_back(second);
87   } else {
88     buffer->push_back(decoded);
89   }
90 }
91 
// Decodes |uri_content| from position |index| up to |uri_length| and appends
// the result to |buffer| as two-byte code units.
//
// Characters other than '%' are copied through unchanged. A "%XX" escape whose
// value is at most unibrow::Utf8::kMaxOneByteChar is handed to AddToBuffer.
// A larger value is treated as the first octet of a multi-octet sequence whose
// remaining octets must follow as further "%XX" escapes; the collected octets
// are decoded by DecodeOctets.
//
// Returns false as soon as an escape is truncated, contains a non-hex digit,
// or the collected octets are rejected by DecodeOctets; |buffer| may already
// hold partial output in that case.
bool IntoTwoByte(int index, bool is_uri, int uri_length,
                 String::FlatContent* uri_content, std::vector<uc16>* buffer) {
  for (int k = index; k < uri_length; k++) {
    uc16 code = uri_content->Get(k);
    if (code == '%') {
      int two_digits;
      // Two more characters must exist after the '%' and both must be hex.
      if (k + 2 >= uri_length ||
          (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                    uri_content->Get(k + 2))) < 0) {
        return false;
      }
      // k now addresses the last hex digit of this escape.
      k += 2;
      uc16 decoded = static_cast<uc16>(two_digits);
      if (decoded > unibrow::Utf8::kMaxOneByteChar) {
        uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
        octets[0] = decoded;

        // Each iteration shifts the next bit of the first octet into the 0x80
        // position; every set bit found there demands one more "%XX" escape.
        // Because the counter is pre-incremented, it ends up equal to the
        // total number of octets stored (first octet included) once the loop
        // terminates.
        int number_of_continuation_bytes = 0;
        while ((decoded << ++number_of_continuation_bytes) & 0x80) {
          // Reject first octets that ask for more than three followers (this
          // also bounds the octets[] index below; assumes kMaxEncodedSize is
          // at least 4), and inputs too short to hold another "%XX".
          if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
            return false;
          }
          if (uri_content->Get(++k) != '%' ||
              (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                        uri_content->Get(k + 2))) < 0) {
            return false;
          }
          // Skip past the two hex digits that were just consumed.
          k += 2;
          uc16 continuation_byte = static_cast<uc16>(two_digits);
          octets[number_of_continuation_bytes] = continuation_byte;
        }

        if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
          return false;
        }
      } else {
        // k - 2 is the position of the '%' that opened this escape.
        AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
      }
    } else {
      buffer->push_back(code);
    }
  }
  return true;
}
136 
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,std::vector<uint8_t> * one_byte_buffer,std::vector<uc16> * two_byte_buffer)137 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
138                        std::vector<uint8_t>* one_byte_buffer,
139                        std::vector<uc16>* two_byte_buffer) {
140   DisallowHeapAllocation no_gc;
141   String::FlatContent uri_content = uri->GetFlatContent();
142 
143   int uri_length = uri->length();
144   for (int k = 0; k < uri_length; k++) {
145     uc16 code = uri_content.Get(k);
146     if (code == '%') {
147       int two_digits;
148       if (k + 2 >= uri_length ||
149           (two_digits = TwoDigitHex(uri_content.Get(k + 1),
150                                     uri_content.Get(k + 2))) < 0) {
151         return false;
152       }
153 
154       uc16 decoded = static_cast<uc16>(two_digits);
155       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
156         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
157                            two_byte_buffer);
158       }
159 
160       AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
161       k += 2;
162     } else {
163       if (code > unibrow::Utf8::kMaxOneByteChar) {
164         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
165                            two_byte_buffer);
166       }
167       one_byte_buffer->push_back(code);
168     }
169   }
170   return true;
171 }
172 
173 }  // anonymous namespace
174 
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)175 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
176                                 bool is_uri) {
177   uri = String::Flatten(isolate, uri);
178   std::vector<uint8_t> one_byte_buffer;
179   std::vector<uc16> two_byte_buffer;
180 
181   if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
182     THROW_NEW_ERROR(isolate, NewURIError(), String);
183   }
184 
185   if (two_byte_buffer.empty()) {
186     return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>(
187         one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
188   }
189 
190   Handle<SeqTwoByteString> result;
191   int result_length =
192       static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
193   ASSIGN_RETURN_ON_EXCEPTION(
194       isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
195       String);
196 
197   CopyChars(result->GetChars(), one_byte_buffer.data(), one_byte_buffer.size());
198   CopyChars(result->GetChars() + one_byte_buffer.size(), two_byte_buffer.data(),
199             two_byte_buffer.size());
200 
201   return result;
202 }
203 
204 namespace {  // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(uc16 c)205 bool IsUnescapePredicateInUriComponent(uc16 c) {
206   if (IsAlphaNumeric(c)) {
207     return true;
208   }
209 
210   switch (c) {
211     case '!':
212     case '\'':
213     case '(':
214     case ')':
215     case '*':
216     case '-':
217     case '.':
218     case '_':
219     case '~':
220       return true;
221     default:
222       return false;
223   }
224 }
225 
IsUriSeparator(uc16 c)226 bool IsUriSeparator(uc16 c) {
227   switch (c) {
228     case '#':
229     case ':':
230     case ';':
231     case '/':
232     case '?':
233     case '$':
234     case '&':
235     case '+':
236     case ',':
237     case '@':
238     case '=':
239       return true;
240     default:
241       return false;
242   }
243 }
244 
AddEncodedOctetToBuffer(uint8_t octet,std::vector<uint8_t> * buffer)245 void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
246   buffer->push_back('%');
247   buffer->push_back(HexCharOfValue(octet >> 4));
248   buffer->push_back(HexCharOfValue(octet & 0x0F));
249 }
250 
EncodeSingle(uc16 c,std::vector<uint8_t> * buffer)251 void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) {
252   char s[4] = {};
253   int number_of_bytes;
254   number_of_bytes =
255       unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
256   for (int k = 0; k < number_of_bytes; k++) {
257     AddEncodedOctetToBuffer(s[k], buffer);
258   }
259 }
260 
EncodePair(uc16 cc1,uc16 cc2,std::vector<uint8_t> * buffer)261 void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) {
262   char s[4] = {};
263   int number_of_bytes =
264       unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
265                             unibrow::Utf16::kNoPreviousCharacter, false);
266   for (int k = 0; k < number_of_bytes; k++) {
267     AddEncodedOctetToBuffer(s[k], buffer);
268   }
269 }
270 
271 }  // anonymous namespace
272 
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)273 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
274                                 bool is_uri) {
275   uri = String::Flatten(isolate, uri);
276   int uri_length = uri->length();
277   std::vector<uint8_t> buffer;
278   buffer.reserve(uri_length);
279 
280   {
281     DisallowHeapAllocation no_gc;
282     String::FlatContent uri_content = uri->GetFlatContent();
283 
284     for (int k = 0; k < uri_length; k++) {
285       uc16 cc1 = uri_content.Get(k);
286       if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
287         k++;
288         if (k < uri_length) {
289           uc16 cc2 = uri->Get(k);
290           if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
291             EncodePair(cc1, cc2, &buffer);
292             continue;
293           }
294         }
295       } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
296         if (IsUnescapePredicateInUriComponent(cc1) ||
297             (is_uri && IsUriSeparator(cc1))) {
298           buffer.push_back(cc1);
299         } else {
300           EncodeSingle(cc1, &buffer);
301         }
302         continue;
303       }
304 
305       AllowHeapAllocation allocate_error_and_return;
306       THROW_NEW_ERROR(isolate, NewURIError(), String);
307     }
308   }
309 
310   return isolate->factory()->NewStringFromOneByte(
311       Vector<const uint8_t>(buffer.data(), static_cast<int>(buffer.size())));
312 }
313 
314 namespace {  // Anonymous namespace for Escape and Unescape
315 
316 template <typename Char>
UnescapeChar(Vector<const Char> vector,int i,int length,int * step)317 int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
318   uint16_t character = vector[i];
319   int32_t hi = 0;
320   int32_t lo = 0;
321   if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
322       (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
323       (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
324     *step = 6;
325     return (hi << 8) + lo;
326   } else if (character == '%' && i <= length - 3 &&
327              (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
328     *step = 3;
329     return lo;
330   } else {
331     *step = 1;
332     return character;
333   }
334 }
335 
// Builds the unescaped form of |string|, given that everything before
// |start_index| needs no change.
//
// The result is a cons string: the untouched prefix [0, start_index) followed
// by a freshly allocated sequential string that holds the unescaped remainder.
// The remainder is stored as one-byte characters unless some unescaped value
// exceeds String::kMaxOneByteCharCode, in which case two-byte storage is used.
template <typename Char>
MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
                                 int start_index) {
  bool one_byte = true;
  int length = string->length();

  // First pass: count the output characters and find out whether any of them
  // needs two-byte storage.
  int unescaped_length = 0;
  {
    DisallowHeapAllocation no_allocation;
    Vector<const Char> vector = string->GetCharVector<Char>();
    for (int i = start_index; i < length; unescaped_length++) {
      int step;
      if (UnescapeChar(vector, i, length, &step) >
          String::kMaxOneByteCharCode) {
        one_byte = false;
      }
      // The loop advances by the width of the unit just decoded.
      i += step;
    }
  }

  DCHECK(start_index < length);
  Handle<String> first_part =
      isolate->factory()->NewProperSubString(string, 0, start_index);

  int dest_position = 0;
  Handle<String> second_part;
  DCHECK_LE(unescaped_length, String::kMaxLength);
  // Second pass: decode again, this time writing into the destination. The
  // character vector is fetched anew after each allocation rather than reused
  // from the first pass (presumably because allocation may invalidate it --
  // NOTE(review): confirm).
  if (one_byte) {
    Handle<SeqOneByteString> dest = isolate->factory()
                                        ->NewRawOneByteString(unescaped_length)
                                        .ToHandleChecked();
    DisallowHeapAllocation no_allocation;
    Vector<const Char> vector = string->GetCharVector<Char>();
    for (int i = start_index; i < length; dest_position++) {
      int step;
      dest->SeqOneByteStringSet(dest_position,
                                UnescapeChar(vector, i, length, &step));
      i += step;
    }
    second_part = dest;
  } else {
    Handle<SeqTwoByteString> dest = isolate->factory()
                                        ->NewRawTwoByteString(unescaped_length)
                                        .ToHandleChecked();
    DisallowHeapAllocation no_allocation;
    Vector<const Char> vector = string->GetCharVector<Char>();
    for (int i = start_index; i < length; dest_position++) {
      int step;
      dest->SeqTwoByteStringSet(dest_position,
                                UnescapeChar(vector, i, length, &step));
      i += step;
    }
    second_part = dest;
  }
  return isolate->factory()->NewConsString(first_part, second_part);
}
392 
IsNotEscaped(uint16_t c)393 bool IsNotEscaped(uint16_t c) {
394   if (IsAlphaNumeric(c)) {
395     return true;
396   }
397   //  @*_+-./
398   switch (c) {
399     case '@':
400     case '*':
401     case '_':
402     case '+':
403     case '-':
404     case '.':
405     case '/':
406       return true;
407     default:
408       return false;
409   }
410 }
411 
412 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)413 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
414                                            Handle<String> source) {
415   int index;
416   {
417     DisallowHeapAllocation no_allocation;
418     StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
419     index = search.Search(source->GetCharVector<Char>(), 0);
420     if (index < 0) return source;
421   }
422   return UnescapeSlow<Char>(isolate, source, index);
423 }
424 
425 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)426 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
427                                          Handle<String> string) {
428   DCHECK(string->IsFlat());
429   int escaped_length = 0;
430   int length = string->length();
431 
432   {
433     DisallowHeapAllocation no_allocation;
434     Vector<const Char> vector = string->GetCharVector<Char>();
435     for (int i = 0; i < length; i++) {
436       uint16_t c = vector[i];
437       if (c >= 256) {
438         escaped_length += 6;
439       } else if (IsNotEscaped(c)) {
440         escaped_length++;
441       } else {
442         escaped_length += 3;
443       }
444 
445       // We don't allow strings that are longer than a maximal length.
446       DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6);   // Cannot overflow.
447       if (escaped_length > String::kMaxLength) break;  // Provoke exception.
448     }
449   }
450 
451   // No length change implies no change.  Return original string if no change.
452   if (escaped_length == length) return string;
453 
454   Handle<SeqOneByteString> dest;
455   ASSIGN_RETURN_ON_EXCEPTION(
456       isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
457       String);
458   int dest_position = 0;
459 
460   {
461     DisallowHeapAllocation no_allocation;
462     Vector<const Char> vector = string->GetCharVector<Char>();
463     for (int i = 0; i < length; i++) {
464       uint16_t c = vector[i];
465       if (c >= 256) {
466         dest->SeqOneByteStringSet(dest_position, '%');
467         dest->SeqOneByteStringSet(dest_position + 1, 'u');
468         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
469         dest->SeqOneByteStringSet(dest_position + 3,
470                                   HexCharOfValue((c >> 8) & 0xF));
471         dest->SeqOneByteStringSet(dest_position + 4,
472                                   HexCharOfValue((c >> 4) & 0xF));
473         dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF));
474         dest_position += 6;
475       } else if (IsNotEscaped(c)) {
476         dest->SeqOneByteStringSet(dest_position, c);
477         dest_position++;
478       } else {
479         dest->SeqOneByteStringSet(dest_position, '%');
480         dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
481         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF));
482         dest_position += 3;
483       }
484     }
485   }
486 
487   return dest;
488 }
489 
490 }  // Anonymous namespace
491 
Escape(Isolate * isolate,Handle<String> string)492 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
493   Handle<String> result;
494   string = String::Flatten(isolate, string);
495   return string->IsOneByteRepresentationUnderneath()
496              ? EscapePrivate<uint8_t>(isolate, string)
497              : EscapePrivate<uc16>(isolate, string);
498 }
499 
Unescape(Isolate * isolate,Handle<String> string)500 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
501   Handle<String> result;
502   string = String::Flatten(isolate, string);
503   return string->IsOneByteRepresentationUnderneath()
504              ? UnescapePrivate<uint8_t>(isolate, string)
505              : UnescapePrivate<uc16>(isolate, string);
506 }
507 
508 }  // namespace internal
509 }  // namespace v8
510