1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/uri.h"
6 
7 #include "src/char-predicates-inl.h"
8 #include "src/handles.h"
9 #include "src/isolate-inl.h"
10 #include "src/list.h"
11 #include "src/string-search.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 namespace {  // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(uc16 c)17 bool IsReservedPredicate(uc16 c) {
18   switch (c) {
19     case '#':
20     case '$':
21     case '&':
22     case '+':
23     case ',':
24     case '/':
25     case ':':
26     case ';':
27     case '=':
28     case '?':
29     case '@':
30       return true;
31     default:
32       return false;
33   }
34 }
35 
// Returns true exactly when |length| is 3 and |octets| holds 0xEF 0xBF 0xBD,
// the UTF-8 encoding of the replacement character U+FFFD from the Unicode
// Specials table. |octets| is only read when |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xef && octets[1] == 0xbf &&
         octets[2] == 0xbd;
}
45 
DecodeOctets(const uint8_t * octets,int length,List<uc16> * buffer)46 bool DecodeOctets(const uint8_t* octets, int length, List<uc16>* buffer) {
47   size_t cursor = 0;
48   uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
49   if (value == unibrow::Utf8::kBadChar &&
50       !IsReplacementCharacter(octets, length)) {
51     return false;
52   }
53 
54   if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
55     buffer->Add(value);
56   } else {
57     buffer->Add(unibrow::Utf16::LeadSurrogate(value));
58     buffer->Add(unibrow::Utf16::TrailSurrogate(value));
59   }
60   return true;
61 }
62 
TwoDigitHex(uc16 character1,uc16 character2)63 int TwoDigitHex(uc16 character1, uc16 character2) {
64   if (character1 > 'f') return -1;
65   int high = HexValue(character1);
66   if (high == -1) return -1;
67   if (character2 > 'f') return -1;
68   int low = HexValue(character2);
69   if (low == -1) return -1;
70   return (high << 4) + low;
71 }
72 
73 template <typename T>
AddToBuffer(uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,List<T> * buffer)74 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
75                  bool is_uri, List<T>* buffer) {
76   if (is_uri && IsReservedPredicate(decoded)) {
77     buffer->Add('%');
78     uc16 first = uri_content->Get(index + 1);
79     uc16 second = uri_content->Get(index + 2);
80     DCHECK_GT(std::numeric_limits<T>::max(), first);
81     DCHECK_GT(std::numeric_limits<T>::max(), second);
82 
83     buffer->Add(first);
84     buffer->Add(second);
85   } else {
86     buffer->Add(decoded);
87   }
88 }
89 
IntoTwoByte(int index,bool is_uri,int uri_length,String::FlatContent * uri_content,List<uc16> * buffer)90 bool IntoTwoByte(int index, bool is_uri, int uri_length,
91                  String::FlatContent* uri_content, List<uc16>* buffer) {
92   for (int k = index; k < uri_length; k++) {
93     uc16 code = uri_content->Get(k);
94     if (code == '%') {
95       int two_digits;
96       if (k + 2 >= uri_length ||
97           (two_digits = TwoDigitHex(uri_content->Get(k + 1),
98                                     uri_content->Get(k + 2))) < 0) {
99         return false;
100       }
101       k += 2;
102       uc16 decoded = static_cast<uc16>(two_digits);
103       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
104         uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
105         octets[0] = decoded;
106 
107         int number_of_continuation_bytes = 0;
108         while ((decoded << ++number_of_continuation_bytes) & 0x80) {
109           if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
110             return false;
111           }
112           if (uri_content->Get(++k) != '%' ||
113               (two_digits = TwoDigitHex(uri_content->Get(k + 1),
114                                         uri_content->Get(k + 2))) < 0) {
115             return false;
116           }
117           k += 2;
118           uc16 continuation_byte = static_cast<uc16>(two_digits);
119           octets[number_of_continuation_bytes] = continuation_byte;
120         }
121 
122         if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
123           return false;
124         }
125       } else {
126         AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
127       }
128     } else {
129       buffer->Add(code);
130     }
131   }
132   return true;
133 }
134 
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,List<uint8_t> * one_byte_buffer,List<uc16> * two_byte_buffer)135 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
136                        List<uint8_t>* one_byte_buffer,
137                        List<uc16>* two_byte_buffer) {
138   DisallowHeapAllocation no_gc;
139   String::FlatContent uri_content = uri->GetFlatContent();
140 
141   int uri_length = uri->length();
142   for (int k = 0; k < uri_length; k++) {
143     uc16 code = uri_content.Get(k);
144     if (code == '%') {
145       int two_digits;
146       if (k + 2 >= uri_length ||
147           (two_digits = TwoDigitHex(uri_content.Get(k + 1),
148                                     uri_content.Get(k + 2))) < 0) {
149         return false;
150       }
151 
152       uc16 decoded = static_cast<uc16>(two_digits);
153       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
154         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
155                            two_byte_buffer);
156       }
157 
158       AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
159       k += 2;
160     } else {
161       if (code > unibrow::Utf8::kMaxOneByteChar) {
162         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
163                            two_byte_buffer);
164       }
165       one_byte_buffer->Add(code);
166     }
167   }
168   return true;
169 }
170 
171 }  // anonymous namespace
172 
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)173 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
174                                 bool is_uri) {
175   uri = String::Flatten(uri);
176   List<uint8_t> one_byte_buffer;
177   List<uc16> two_byte_buffer;
178 
179   if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
180     THROW_NEW_ERROR(isolate, NewURIError(), String);
181   }
182 
183   if (two_byte_buffer.is_empty()) {
184     return isolate->factory()->NewStringFromOneByte(
185         one_byte_buffer.ToConstVector());
186   }
187 
188   Handle<SeqTwoByteString> result;
189   ASSIGN_RETURN_ON_EXCEPTION(
190       isolate, result, isolate->factory()->NewRawTwoByteString(
191                            one_byte_buffer.length() + two_byte_buffer.length()),
192       String);
193 
194   CopyChars(result->GetChars(), one_byte_buffer.ToConstVector().start(),
195             one_byte_buffer.length());
196   CopyChars(result->GetChars() + one_byte_buffer.length(),
197             two_byte_buffer.ToConstVector().start(), two_byte_buffer.length());
198 
199   return result;
200 }
201 
202 namespace {  // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(uc16 c)203 bool IsUnescapePredicateInUriComponent(uc16 c) {
204   if (IsAlphaNumeric(c)) {
205     return true;
206   }
207 
208   switch (c) {
209     case '!':
210     case '\'':
211     case '(':
212     case ')':
213     case '*':
214     case '-':
215     case '.':
216     case '_':
217     case '~':
218       return true;
219     default:
220       return false;
221   }
222 }
223 
IsUriSeparator(uc16 c)224 bool IsUriSeparator(uc16 c) {
225   switch (c) {
226     case '#':
227     case ':':
228     case ';':
229     case '/':
230     case '?':
231     case '$':
232     case '&':
233     case '+':
234     case ',':
235     case '@':
236     case '=':
237       return true;
238     default:
239       return false;
240   }
241 }
242 
AddEncodedOctetToBuffer(uint8_t octet,List<uint8_t> * buffer)243 void AddEncodedOctetToBuffer(uint8_t octet, List<uint8_t>* buffer) {
244   buffer->Add('%');
245   buffer->Add(HexCharOfValue(octet >> 4));
246   buffer->Add(HexCharOfValue(octet & 0x0F));
247 }
248 
EncodeSingle(uc16 c,List<uint8_t> * buffer)249 void EncodeSingle(uc16 c, List<uint8_t>* buffer) {
250   char s[4] = {};
251   int number_of_bytes;
252   number_of_bytes =
253       unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
254   for (int k = 0; k < number_of_bytes; k++) {
255     AddEncodedOctetToBuffer(s[k], buffer);
256   }
257 }
258 
EncodePair(uc16 cc1,uc16 cc2,List<uint8_t> * buffer)259 void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {
260   char s[4] = {};
261   int number_of_bytes =
262       unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
263                             unibrow::Utf16::kNoPreviousCharacter, false);
264   for (int k = 0; k < number_of_bytes; k++) {
265     AddEncodedOctetToBuffer(s[k], buffer);
266   }
267 }
268 
269 }  // anonymous namespace
270 
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)271 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
272                                 bool is_uri) {
273   uri = String::Flatten(uri);
274   int uri_length = uri->length();
275   List<uint8_t> buffer(uri_length);
276 
277   {
278     DisallowHeapAllocation no_gc;
279     String::FlatContent uri_content = uri->GetFlatContent();
280 
281     for (int k = 0; k < uri_length; k++) {
282       uc16 cc1 = uri_content.Get(k);
283       if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
284         k++;
285         if (k < uri_length) {
286           uc16 cc2 = uri->Get(k);
287           if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
288             EncodePair(cc1, cc2, &buffer);
289             continue;
290           }
291         }
292       } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
293         if (IsUnescapePredicateInUriComponent(cc1) ||
294             (is_uri && IsUriSeparator(cc1))) {
295           buffer.Add(cc1);
296         } else {
297           EncodeSingle(cc1, &buffer);
298         }
299         continue;
300       }
301 
302       AllowHeapAllocation allocate_error_and_return;
303       THROW_NEW_ERROR(isolate, NewURIError(), String);
304     }
305   }
306 
307   return isolate->factory()->NewStringFromOneByte(buffer.ToConstVector());
308 }
309 
310 namespace {  // Anonymous namespace for Escape and Unescape
311 
312 template <typename Char>
UnescapeChar(Vector<const Char> vector,int i,int length,int * step)313 int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
314   uint16_t character = vector[i];
315   int32_t hi = 0;
316   int32_t lo = 0;
317   if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
318       (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
319       (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
320     *step = 6;
321     return (hi << 8) + lo;
322   } else if (character == '%' && i <= length - 3 &&
323              (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
324     *step = 3;
325     return lo;
326   } else {
327     *step = 1;
328     return character;
329   }
330 }
331 
332 template <typename Char>
UnescapeSlow(Isolate * isolate,Handle<String> string,int start_index)333 MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
334                                  int start_index) {
335   bool one_byte = true;
336   int length = string->length();
337 
338   int unescaped_length = 0;
339   {
340     DisallowHeapAllocation no_allocation;
341     Vector<const Char> vector = string->GetCharVector<Char>();
342     for (int i = start_index; i < length; unescaped_length++) {
343       int step;
344       if (UnescapeChar(vector, i, length, &step) >
345           String::kMaxOneByteCharCode) {
346         one_byte = false;
347       }
348       i += step;
349     }
350   }
351 
352   DCHECK(start_index < length);
353   Handle<String> first_part =
354       isolate->factory()->NewProperSubString(string, 0, start_index);
355 
356   int dest_position = 0;
357   Handle<String> second_part;
358   DCHECK(unescaped_length <= String::kMaxLength);
359   if (one_byte) {
360     Handle<SeqOneByteString> dest = isolate->factory()
361                                         ->NewRawOneByteString(unescaped_length)
362                                         .ToHandleChecked();
363     DisallowHeapAllocation no_allocation;
364     Vector<const Char> vector = string->GetCharVector<Char>();
365     for (int i = start_index; i < length; dest_position++) {
366       int step;
367       dest->SeqOneByteStringSet(dest_position,
368                                 UnescapeChar(vector, i, length, &step));
369       i += step;
370     }
371     second_part = dest;
372   } else {
373     Handle<SeqTwoByteString> dest = isolate->factory()
374                                         ->NewRawTwoByteString(unescaped_length)
375                                         .ToHandleChecked();
376     DisallowHeapAllocation no_allocation;
377     Vector<const Char> vector = string->GetCharVector<Char>();
378     for (int i = start_index; i < length; dest_position++) {
379       int step;
380       dest->SeqTwoByteStringSet(dest_position,
381                                 UnescapeChar(vector, i, length, &step));
382       i += step;
383     }
384     second_part = dest;
385   }
386   return isolate->factory()->NewConsString(first_part, second_part);
387 }
388 
IsNotEscaped(uint16_t c)389 bool IsNotEscaped(uint16_t c) {
390   if (IsAlphaNumeric(c)) {
391     return true;
392   }
393   //  @*_+-./
394   switch (c) {
395     case '@':
396     case '*':
397     case '_':
398     case '+':
399     case '-':
400     case '.':
401     case '/':
402       return true;
403     default:
404       return false;
405   }
406 }
407 
408 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)409 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
410                                            Handle<String> source) {
411   int index;
412   {
413     DisallowHeapAllocation no_allocation;
414     StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
415     index = search.Search(source->GetCharVector<Char>(), 0);
416     if (index < 0) return source;
417   }
418   return UnescapeSlow<Char>(isolate, source, index);
419 }
420 
421 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)422 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
423                                          Handle<String> string) {
424   DCHECK(string->IsFlat());
425   int escaped_length = 0;
426   int length = string->length();
427 
428   {
429     DisallowHeapAllocation no_allocation;
430     Vector<const Char> vector = string->GetCharVector<Char>();
431     for (int i = 0; i < length; i++) {
432       uint16_t c = vector[i];
433       if (c >= 256) {
434         escaped_length += 6;
435       } else if (IsNotEscaped(c)) {
436         escaped_length++;
437       } else {
438         escaped_length += 3;
439       }
440 
441       // We don't allow strings that are longer than a maximal length.
442       DCHECK(String::kMaxLength < 0x7fffffff - 6);     // Cannot overflow.
443       if (escaped_length > String::kMaxLength) break;  // Provoke exception.
444     }
445   }
446 
447   // No length change implies no change.  Return original string if no change.
448   if (escaped_length == length) return string;
449 
450   Handle<SeqOneByteString> dest;
451   ASSIGN_RETURN_ON_EXCEPTION(
452       isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
453       String);
454   int dest_position = 0;
455 
456   {
457     DisallowHeapAllocation no_allocation;
458     Vector<const Char> vector = string->GetCharVector<Char>();
459     for (int i = 0; i < length; i++) {
460       uint16_t c = vector[i];
461       if (c >= 256) {
462         dest->SeqOneByteStringSet(dest_position, '%');
463         dest->SeqOneByteStringSet(dest_position + 1, 'u');
464         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
465         dest->SeqOneByteStringSet(dest_position + 3,
466                                   HexCharOfValue((c >> 8) & 0xf));
467         dest->SeqOneByteStringSet(dest_position + 4,
468                                   HexCharOfValue((c >> 4) & 0xf));
469         dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf));
470         dest_position += 6;
471       } else if (IsNotEscaped(c)) {
472         dest->SeqOneByteStringSet(dest_position, c);
473         dest_position++;
474       } else {
475         dest->SeqOneByteStringSet(dest_position, '%');
476         dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
477         dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf));
478         dest_position += 3;
479       }
480     }
481   }
482 
483   return dest;
484 }
485 
486 }  // Anonymous namespace
487 
Escape(Isolate * isolate,Handle<String> string)488 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
489   Handle<String> result;
490   string = String::Flatten(string);
491   return string->IsOneByteRepresentationUnderneath()
492              ? EscapePrivate<uint8_t>(isolate, string)
493              : EscapePrivate<uc16>(isolate, string);
494 }
495 
Unescape(Isolate * isolate,Handle<String> string)496 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
497   Handle<String> result;
498   string = String::Flatten(string);
499   return string->IsOneByteRepresentationUnderneath()
500              ? UnescapePrivate<uint8_t>(isolate, string)
501              : UnescapePrivate<uc16>(isolate, string);
502 }
503 
504 }  // namespace internal
505 }  // namespace v8
506