1 // Copyright 2013-2016 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! Parser and serializer for the [`application/x-www-form-urlencoded` syntax](
10 //! http://url.spec.whatwg.org/#application/x-www-form-urlencoded),
11 //! as used by HTML forms.
12 //!
//! Converts between a string (such as a URL’s query string)
14 //! and a sequence of (name, value) pairs.
15 
16 #[macro_use]
17 extern crate matches;
18 
19 use percent_encoding::{percent_decode, percent_encode_byte};
20 use std::borrow::{Borrow, Cow};
21 use std::str;
22 
/// Convert a byte string in the `application/x-www-form-urlencoded` syntax
/// into an iterator of (name, value) pairs.
///
/// Use `parse(input.as_bytes())` to parse a `&str` string.
///
/// The names and values are percent-decoded. For instance, `%23first=%25try%25` will be
/// converted to `[("#first", "%try%")]`.
///
/// No work is done here: the returned `Parse` only stores `input`, and pairs
/// are split off and decoded one at a time as the iterator is advanced.
#[inline]
pub fn parse(input: &[u8]) -> Parse<'_> {
    Parse { input }
}
/// The return type of `parse()`.
///
/// An iterator over the (name, value) pairs of a form-urlencoded byte string.
/// It only holds the not-yet-consumed tail of the input, so it is cheap to
/// copy, and a copy resumes from the same position as the original.
#[derive(Copy, Clone, Debug)]
pub struct Parse<'a> {
    /// Remaining input that has not been split into pairs yet.
    input: &'a [u8],
}
39 
40 impl<'a> Iterator for Parse<'a> {
41     type Item = (Cow<'a, str>, Cow<'a, str>);
42 
next(&mut self) -> Option<Self::Item>43     fn next(&mut self) -> Option<Self::Item> {
44         loop {
45             if self.input.is_empty() {
46                 return None;
47             }
48             let mut split2 = self.input.splitn(2, |&b| b == b'&');
49             let sequence = split2.next().unwrap();
50             self.input = split2.next().unwrap_or(&[][..]);
51             if sequence.is_empty() {
52                 continue;
53             }
54             let mut split2 = sequence.splitn(2, |&b| b == b'=');
55             let name = split2.next().unwrap();
56             let value = split2.next().unwrap_or(&[][..]);
57             return Some((decode(name), decode(value)));
58         }
59     }
60 }
61 
decode(input: &[u8]) -> Cow<'_, str>62 fn decode(input: &[u8]) -> Cow<'_, str> {
63     let replaced = replace_plus(input);
64     decode_utf8_lossy(match percent_decode(&replaced).into() {
65         Cow::Owned(vec) => Cow::Owned(vec),
66         Cow::Borrowed(_) => replaced,
67     })
68 }
69 
/// Replace b'+' with b' '
///
/// Borrows `input` unchanged when it contains no `+`; otherwise returns an
/// owned copy in which every `+` byte has been turned into a space.
fn replace_plus(input: &[u8]) -> Cow<'_, [u8]> {
    if !input.contains(&b'+') {
        return Cow::Borrowed(input);
    }
    let replaced: Vec<u8> = input
        .iter()
        .map(|&byte| if byte == b'+' { b' ' } else { byte })
        .collect();
    Cow::Owned(replaced)
}
86 
impl<'a> Parse<'a> {
    /// Return a new iterator that yields pairs of `String` instead of pairs of `Cow<str>`.
    ///
    /// Takes the `Parse` by value and wraps it; each pair is converted to owned
    /// strings only as the returned iterator is advanced.
    pub fn into_owned(self) -> ParseIntoOwned<'a> {
        ParseIntoOwned { inner: self }
    }
}
93 
/// Like `Parse`, but yields pairs of `String` instead of pairs of `Cow<str>`.
///
/// Created by `Parse::into_owned()`.
pub struct ParseIntoOwned<'a> {
    /// The borrowing iterator whose items are converted to owned strings.
    inner: Parse<'a>,
}
98 
99 impl<'a> Iterator for ParseIntoOwned<'a> {
100     type Item = (String, String);
101 
next(&mut self) -> Option<Self::Item>102     fn next(&mut self) -> Option<Self::Item> {
103         self.inner
104             .next()
105             .map(|(k, v)| (k.into_owned(), v.into_owned()))
106     }
107 }
108 
/// The [`application/x-www-form-urlencoded` byte serializer](
/// https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer).
///
/// Return an iterator of `&str` slices.
///
/// Nothing is encoded up front: the returned `ByteSerialize` only borrows
/// `input` and produces its pieces as it is iterated. Concatenating the
/// pieces gives the serialized form of `input`.
pub fn byte_serialize(input: &[u8]) -> ByteSerialize<'_> {
    ByteSerialize { bytes: input }
}
116 
/// Return value of `byte_serialize()`.
///
/// An iterator of `&str` pieces of the serialized output.
#[derive(Debug)]
pub struct ByteSerialize<'a> {
    /// Input bytes that have not been serialized yet.
    bytes: &'a [u8],
}
122 
/// Whether `byte` is copied to the output as-is by the byte serializer:
/// ASCII letters and digits plus `*`, `-`, `.` and `_`.
fn byte_serialized_unchanged(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'*' | b'-' | b'.' | b'_')
}
126 
127 impl<'a> Iterator for ByteSerialize<'a> {
128     type Item = &'a str;
129 
next(&mut self) -> Option<&'a str>130     fn next(&mut self) -> Option<&'a str> {
131         if let Some((&first, tail)) = self.bytes.split_first() {
132             if !byte_serialized_unchanged(first) {
133                 self.bytes = tail;
134                 return Some(if first == b' ' {
135                     "+"
136                 } else {
137                     percent_encode_byte(first)
138                 });
139             }
140             let position = tail.iter().position(|&b| !byte_serialized_unchanged(b));
141             let (unchanged_slice, remaining) = match position {
142                 // 1 for first_byte + i unchanged in tail
143                 Some(i) => self.bytes.split_at(1 + i),
144                 None => (self.bytes, &[][..]),
145             };
146             self.bytes = remaining;
147             // This unsafe is appropriate because we have already checked these
148             // bytes in byte_serialized_unchanged, which checks for a subset
149             // of UTF-8. So we know these bytes are valid UTF-8, and doing
150             // another UTF-8 check would be wasteful.
151             Some(unsafe { str::from_utf8_unchecked(unchanged_slice) })
152         } else {
153             None
154         }
155     }
156 
size_hint(&self) -> (usize, Option<usize>)157     fn size_hint(&self) -> (usize, Option<usize>) {
158         if self.bytes.is_empty() {
159             (0, Some(0))
160         } else {
161             (1, Some(self.bytes.len()))
162         }
163     }
164 }
165 
/// The [`application/x-www-form-urlencoded` serializer](
/// https://url.spec.whatwg.org/#concept-urlencoded-serializer).
///
/// Appends encoded names and `name=value` pairs, separated by `&`, to the
/// string provided by a `Target`.
pub struct Serializer<'a, T: Target> {
    /// The output being written to; `None` once `finish()` has taken it.
    target: Option<T>,
    /// Byte offset in the target string where this serializer's output starts.
    /// `clear()` truncates back to this offset, and a `&` separator is only
    /// inserted when the string is longer than it.
    start_position: usize,
    /// Optional custom encoder applied to names and values before percent-encoding.
    encoding: EncodingOverride<'a>,
}
173 
/// A destination that a `Serializer` writes into.
///
/// Implemented in this file for `String` and `&mut String`.
pub trait Target {
    /// Give mutable access to the string that serialized output is appended to.
    fn as_mut_string(&mut self) -> &mut String;
    /// Consume the target and produce the value returned by `Serializer::finish()`.
    fn finish(self) -> Self::Finished;
    /// The type produced by `finish()`.
    type Finished;
}
179 
/// An owned `String` target: the serializer writes into the string it owns
/// and `finish()` hands that same string back.
impl Target for String {
    fn as_mut_string(&mut self) -> &mut String {
        self
    }
    fn finish(self) -> Self {
        self
    }
    type Finished = Self;
}
189 
/// A borrowed `String` target: the serializer appends to the caller's string
/// in place and `finish()` returns the same mutable reference.
impl<'a> Target for &'a mut String {
    fn as_mut_string(&mut self) -> &mut String {
        // `self` is `&mut &'a mut String`; reborrow the inner string.
        &mut **self
    }
    fn finish(self) -> Self {
        self
    }
    type Finished = Self;
}
199 
200 impl<'a, T: Target> Serializer<'a, T> {
201     /// Create a new `application/x-www-form-urlencoded` serializer for the given target.
202     ///
203     /// If the target is non-empty,
204     /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax.
new(target: T) -> Self205     pub fn new(target: T) -> Self {
206         Self::for_suffix(target, 0)
207     }
208 
209     /// Create a new `application/x-www-form-urlencoded` serializer
210     /// for a suffix of the given target.
211     ///
212     /// If that suffix is non-empty,
213     /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax.
for_suffix(mut target: T, start_position: usize) -> Self214     pub fn for_suffix(mut target: T, start_position: usize) -> Self {
215         if target.as_mut_string().len() < start_position {
216             panic!(
217                 "invalid length {} for target of length {}",
218                 start_position,
219                 target.as_mut_string().len()
220             );
221         }
222 
223         Serializer {
224             target: Some(target),
225             start_position,
226             encoding: None,
227         }
228     }
229 
230     /// Remove any existing name/value pair.
231     ///
232     /// Panics if called after `.finish()`.
clear(&mut self) -> &mut Self233     pub fn clear(&mut self) -> &mut Self {
234         string(&mut self.target).truncate(self.start_position);
235         self
236     }
237 
238     /// Set the character encoding to be used for names and values before percent-encoding.
encoding_override(&mut self, new: EncodingOverride<'a>) -> &mut Self239     pub fn encoding_override(&mut self, new: EncodingOverride<'a>) -> &mut Self {
240         self.encoding = new;
241         self
242     }
243 
244     /// Serialize and append a name/value pair.
245     ///
246     /// Panics if called after `.finish()`.
append_pair(&mut self, name: &str, value: &str) -> &mut Self247     pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self {
248         append_pair(
249             string(&mut self.target),
250             self.start_position,
251             self.encoding,
252             name,
253             value,
254         );
255         self
256     }
257 
258     /// Serialize and append a name of parameter without any value.
259     ///
260     /// Panics if called after `.finish()`.
append_key_only(&mut self, name: &str) -> &mut Self261     pub fn append_key_only(&mut self, name: &str) -> &mut Self {
262         append_key_only(
263             string(&mut self.target),
264             self.start_position,
265             self.encoding,
266             name,
267         );
268         self
269     }
270 
271     /// Serialize and append a number of name/value pairs.
272     ///
273     /// This simply calls `append_pair` repeatedly.
274     /// This can be more convenient, so the user doesn’t need to introduce a block
275     /// to limit the scope of `Serializer`’s borrow of its string.
276     ///
277     /// Panics if called after `.finish()`.
extend_pairs<I, K, V>(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef<str>, V: AsRef<str>,278     pub fn extend_pairs<I, K, V>(&mut self, iter: I) -> &mut Self
279     where
280         I: IntoIterator,
281         I::Item: Borrow<(K, V)>,
282         K: AsRef<str>,
283         V: AsRef<str>,
284     {
285         {
286             let string = string(&mut self.target);
287             for pair in iter {
288                 let &(ref k, ref v) = pair.borrow();
289                 append_pair(
290                     string,
291                     self.start_position,
292                     self.encoding,
293                     k.as_ref(),
294                     v.as_ref(),
295                 );
296             }
297         }
298         self
299     }
300 
301     /// Serialize and append a number of names without values.
302     ///
303     /// This simply calls `append_key_only` repeatedly.
304     /// This can be more convenient, so the user doesn’t need to introduce a block
305     /// to limit the scope of `Serializer`’s borrow of its string.
306     ///
307     /// Panics if called after `.finish()`.
extend_keys_only<I, K>(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<K>, K: AsRef<str>,308     pub fn extend_keys_only<I, K>(&mut self, iter: I) -> &mut Self
309     where
310         I: IntoIterator,
311         I::Item: Borrow<K>,
312         K: AsRef<str>,
313     {
314         {
315             let string = string(&mut self.target);
316             for key in iter {
317                 let k = key.borrow().as_ref();
318                 append_key_only(string, self.start_position, self.encoding, k);
319             }
320         }
321         self
322     }
323 
324     /// If this serializer was constructed with a string, take and return that string.
325     ///
326     /// ```rust
327     /// use form_urlencoded;
328     /// let encoded: String = form_urlencoded::Serializer::new(String::new())
329     ///     .append_pair("foo", "bar & baz")
330     ///     .append_pair("saison", "Été+hiver")
331     ///     .finish();
332     /// assert_eq!(encoded, "foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver");
333     /// ```
334     ///
335     /// Panics if called more than once.
finish(&mut self) -> T::Finished336     pub fn finish(&mut self) -> T::Finished {
337         self.target
338             .take()
339             .expect("url::form_urlencoded::Serializer double finish")
340             .finish()
341     }
342 }
343 
/// Push a `&` onto `string` when something has already been written past
/// `start_position`, so that the next name or pair is separated from it.
fn append_separator_if_needed(string: &mut String, start_position: usize) {
    let has_serialized_content = string.len() > start_position;
    if has_serialized_content {
        string.push('&');
    }
}
349 
string<T: Target>(target: &mut Option<T>) -> &mut String350 fn string<T: Target>(target: &mut Option<T>) -> &mut String {
351     target
352         .as_mut()
353         .expect("url::form_urlencoded::Serializer finished")
354         .as_mut_string()
355 }
356 
append_pair( string: &mut String, start_position: usize, encoding: EncodingOverride<'_>, name: &str, value: &str, )357 fn append_pair(
358     string: &mut String,
359     start_position: usize,
360     encoding: EncodingOverride<'_>,
361     name: &str,
362     value: &str,
363 ) {
364     append_separator_if_needed(string, start_position);
365     append_encoded(name, string, encoding);
366     string.push('=');
367     append_encoded(value, string, encoding);
368 }
369 
append_key_only( string: &mut String, start_position: usize, encoding: EncodingOverride, name: &str, )370 fn append_key_only(
371     string: &mut String,
372     start_position: usize,
373     encoding: EncodingOverride,
374     name: &str,
375 ) {
376     append_separator_if_needed(string, start_position);
377     append_encoded(name, string, encoding);
378 }
379 
append_encoded(s: &str, string: &mut String, encoding: EncodingOverride<'_>)380 fn append_encoded(s: &str, string: &mut String, encoding: EncodingOverride<'_>) {
381     string.extend(byte_serialize(&encode(encoding, s)))
382 }
383 
encode<'a>(encoding_override: EncodingOverride<'_>, input: &'a str) -> Cow<'a, [u8]>384 pub(crate) fn encode<'a>(encoding_override: EncodingOverride<'_>, input: &'a str) -> Cow<'a, [u8]> {
385     if let Some(o) = encoding_override {
386         return o(input);
387     }
388     input.as_bytes().into()
389 }
390 
/// Convert bytes to text, replacing invalid UTF-8 sequences with U+FFFD.
///
/// Borrowed input goes straight to `String::from_utf8_lossy`, so valid input
/// stays borrowed. Owned input that is valid UTF-8 is converted in place,
/// reusing its allocation; owned input that is invalid is repaired into a new
/// `String`.
pub(crate) fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Note: This function is duplicated in `percent_encoding/lib.rs`.
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => match String::from_utf8(bytes) {
            // Valid UTF-8: `from_utf8` takes over the buffer, so no copy and
            // no `unsafe` is needed.
            Ok(text) => Cow::Owned(text),
            // Invalid UTF-8: the error hands the original bytes back so the
            // bad sequences can be replaced.
            Err(invalid) => Cow::Owned(String::from_utf8_lossy(invalid.as_bytes()).into_owned()),
        },
    }
}
419 
/// An optional function that encodes a `&str` into bytes, applied to names
/// and values before they are percent-encoded.
///
/// When it is `None`, the string's own UTF-8 bytes are used.
pub type EncodingOverride<'a> = Option<&'a dyn Fn(&str) -> Cow<'_, [u8]>>;
421