1 // Copyright 2015 Nicholas Allegra (comex).
2 // Licensed under the Apache License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0> or
3 // the MIT license <http://opensource.org/licenses/MIT>, at your option. This file may not be
4 // copied, modified, or distributed except according to those terms.
5 
6 //! Same idea as (but implementation not directly based on) the Python shlex module.  However, this
7 //! implementation does not support any of the Python module's customization because it makes
8 //! parsing slower and is fairly useless.  You only get the default settings of shlex.split, which
9 //! mimic the POSIX shell:
//! <http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html>
11 //!
12 //! This implementation also deviates from the Python version in not treating \r specially, which I
13 //! believe is more compliant.
14 //!
15 //! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
16 //! directly as a micro-optimization.
17 
18 use std::borrow::Cow;
19 
/// Iterator over the words of an input string, split according to POSIX shell quoting rules.
///
/// Each call to `next` yields one fully unquoted word as an owned `String`.
pub struct Shlex<'a> {
    /// Remaining input, consumed one byte at a time.
    in_iter: std::str::Bytes<'a>,
    /// One more than the number of newline bytes consumed so far.
    pub line_no: usize,
    /// Becomes `true` when the input ends inside a quotation or directly after an unescaped
    /// backslash.  `Iterator` has no channel for reporting errors, so in that situation the
    /// unfinished token is discarded and iteration stops; inspect this flag once iteration is
    /// over to tell a clean end of input from a malformed one.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a lexer positioned at the start of `in_str` (line 1, no error recorded).
    pub fn new(in_str: &'a str) -> Self {
        Shlex {
            had_error: false,
            line_no: 1,
            in_iter: in_str.bytes(),
        }
    }

    /// Reads a single word whose first byte, `ch`, has already been consumed.
    ///
    /// The word ends at unquoted whitespace (which is consumed) or at end of input.  Returns
    /// `None` and sets `had_error` when the input stops inside a quotation or right after a
    /// backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<String> {
        let mut word: Vec<u8> = Vec::new();
        loop {
            let step = match ch {
                b'"' => self.parse_double(&mut word),
                b'\'' => self.parse_single(&mut word),
                b'\\' => match self.next_char() {
                    // backslash-newline is a line continuation and contributes nothing
                    Some(b'\n') => Ok(()),
                    Some(escaped) => {
                        word.push(escaped);
                        Ok(())
                    }
                    // a trailing backslash has nothing to escape
                    None => Err(()),
                },
                b' ' | b'\t' | b'\n' => break,
                other => {
                    word.push(other);
                    Ok(())
                }
            };
            if step.is_err() {
                self.had_error = true;
                return None;
            }
            match self.next_char() {
                Some(following) => ch = following,
                None => break,
            }
        }
        // SAFETY: the bytes originate from a `&str`.  Only ASCII bytes are ever dropped or
        // inserted, and every non-ASCII byte is copied through in its original order, so the
        // buffer is still valid UTF-8.
        unsafe { Some(String::from_utf8_unchecked(word)) }
    }

    /// Consumes the body of a double-quoted section (the opening `"` is already consumed),
    /// appending the unescaped bytes to `result`.
    ///
    /// Returns `Err(())` if the input ends before the closing `"`.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(byte) = self.next_char() {
            match byte {
                b'"' => return Ok(()),
                b'\\' => {
                    let escaped = self.next_char().ok_or(())?;
                    match escaped {
                        // \<newline> => nothing
                        b'\n' => {}
                        // \$ => $ (likewise for `, " and \)
                        b'$' | b'`' | b'"' | b'\\' => result.push(escaped),
                        // \x => \x (the backslash is kept for any other character)
                        _ => result.extend_from_slice(&[b'\\', escaped]),
                    }
                }
                _ => result.push(byte),
            }
        }
        Err(())
    }

    /// Consumes the body of a single-quoted section (the opening `'` is already consumed),
    /// copying every byte verbatim into `result` until the closing `'`.
    ///
    /// Returns `Err(())` if the input ends before the closing `'`.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(byte) = self.next_char() {
            if byte == b'\'' {
                return Ok(());
            }
            result.push(byte);
        }
        Err(())
    }

    /// Pulls the next input byte, bumping `line_no` whenever that byte is a newline.
    fn next_char(&mut self) -> Option<u8> {
        let next = self.in_iter.next();
        if let Some(b'\n') = next {
            self.line_no += 1;
        }
        next
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = String;

    /// Skips leading whitespace and `#` comments, then returns the next word, or `None` at end
    /// of input or on malformed input (see `had_error`).
    fn next(&mut self) -> Option<String> {
        while let Some(byte) = self.next_char() {
            match byte {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    // a comment runs through the end of the current line
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                first => return self.parse_word(first),
            }
        }
        // ran out of input before any word started
        None
    }
}
139 
140 /// Convenience function that consumes the whole string at once.  Returns None if the input was
141 /// erroneous.
split(in_str: &str) -> Option<Vec<String>>142 pub fn split(in_str: &str) -> Option<Vec<String>> {
143     let mut shl = Shlex::new(in_str);
144     let res = shl.by_ref().collect();
145     if shl.had_error { None } else { Some(res) }
146 }
147 
/// Given a single word, return a string suitable to encode it as a shell argument.
///
/// * The empty string is encoded as `""` so that the argument does not disappear.
/// * A word containing whitespace, a quote character, a backslash, or another character the
///   shell may treat specially is wrapped in double quotes, with `$`, `` ` ``, `"` and `\`
///   backslash-escaped inside.  `{` and `}` are part of that set because shells such as bash
///   and zsh perform brace expansion on unquoted words, which could turn one argument into
///   several.
/// * Any other word is returned unchanged, borrowed from the input.
///
/// Note that `!` is left alone; an interactive shell with history expansion enabled may still
/// interpret it.
pub fn quote(in_str: &str) -> Cow<str> {
    if in_str.is_empty() {
        return Cow::Borrowed("\"\"");
    }

    let needs_quoting = in_str.bytes().any(|c| match c {
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\'' |
        b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%' |
        b'{' | b'}' => true,
        _ => false,
    });
    if !needs_quoting {
        return Cow::Borrowed(in_str);
    }

    // Two extra bytes for the surrounding quotes; escapes may grow the string further.
    let mut out = String::with_capacity(in_str.len() + 2);
    out.push('"');
    for c in in_str.chars() {
        // These are the only characters that keep a special meaning inside double quotes.
        match c {
            '$' | '`' | '"' | '\\' => out.push('\\'),
            _ => {}
        }
        out.push(c);
    }
    out.push('"');
    Cow::Owned(out)
}
172 
173 /// Convenience function that consumes an iterable of words and turns it into a single string,
174 /// quoting words when necessary. Consecutive words will be separated by a single space.
join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> String175 pub fn join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> String {
176     words.into_iter()
177         .map(quote)
178         .collect::<Vec<_>>()
179         .join(" ")
180 }
181 
/// Table of `(input, expected)` pairs for `test_split`; `None` marks input that must be
/// rejected as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&str, Option<&[&str]>)] = &[
    // plain words, double quotes and whitespace separation
    ("foo$baz", Some(&["foo$baz"])),
    ("foo baz", Some(&["foo", "baz"])),
    ("foo\"bar\"baz", Some(&["foobarbaz"])),
    ("foo \"bar\"baz", Some(&["foo", "barbaz"])),
    ("   foo \nbar", Some(&["foo", "bar"])),
    // backslash-newline disappears both outside and inside double quotes
    ("foo\\\nbar", Some(&["foobar"])),
    ("\"foo\\\nbar\"", Some(&["foobar"])),
    // a backslash is literal inside single quotes, so it cannot escape the closing quote
    ("'baz\\$b'", Some(&["baz\\$b"])),
    ("'baz\\\''", None),
    // input ending after a backslash or inside a quotation is an error
    ("\\", None),
    ("\"\\", None),
    ("'\\", None),
    ("\"", None),
    ("'", None),
    // '#' starts a comment only at the beginning of a word
    ("foo #bar\nbaz", Some(&["foo", "baz"])),
    ("foo #bar", Some(&["foo"])),
    ("foo#bar", Some(&["foo#bar"])),
    ("foo\"#bar", None),
    // backslash sequences are kept verbatim inside single quotes
    ("'\\n'", Some(&["\\n"])),
    ("'\\\\n'", Some(&["\\\\n"])),
];
205 
/// Runs `split` over every entry of `SPLIT_TEST_ITEMS` and compares against the expected words
/// (or `None` for erroneous input).
#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        let expected: Option<Vec<String>> =
            output.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        // Name the offending input so a failure in the table can be located immediately.
        assert_eq!(split(input), expected, "unexpected split result for input {:?}", input);
    }
}
212 
/// Checks that `line_no` reflects the newlines consumed by the time the word "bar" is returned.
#[test]
fn test_lineno() {
    let mut sh = Shlex::new("\nfoo\nbar");
    let mut saw_bar = false;
    while let Some(word) = sh.next() {
        if word == "bar" {
            saw_bar = true;
            // two newlines have been read by now, and line_no starts at 1
            assert_eq!(sh.line_no, 3);
        }
    }
    // Without this check the test would pass vacuously if "bar" were never produced.
    assert!(saw_bar, "lexer never produced the word \"bar\"");
}
222 
/// Checks `quote` on a plain word, a word with a space, a lone double quote and the empty string.
#[test]
fn test_quote() {
    let cases = [
        ("foobar", "foobar"),
        ("foo bar", "\"foo bar\""),
        ("\"", "\"\\\"\""),
        ("", "\"\""),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(quote(input), expected);
    }
}
230 
/// Checks `join` on an empty list, a single empty word, plain words and a word needing quotes.
#[test]
fn test_join() {
    let cases: Vec<(Vec<&str>, &str)> = vec![
        (vec![], ""),
        (vec![""], "\"\""),
        (vec!["a", "b"], "a b"),
        (vec!["foo bar", "baz"], "\"foo bar\" baz"),
    ];
    for (words, expected) in cases {
        assert_eq!(join(words), expected);
    }
}
238