"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
# changes to tokenize more like Posix shells by Vinay Sajip, July 2016.

import os
import re
import sys
from collections import deque

from io import StringIO

__all__ = ["shlex", "split", "quote"]


class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    The lexer reads one character at a time from ``instream`` and groups
    the characters into tokens according to the character classes stored
    on the instance (``wordchars``, ``whitespace``, ``quotes``, ``escape``,
    ``commenters`` and ``punctuation_chars``).  All of these are plain
    attributes and may be reassigned after construction.
    """

    def __init__(self, instream=None, infile=None, posix=False,
                 punctuation_chars=False):
        """Initialise the lexer.

        instream -- a file-like object with read()/readline(), or a str
            (wrapped in a StringIO).  When None, sys.stdin is used and
            *infile* is ignored.
        infile -- name associated with *instream*; used by sourcehook()
            and error_leader().
        posix -- when true, quotes and escapes are processed and removed
            from tokens, and end of input is reported as None instead
            of ''.
        punctuation_chars -- False/empty for none, True for the default
            set '();<>|&', or a string of characters that are returned
            as runs of punctuation.
        """
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # Sentinel returned by get_token() once all input is exhausted.
        self.eof = None if posix else ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        # Lexer state: ' ' between tokens, 'a' inside a word, 'c' inside a
        # punctuation run, a quote or escape character while inside one,
        # and None once end of file has been seen.
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Stack of (infile, instream, lineno) for suspended input sources.
        self.filestack = deque()
        # Token that triggers inclusion of another source, or None.
        self.source = None
        if not punctuation_chars:
            punctuation_chars = ''
        elif punctuation_chars is True:
            punctuation_chars = '();<>|&'
        self.punctuation_chars = punctuation_chars
        if punctuation_chars:
            # _pushback_chars is a push back queue used by lookahead logic
            self._pushback_chars = deque()
            # these chars added because allowed in file names, args, wildcards
            self.wordchars += '~-./*?='
            # remove any punctuation chars from wordchars
            t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
            self.wordchars = self.wordchars.translate(t)

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        # Remember where we were so pop_source() can resume it.
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            self.pop_source()
            raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Read and return one raw token from the current input source.

        This ignores the token pushback stack and does not interpret
        source requests; get_token() layers both on top of it.

        Returns the token string.  In posix mode an empty, unquoted
        result is returned as None.

        Raises ValueError when input ends inside a quoted string
        ("No closing quotation") or directly after an escape character
        ("No escaped character").
        """
        quoted = False
        escapedstate = ' '
        while True:
            if self.punctuation_chars and self._pushback_chars:
                nextchar = self._pushback_chars.pop()
            else:
                nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r" % (self.state,
                                                                  nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno += 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.punctuation_chars:
                    self.token = nextchar
                    self.state = 'c'
                elif nextchar in self.quotes:
                    # Non-posix mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Any other character is a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        # Posix: the word may continue after the quote.
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape and self.state
                      in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes and
                        nextchar != self.state and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state in ('a', 'c'):
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                elif self.state == 'c':
                    if nextchar in self.punctuation_chars:
                        self.token += nextchar
                    else:
                        # End of the punctuation run; re-read this
                        # character as the start of the next token.
                        if nextchar not in self.whitespace:
                            self._pushback_chars.append(nextchar)
                        self.state = ' '
                        break
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                      or (self.whitespace_split and
                          nextchar not in self.punctuation_chars)):
                    # whitespace_split must not swallow punctuation
                    # characters, otherwise punctuation_chars would be
                    # silently ignored whenever whitespace_split is set.
                    self.token += nextchar
                else:
                    if self.punctuation_chars:
                        self._pushback_chars.append(nextchar)
                    else:
                        self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        # The stream is returned open; pop_source() closes it.
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        "Return the lexer itself; it is its own iterator."
        return self

    def __next__(self):
        "Return the next token, raising StopIteration at end of input."
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token


def split(s, comments=False, posix=True):
    """Split the string *s* using shell-like syntax.

    Tokens are separated on whitespace only.  Unless *comments* is true,
    comment parsing is disabled.  *posix* selects posix mode in the
    underlying lexer.  If *s* is None the lexer reads from sys.stdin.
    """
    lex = shlex(s, posix=posix)
    lex.whitespace_split = True
    if not comments:
        lex.commenters = ''
    return list(lex)


_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search


def quote(s):
    """Return a shell-escaped version of the string *s*."""
    if not s:
        return "''"
    if _find_unsafe(s) is None:
        return s

    # use single quotes, and put single quotes into double quotes
    # the string $'b is then quoted as '$'"'"'b'
    return "'" + s.replace("'", "'\"'\"'") + "'"


def _print_tokens(lexer):
    "Print every token from *lexer* until a falsy token is returned."
    while True:
        tt = lexer.get_token()
        if not tt:
            break
        print("Token: " + repr(tt))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        _print_tokens(shlex())
    else:
        fn = sys.argv[1]
        with open(fn) as f:
            _print_tokens(shlex(f, fn))