# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission notice
# appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

"""Tokenize DNS master file format"""

import cStringIO
import sys

import dns.exception
import dns.name
import dns.ttl

# Characters that end an identifier when the tokenizer is not inside a
# quoted string.
_DELIMITERS = {
    ' ' : True,
    '\t' : True,
    '\n' : True,
    ';' : True,
    '(' : True,
    ')' : True,
    '"' : True }

# Inside a quoted string only the closing quote is a delimiter.
_QUOTING_DELIMITERS = { '"' : True }

# Token types
EOF = 0
EOL = 1
WHITESPACE = 2
IDENTIFIER = 3
QUOTED_STRING = 4
COMMENT = 5
DELIMITER = 6

class UngetBufferFull(dns.exception.DNSException):
    """Raised when an attempt is made to unget a token when the unget
    buffer is full."""
    pass

def _decimal_escape_to_char(d1, d2, d3):
    """Convert the three characters of a \\DDD escape into one character.

    The caller has already established that I{d1} is a digit.

    @param d1: the first character after the backslash (a digit)
    @type d1: string
    @param d2: the second character after the backslash
    @type d2: string
    @param d3: the third character after the backslash
    @type d3: string
    @raises dns.exception.SyntaxError: I{d2} or I{d3} is not a digit, or
    the decimal value is greater than 255
    @rtype: string
    """

    if not (d2.isdigit() and d3.isdigit()):
        raise dns.exception.SyntaxError
    codepoint = int(d1) * 100 + int(d2) * 10 + int(d3)
    # Reject values that do not fit in one octet here, so that the caller
    # sees a syntax error rather than an error from chr().
    if codepoint > 255:
        raise dns.exception.SyntaxError('escaped value %d is greater than 255' % codepoint)
    return chr(codepoint)

class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        return self.ttype == EOF

    def is_eol(self):
        return self.ttype == EOL

    def is_whitespace(self):
        return self.ttype == WHITESPACE

    def is_identifier(self):
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        return self.ttype == COMMENT

    def is_delimiter(self):
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Only the type and value take part in the comparison; has_escape
        # is ignored.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that decimal value; a backslash followed by any
        other character is replaced by that character.  If the token has
        no escapes, the token itself is returned.

        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape is malformed or
        its value is greater than 255
        @rtype: Token object
        """

        if not self.has_escape:
            return self
        value = self.value
        length = len(value)
        pieces = []
        i = 0
        while i < length:
            c = value[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = value[i]
                i += 1
                if c.isdigit():
                    # Two more characters are required to complete \DDD.
                    if i + 1 >= length:
                        raise dns.exception.UnexpectedEnd
                    c = _decimal_escape_to_char(c, value[i], value[i + 1])
                    i += 2
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError

class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are L{Token} objects.  For compatibility with older code a
    token can also be used like a (type, value) tuple, where I{type} is
    an int, and I{value} is a string.  The valid types are EOF, EOL,
    WHITESPACE, IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  If None, '<string>', '<stdin>' or '<file>' is used
        depending on what I{f} is.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  The empty
        string is returned at end of input.

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            # A re-read character was already counted when first read, so
            # the line number is not touched here.
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading = False, want_comment = False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is silently dropped
            # unless the caller asked for that kind of token.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of the line or input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        c = _decimal_escape_to_char(c, c2, c3)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: Token object
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        Only unsigned decimal digit strings are accepted.

        @raises dns.exception.SyntaxError: the token is not an identifier
        or is not made up entirely of digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0 to 255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0 to 65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0 to 4294967295
        @rtype: long
        """

        value = long(self.get_int())
        # The largest 32-bit unsigned value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: unused by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: unused by this method
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token text is passed to dns.name.from_text without being
        unescaped first.

        @param origin: passed through to dns.name.from_text
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        @raises dns.exception.SyntaxError: the token is not an identifier
        @return: the result of dns.ttl.from_text applied to the token text
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)