# -*- coding: iso-8859-1 -*-
"""A SAX2 driver for libxml2, on top of its XmlReader API.

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented character is written as an escape so that the value does not
# depend on the encoding this file happens to be stored in.
__author__ = "St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    # Python 2: turn the byte string into a unicode object.
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
else:
    StringTypes = str

from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import (
    feature_namespaces,
    feature_namespace_prefixes,
    feature_string_interning,
    feature_validation,
    feature_external_ges,
    feature_external_pes,
    property_lexical_handler,
    property_declaration_handler,
    property_dom_node,
    property_xml_string,
)

# libxml2 returns strings as UTF8
_decoder = codecs.lookup("utf8")[1]


def _d(s):
    """Return *s* as text, decoding it from UTF-8 when it is a byte string.

    ``None`` is returned unchanged.  Values that are already text are also
    returned unchanged: handing them to the UTF-8 decoder raises TypeError
    on Python 3.
    """
    if s is None or not isinstance(s, bytes):
        return s
    return _decoder(s)[0]


try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % sys.exc_info()[1])

# Node type codes returned by reader.NodeType() that parse() dispatches on.
_NODE_ELEMENT = 1
_NODE_TEXT = 3
_NODE_CDATA = 4
_NODE_ENTITY_REFERENCE = 5
_NODE_ENTITY = 6
_NODE_PROCESSING_INSTRUCTION = 7
_NODE_COMMENT = 8
_NODE_DOCUMENT_TYPE = 10
_NODE_NOTATION = 12
_NODE_WHITESPACE = 13
_NODE_SIGNIFICANT_WHITESPACE = 14
_NODE_END_ELEMENT = 15
_NODE_END_ENTITY = 16
_NODE_XML_DECLARATION = 17


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # This adapter has no column information to offer.
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives the libxml2 text reader."""

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Reader error callback: queue the error until the next report.

        Each entry pairs the libxml2 severity with a SAXParseException
        built from *msg* and the position given by *locator*.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued errors to the registered ErrorHandler.

        Warnings go to ``warning()``, everything else to ``error()``.  When
        *fatal* is true the parse is about to stop, and the last queued
        error is reported through ``fatalError()`` instead.
        """
        # Empty the queue before dispatching: an error handler may raise,
        # and the entries must not be reported again by a later parse().
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and send the resulting events to the handlers.

        *source* is either a string, which is passed to libxml2 as a file
        name, or anything accepted by ``saxutils.prepare_input_source``.
        The reader is closed and the parsing flag is reset even when a
        handler raises.
        """
        self.__parsing = 1
        # start from a clean slate, whatever happened in an earlier parse
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # this local keeps the buffer referenced while parse() runs
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            try:
                reader.SetErrorHandler(self._errorHandler, None)
                self._configureReader(reader)
                self._readDocument(reader)
            finally:
                reader.Close()
        finally:
            self.__parsing = 0

    def _configureReader(self, reader):
        """Apply the current feature settings to a freshly created reader."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _readDocument(self, reader):
        """Pull nodes from *reader* and turn each one into SAX events."""
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributes = xmlreader.AttributesNSImpl({}, {})
        else:
            attributes = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping)
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while 1:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                break  # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                break  # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            if nodeType == _NODE_ELEMENT:
                if self.__ns:
                    self._startElementNS(reader, attributes, prefixes)
                else:
                    self._startElement(reader, attributes)
            elif nodeType == _NODE_END_ELEMENT:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            elif nodeType == _NODE_TEXT:
                self._cont_handler.characters(_d(reader.Value()))
            elif nodeType == _NODE_WHITESPACE:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            elif nodeType == _NODE_SIGNIFICANT_WHITESPACE:
                self._cont_handler.characters(_d(reader.Value()))
            elif nodeType == _NODE_CDATA:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            elif nodeType == _NODE_ENTITY_REFERENCE:
                self._entityEvent("startEntity", _d(reader.Name()))
                # NOTE(review): see CAVEATS in the module docstring about
                # the availability of ResolveEntity in the bindings.
                reader.ResolveEntity()
            elif nodeType == _NODE_END_ENTITY:
                self._entityEvent("endEntity", _d(reader.Name()))
            elif nodeType == _NODE_PROCESSING_INSTRUCTION:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            elif nodeType == _NODE_COMMENT:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            elif nodeType == _NODE_DOCUMENT_TYPE:
                #if not self.__lex_handler is None:
                #    self.__lex_handler.startDTD()
                pass  # TODO (how to detect endDTD? on first non-dtd event?)
            elif nodeType == _NODE_XML_DECLARATION:
                pass  # TODO
            elif nodeType == _NODE_ENTITY:
                pass  # TODO (entity decl)
            elif nodeType == _NODE_NOTATION:
                pass  # TODO
            # Not handled on purpose:
            #   2  Attribute (never in this loop)
            #   9  Document (not exposed)
            #   11 DocumentFragment (never returned by XmlReader)
            #   0  None
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()

    def _startElementNS(self, reader, attributes, prefixes):
        """Report the current element in namespace mode.

        Namespace declarations are reported through startPrefixMapping and
        are only kept as attributes when the namespace-prefixes feature is
        on.  For a non-empty element the declared prefixes are pushed on
        *prefixes* so the matching end tag can unmap them.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            # Only "xmlns" and "xmlns:prefix" declare a namespace; a plain
            # attribute whose name merely begins with "xmlns" does not.
            if qname == "xmlns" or qname.startswith("xmlns:"):
                if qname == "xmlns":
                    newPrefix = None
                else:
                    newPrefix = qname[6:]
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue  # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no separate end node follows, so close the element here
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributes):
        """Report the current element in non-namespace mode."""
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no separate end node follows, so close the element here
            self._cont_handler.endElement(eltName)

    def _entityEvent(self, event, name):
        """Call the lexical handler's *event* method with *name*, if any.

        Nothing happens when no lexical handler is set or when the handler
        does not implement the requested callback.
        """
        handler = self.__lex_handler
        if handler is None:
            return
        callback = getattr(handler, event, None)
        if callback is not None:
            callback(name)

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException and any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store value in self.__decl_handler instead of raising
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)


def create_parser():
    """Return a new LibXml2Reader (entry point used by xml.sax.make_parser)."""
    return LibXml2Reader()