1 * Summary: interface for an HTML 4.0 non-verifying parser 2 * Description: this module implements an HTML 4.0 non-verifying parser 3 * with an API compatible with that of the XML parser. It should 4 * be able to parse "real world" HTML, even if severely 5 * broken from a specification point of view. 6 * 7 * Copy: See Copyright for the status of this software. 8 * 9 * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A. 10 11 /if not defined(HTML_PARSER_H__) 12 /define HTML_PARSER_H__ 13 14 /include "libxmlrpg/xmlversion" 15 /include "libxmlrpg/xmlTypesC" 16 /include "libxmlrpg/parser" 17 18 /if defined(LIBXML_HTML_ENABLED) 19 20 * Most of the back-end structures from XML and HTML are shared. 21 22 d htmlParserCtxtPtr... 23 d s based(######typedef######) 24 d like(xmlParserCtxtPtr) 25 26 d htmlParserCtxt ds based(htmlParserCtxtPtr) 27 d likeds(xmlParserCtxt) 28 29 d htmlParserNodeInfoPtr... 30 d s based(######typedef######) 31 d like(xmlParserNodeInfoPtr) 32 33 d htmlParserNodeInfo... 34 d ds based(htmlParserNodeInfoPtr) 35 d likeds(xmlParserNodeInfo) 36 37 d htmlSAXHandlerPtr... 38 d s based(######typedef######) 39 d like(xmlSAXHandlerPtr) 40 41 d htmlSAXHandler ds based(htmlSAXHandlerPtr) 42 d likeds(xmlSAXHandler) 43 44 d htmlParserInputPtr... 45 d s based(######typedef######) 46 d like(xmlParserInputPtr) 47 48 d htmlParserInput... 49 d ds based(htmlParserInputPtr) 50 d likeds(xmlParserInput) 51 52 d htmlDocPtr s based(######typedef######) 53 d like(xmlDocPtr) 54 55 d htmlNodePtr s based(######typedef######) 56 d like(xmlNodePtr) 57 58 * Internal description of an HTML element, representing HTML 4.01 59 * and XHTML 1.0 (which share the same structure). 60 61 d htmlElemDescPtr... 62 d s * based(######typedef######) 63 64 d htmlElemDesc ds based(htmlElemDescPtr) 65 d align qualified 66 d name * const char * 67 d startTag like(xmlCchar) Start tag implied ? 68 d endTag like(xmlCchar) End tag implied ? 69 d saveEndTag like(xmlCchar) Save end tag ? 
70 d empty like(xmlCchar) Empty element ? 71 d depr like(xmlCchar) Deprecated element ? 72 d dtd like(xmlCchar) Loose DTD/Frameset 73 d isinline like(xmlCchar) Block 0/inline elem? 74 d desc * const char * 75 * 76 * New fields encapsulating HTML structure 77 * 78 * Bugs: 79 * This is a very limited representation. It fails to tell us when 80 * an element *requires* subelements (we only have whether they're 81 * allowed or not), and it doesn't tell us where CDATA and PCDATA 82 * are allowed. Some element relationships are not fully represented: 83 * these are flagged with the word MODIFIER 84 * 85 d subelts * const char * * 86 d defaultsubelt * const char * 87 d attrs_opt * const char * * 88 d attrs_depr * const char * * 89 d attrs_req * const char * * 90 91 * Internal description of an HTML entity. 92 93 d htmlEntityDescPtr... 94 d s * based(######typedef######) 95 96 d htmlEntityDesc... 97 d ds based(htmlEntityDescPtr) 98 d align qualified 99 d value 10u 0 Unicode char value 100 d name * const char * 101 d desc * const char * 102 103 * There are only a few public functions. 104 105 d htmlTagLookup pr extproc('htmlTagLookup') 106 d like(htmlElemDescPtr) const 107 d tag * value options(*string) const xmlChar * 108 109 d htmlEntityLookup... 110 d pr extproc('htmlEntityLookup') 111 d like(htmlEntityDescPtr) const 112 d name * value options(*string) const xmlChar * 113 114 d htmlEntityValueLookup... 115 d pr extproc('htmlEntityValueLookup') 116 d like(htmlEntityDescPtr) const 117 d value 10u 0 value 118 119 d htmlIsAutoClosed... 120 d pr 10i 0 extproc('htmlIsAutoClosed') 121 d doc value like(htmlDocPtr) 122 d elem value like(htmlNodePtr) 123 124 d htmlAutoCloseTag... 125 d pr 10i 0 extproc('htmlAutoCloseTag') 126 d doc value like(htmlDocPtr) 127 d name * value options(*string) const xmlChar * 128 d elem value like(htmlNodePtr) 129 130 d htmlParseEntityRef... 
131 d pr extproc('htmlParseEntityRef') 132 d like(htmlEntityDescPtr) const 133 d ctxt value like(htmlParserCtxtPtr) 134 d str * const xmlChar *(*) 135 136 d htmlParseCharRef... 137 d pr 10i 0 extproc('htmlParseCharRef') 138 d ctxt value like(htmlParserCtxtPtr) 139 140 d htmlParseElement... 141 d pr extproc('htmlParseElement') 142 d ctxt value like(htmlParserCtxtPtr) 143 144 d htmlNewParserCtxt... 145 d pr extproc('htmlNewParserCtxt') 146 d like(htmlParserCtxtPtr) 147 148 d htmlCreateMemoryParserCtxt... 149 d pr extproc('htmlCreateMemoryParserCtxt') 150 d like(htmlParserCtxtPtr) 151 d buffer * value options(*string) const char * 152 d size 10i 0 value 153 154 d htmlParseDocument... 155 d pr 10i 0 extproc('htmlParseDocument') 156 d ctxt value like(htmlParserCtxtPtr) 157 158 d htmlSAXParseDoc... 159 d pr extproc('htmlSAXParseDoc') 160 d like(htmlDocPtr) 161 d cur * value options(*string) xmlChar * 162 d encoding * value options(*string) const char * 163 d sax value like(htmlSAXHandlerPtr) 164 d userData * value void * 165 166 d htmlParseDoc pr extproc('htmlParseDoc') 167 d like(htmlDocPtr) 168 d cur * value options(*string) xmlChar * 169 d encoding * value options(*string) const char * 170 171 d htmlSAXParseFile... 172 d pr extproc('htmlSAXParseFile') 173 d like(htmlDocPtr) 174 d filename * value options(*string) const char * 175 d encoding * value options(*string) const char * 176 d sax value like(htmlSAXHandlerPtr) 177 d userData * value void * 178 179 d htmlParseFile pr extproc('htmlParseFile') 180 d like(htmlDocPtr) 181 d filename * value options(*string) const char * 182 d encoding * value options(*string) const char * 183 184 d UTF8ToHtml pr 10i 0 extproc('UTF8ToHtml') 185 d out 65535 options(*varsize) unsigned char [] 186 d outlen 10i 0 187 d in * value options(*string) const unsigned char* 188 d inlen 10i 0 189 190 d htmlEncodeEntities... 
191 d pr 10i 0 extproc('htmlEncodeEntities') 192 d out 65535 options(*varsize) unsigned char [] 193 d outlen 10i 0 194 d in * value options(*string) const unsigned char* 195 d inlen 10i 0 196 d quoteChar 10i 0 value 197 198 d htmlIsScriptAttribute... 199 d pr 10i 0 extproc('htmlIsScriptAttribute') 200 d name * value options(*string) const xmlChar * 201 202 d htmlHandleOmittedElem... 203 d pr 10i 0 extproc('htmlHandleOmittedElem') 204 d val 10i 0 value 205 206 /if defined(LIBXML_PUSH_ENABLED) 207 208 * Interfaces for the Push mode. 209 210 d htmlCreatePushParserCtxt... 211 d pr extproc('htmlCreatePushParserCtxt') 212 d like(htmlParserCtxtPtr) 213 d sax value like(htmlSAXHandlerPtr) 214 d user_data * value void * 215 d chunk * value options(*string) const char * 216 d size 10i 0 value 217 d filename * value options(*string) const char * 218 d enc value like(xmlCharEncoding) 219 220 d htmlParseChunk pr 10i 0 extproc('htmlParseChunk') 221 d ctxt value like(htmlParserCtxtPtr) 222 d chunk * value options(*string) const char * 223 d size 10i 0 value 224 d terminate 10i 0 value 225 /endif LIBXML_PUSH_ENABLED 226 227 d htmlFreeParserCtxt... 228 d pr extproc('htmlFreeParserCtxt') 229 d ctxt value like(htmlParserCtxtPtr) 230 231 * New set of simpler/more flexible APIs 232 233 * htmlParserOption: 234 * 235 * This is the set of HTML parser options that can be passed down 236 * to the htmlReadDoc() and similar calls. 237 238 d htmlParserOption... 239 d s 10i 0 based(######typedef######) enum 240 d HTML_PARSE_RECOVER... Relaxed parsing 241 d c X'00000001' 242 d HTML_PARSE_NODEFDTD... No default doctype 243 d c X'00000004' 244 d HTML_PARSE_NOERROR... No error reports 245 d c X'00000020' 246 d HTML_PARSE_NOWARNING... No warning reports 247 d c X'00000040' 248 d HTML_PARSE_PEDANTIC... Pedantic err reports 249 d c X'00000080' 250 d HTML_PARSE_NOBLANKS... Remove blank nodes 251 d c X'00000100' 252 d HTML_PARSE_NONET... Forbid net access 253 d c X'00000800' 254 d HTML_PARSE_NOIMPLIED... 
No implied html/body 255 d c X'00002000' 256 d HTML_PARSE_COMPACT... compact small txtnod 257 d c X'00010000' 258 d HTML_PARSE_IGNORE_ENC... Ignore encoding hint 259 d c X'00200000' 260 261 d htmlCtxtReset pr extproc('htmlCtxtReset') 262 d ctxt value like(htmlParserCtxtPtr) 263 264 d htmlCtxtUseOptions... 265 d pr 10i 0 extproc('htmlCtxtUseOptions') 266 d ctxt value like(htmlParserCtxtPtr) 267 d options 10i 0 value 268 269 d htmlReadDoc pr extproc('htmlReadDoc') 270 d like(htmlDocPtr) 271 d cur * value options(*string) const xmlChar * 272 d URL * value options(*string) const char * 273 d encoding * value options(*string) const char * 274 d options 10i 0 value 275 276 d htmlReadFile pr extproc('htmlReadFile') 277 d like(htmlDocPtr) 278 d URL * value options(*string) const char * 279 d encoding * value options(*string) const char * 280 d options 10i 0 value 281 282 d htmlReadMemory pr extproc('htmlReadMemory') 283 d like(htmlDocPtr) 284 d buffer * value options(*string) const char * 285 d size 10i 0 value 286 d URL * value options(*string) const char * 287 d encoding * value options(*string) const char * 288 d options 10i 0 value 289 290 d htmlReadFd pr extproc('htmlReadFd') 291 d like(htmlDocPtr) 292 d fd 10i 0 value 293 d URL * value options(*string) const char * 294 d encoding * value options(*string) const char * 295 d options 10i 0 value 296 297 d htmlReadIO pr extproc('htmlReadIO') 298 d like(htmlDocPtr) 299 d ioread value like(xmlInputReadCallback) 300 d ioclose value like(xmlInputCloseCallback) 301 d ioctx * value void * 302 d URL * value options(*string) const char * 303 d encoding * value options(*string) const char * 304 d options 10i 0 value 305 306 d htmlCtxtReadDoc... 
307 d pr extproc('htmlCtxtReadDoc') 308 d like(htmlDocPtr) 309 d ctxt value like(xmlParserCtxtPtr) 310 d cur * value options(*string) const xmlChar * 311 d URL * value options(*string) const char * 312 d encoding * value options(*string) const char * 313 d options 10i 0 value 314 315 d htmlCtxtReadFile... 316 d pr extproc('htmlCtxtReadFile') 317 d like(htmlDocPtr) 318 d ctxt value like(xmlParserCtxtPtr) 319 d filename * value options(*string) const char * 320 d encoding * value options(*string) const char * 321 d options 10i 0 value 322 323 d htmlCtxtReadMemory... 324 d pr extproc('htmlCtxtReadMemory') 325 d like(htmlDocPtr) 326 d ctxt value like(xmlParserCtxtPtr) 327 d buffer * value options(*string) const char * 328 d size 10i 0 value 329 d URL * value options(*string) const char * 330 d encoding * value options(*string) const char * 331 d options 10i 0 value 332 333 d htmlCtxtReadFd pr extproc('htmlCtxtReadFd') 334 d like(htmlDocPtr) 335 d ctxt value like(xmlParserCtxtPtr) 336 d fd 10i 0 value 337 d URL * value options(*string) const char * 338 d encoding * value options(*string) const char * 339 d options 10i 0 value 340 341 d htmlCtxtReadIO pr extproc('htmlCtxtReadIO') 342 d like(htmlDocPtr) 343 d ctxt value like(xmlParserCtxtPtr) 344 d ioread value like(xmlInputReadCallback) 345 d ioclose value like(xmlInputCloseCallback) 346 d ioctx * value void * 347 d URL * value options(*string) const char * 348 d encoding * value options(*string) const char * 349 d options 10i 0 value 350 351 * Further knowledge of HTML structure 352 353 d htmlStatus s 10i 0 based(######typedef######) enum 354 d HTML_NA c X'0000' No check at all 355 d HTML_INVALID c X'0001' 356 d HTML_DEPRECATED... 357 d c X'0002' 358 d HTML_VALID c X'0004' 359 d HTML_REQUIRED c X'000C' HTML_VALID ored-in 360 361 * Using htmlElemDesc rather than name here, to emphasise the fact 362 * that otherwise there's a lookup overhead 363 364 d htmlAttrAllowed... 
365 d pr extproc('htmlAttrAllowed') 366 d like(htmlStatus) 367 d #param1 value like(htmlElemDescPtr) const 368 d #param2 * value options(*string) const xmlChar * 369 d #param3 10i 0 value 370 371 d htmlElementAllowedHere... 372 d pr 10i 0 extproc('htmlElementAllowedHere') 373 d #param1 value like(htmlElemDescPtr) const 374 d #param2 * value options(*string) const xmlChar * 375 376 d htmlElementStatusHere... 377 d pr extproc('htmlElementStatusHere') 378 d like(htmlStatus) 379 d #param1 value like(htmlElemDescPtr) const 380 d #param2 value like(htmlElemDescPtr) const 381 382 d htmlNodeStatus pr extproc('htmlNodeStatus') 383 d like(htmlStatus) 384 d #param1 value like(htmlNodePtr) 385 d #param2 10i 0 value 386 387 * C macros implemented as procedures for ILE/RPG support. 388 389 d htmlDefaultSubelement... 390 d pr * extproc('__htmlDefaultSubelement') const char * 391 d elt * value const htmlElemDesc * 392 393 d htmlElementAllowedHereDesc... 394 d pr 10i 0 extproc( 395 d '__htmlElementAllowedHereDesc') 396 d parent * value const htmlElemDesc * 397 d elt * value const htmlElemDesc * 398 399 d htmlRequiredAttrs... 400 d pr * extproc('__htmlRequiredAttrs') const char * * 401 d elt * value const htmlElemDesc * 402 403 /endif LIBXML_HTML_ENABLED 404 /endif HTML_PARSER_H__ 405