1      * Summary: interface for an HTML 4.0 non-verifying parser
2      * Description: this module implements an HTML 4.0 non-verifying parser
3      *              with an API compatible with the XML parser one. It should
4      *              be able to parse "real world" HTML, even if severely
5      *              broken from a specification point of view.
6      *
7      * Copy: See Copyright for the status of this software.
8      *
9      * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
10
11      /if not defined(HTML_PARSER_H__)
12      /define HTML_PARSER_H__
13
14      /include "libxmlrpg/xmlversion"
15      /include "libxmlrpg/xmlTypesC"
16      /include "libxmlrpg/parser"
17
18      /if defined(LIBXML_HTML_ENABLED)
19
20      * Most of the back-end structures from XML and HTML are shared.
21
22     d htmlParserCtxtPtr...
23     d                 s                   based(######typedef######)           typedef
24     d                                     like(xmlParserCtxtPtr)               Same as XML type
25
26     d htmlParserCtxt  ds                  based(htmlParserCtxtPtr)
27     d                                     likeds(xmlParserCtxt)                Same layout as XML
28
29     d htmlParserNodeInfoPtr...
30     d                 s                   based(######typedef######)           typedef
31     d                                     like(xmlParserNodeInfoPtr)           Same as XML type
32
33     d htmlParserNodeInfo...
34     d                 ds                  based(htmlParserNodeInfoPtr)
35     d                                     likeds(xmlParserNodeInfo)            Same layout as XML
36
37     d htmlSAXHandlerPtr...
38     d                 s                   based(######typedef######)           typedef
39     d                                     like(xmlSAXHandlerPtr)               Same as XML type
40
41     d htmlSAXHandler  ds                  based(htmlSAXHandlerPtr)
42     d                                     likeds(xmlSAXHandler)                Same layout as XML
43
44     d htmlParserInputPtr...
45     d                 s                   based(######typedef######)           typedef
46     d                                     like(xmlParserInputPtr)              Same as XML type
47
48     d htmlParserInput...
49     d                 ds                  based(htmlParserInputPtr)
50     d                                     likeds(xmlParserInput)               Same layout as XML
51
52     d htmlDocPtr      s                   based(######typedef######)           typedef
53     d                                     like(xmlDocPtr)                      Same as XML type
54
55     d htmlNodePtr     s                   based(######typedef######)           typedef
56     d                                     like(xmlNodePtr)                     Same as XML type
57
58      * Internal description of an HTML element, representing HTML 4.01
59      * and XHTML 1.0 (which share the same structure).
60
61     d htmlElemDescPtr...
62     d                 s               *   based(######typedef######)           htmlElemDesc *
63
64     d htmlElemDesc    ds                  based(htmlElemDescPtr)
65     d                                     align qualified                      Aligned subfields
66     d  name                           *                                        const char *
67     d  startTag                           like(xmlCchar)                       Start tag implied ?
68     d  endTag                             like(xmlCchar)                       End tag implied ?
69     d  saveEndTag                         like(xmlCchar)                       Save end tag ?
70     d  empty                              like(xmlCchar)                       Empty element ?
71     d  depr                               like(xmlCchar)                       Deprecated element ?
72     d  dtd                                like(xmlCchar)                       Loose DTD/Frameset
73     d  isinline                           like(xmlCchar)                       Block 0/inline elem?
74     d  desc                           *                                        const char *
75      *
76      * New fields encapsulating HTML structure
77      *
78      * Bugs:
79      *      This is a very limited representation.  It fails to tell us when
80      *      an element *requires* subelements (we only have whether they're
81      *      allowed or not), and it doesn't tell us where CDATA and PCDATA
82      *      are allowed.  Some element relationships are not fully represented:
83      *      these are flagged with the word MODIFIER.
84      *
85     d  subelts                        *                                        const char * *
86     d  defaultsubelt                  *                                        const char *
87     d  attrs_opt                      *                                        const char * *
88     d  attrs_depr                     *                                        const char * *
89     d  attrs_req                      *                                        const char * *
90
91      * Internal description of an HTML entity.
92
93     d htmlEntityDescPtr...
94     d                 s               *   based(######typedef######)           htmlEntityDesc *
95
96     d htmlEntityDesc...
97     d                 ds                  based(htmlEntityDescPtr)
98     d                                     align qualified                      Aligned subfields
99     d  value                        10u 0                                      Unicode char value
100     d  name                           *                                        const char *
101     d  desc                           *                                        const char *
102
103      * There are only a few public functions.
104
105     d htmlTagLookup   pr                  extproc('htmlTagLookup')
106     d                                     like(htmlElemDescPtr)                const htmlElemDesc *
107     d  tag                            *   value options(*string)               const xmlChar *
108
109     d htmlEntityLookup...
110     d                 pr                  extproc('htmlEntityLookup')
111     d                                     like(htmlEntityDescPtr)              Const entity desc *
112     d  name                           *   value options(*string)               const xmlChar *
113
114     d htmlEntityValueLookup...
115     d                 pr                  extproc('htmlEntityValueLookup')
116     d                                     like(htmlEntityDescPtr)              Const entity desc *
117     d  value                        10u 0 value                                unsigned int
118
119     d htmlIsAutoClosed...
120     d                 pr            10i 0 extproc('htmlIsAutoClosed')          int
121     d  doc                                value like(htmlDocPtr)
122     d  elem                               value like(htmlNodePtr)
123
124     d htmlAutoCloseTag...
125     d                 pr            10i 0 extproc('htmlAutoCloseTag')          int
126     d  doc                                value like(htmlDocPtr)
127     d  name                           *   value options(*string)               const xmlChar *
128     d  elem                               value like(htmlNodePtr)
129
130     d htmlParseEntityRef...
131     d                 pr                  extproc('htmlParseEntityRef')
132     d                                     like(htmlEntityDescPtr)              Const entity desc *
133     d  ctxt                               value like(htmlParserCtxtPtr)
134     d  str                            *                                        const xmlChar *(*)
135
136     d htmlParseCharRef...
137     d                 pr            10i 0 extproc('htmlParseCharRef')          int
138     d  ctxt                               value like(htmlParserCtxtPtr)
139
140     d htmlParseElement...
141     d                 pr                  extproc('htmlParseElement')          void
142     d  ctxt                               value like(htmlParserCtxtPtr)
143
144     d htmlNewParserCtxt...
145     d                 pr                  extproc('htmlNewParserCtxt')         No parameters
146     d                                     like(htmlParserCtxtPtr)              Return type
147
148     d htmlCreateMemoryParserCtxt...
149     d                 pr                  extproc('htmlCreateMemoryParserCtxt')
150     d                                     like(htmlParserCtxtPtr)              Return type
151     d  buffer                         *   value options(*string)               const char *
152     d  size                         10i 0 value                                int
153
154     d htmlParseDocument...
155     d                 pr            10i 0 extproc('htmlParseDocument')         int
156     d  ctxt                               value like(htmlParserCtxtPtr)
157
158     d htmlSAXParseDoc...
159     d                 pr                  extproc('htmlSAXParseDoc')
160     d                                     like(htmlDocPtr)                     Return type
161     d  cur                            *   value options(*string)               xmlChar *
162     d  encoding                       *   value options(*string)               const char *
163     d  sax                                value like(htmlSAXHandlerPtr)        htmlSAXHandler *
164     d  userData                       *   value                                void *
165
166     d htmlParseDoc    pr                  extproc('htmlParseDoc')
167     d                                     like(htmlDocPtr)                     Return type
168     d  cur                            *   value options(*string)               xmlChar *
169     d  encoding                       *   value options(*string)               const char *
170
171     d htmlSAXParseFile...
172     d                 pr                  extproc('htmlSAXParseFile')
173     d                                     like(htmlDocPtr)                     Return type
174     d  filename                       *   value options(*string)               const char *
175     d  encoding                       *   value options(*string)               const char *
176     d  sax                                value like(htmlSAXHandlerPtr)        htmlSAXHandler *
177     d  userData                       *   value                                void *
178
179     d htmlParseFile   pr                  extproc('htmlParseFile')
180     d                                     like(htmlDocPtr)                     Return type
181     d  filename                       *   value options(*string)               const char *
182     d  encoding                       *   value options(*string)               const char *
183
184     d UTF8ToHtml      pr            10i 0 extproc('UTF8ToHtml')                int
185     d  out                       65535    options(*varsize)                    unsigned char []
186     d  outlen                       10i 0                                      int *
187     d  in                             *   value options(*string)               const unsigned char*
188     d  inlen                        10i 0                                      int *
189
190     d htmlEncodeEntities...
191     d                 pr            10i 0 extproc('htmlEncodeEntities')        int
192     d  out                       65535    options(*varsize)                    unsigned char []
193     d  outlen                       10i 0                                      int *
194     d  in                             *   value options(*string)               const unsigned char*
195     d  inlen                        10i 0                                      int *
196     d  quoteChar                    10i 0 value                                int
197
198     d htmlIsScriptAttribute...
199     d                 pr            10i 0 extproc('htmlIsScriptAttribute')     int
200     d  name                           *   value options(*string)               const xmlChar *
201
202     d htmlHandleOmittedElem...
203     d                 pr            10i 0 extproc('htmlHandleOmittedElem')     int
204     d  val                          10i 0 value                                int
205
206      /if defined(LIBXML_PUSH_ENABLED)
207
208      * Interfaces for the Push mode.
209
210     d htmlCreatePushParserCtxt...
211     d                 pr                  extproc('htmlCreatePushParserCtxt')
212     d                                     like(htmlParserCtxtPtr)              Return type
213     d  sax                                value like(htmlSAXHandlerPtr)        htmlSAXHandler *
214     d  user_data                      *   value                                void *
215     d  chunk                          *   value options(*string)               const char *
216     d  size                         10i 0 value                                int
217     d  filename                       *   value options(*string)               const char *
218     d  enc                                value like(xmlCharEncoding)
219
220     d htmlParseChunk  pr            10i 0 extproc('htmlParseChunk')            int
221     d  ctxt                               value like(htmlParserCtxtPtr)
222     d  chunk                          *   value options(*string)               const char *
223     d  size                         10i 0 value                                int
224     d  terminate                    10i 0 value                                int
225      /endif                                                                    LIBXML_PUSH_ENABLED
226
227     d htmlFreeParserCtxt...
228     d                 pr                  extproc('htmlFreeParserCtxt')        void
229     d  ctxt                               value like(htmlParserCtxtPtr)
230
231      * New set of simpler/more flexible APIs
232
233      * htmlParserOption:
234      *
235      * This is the set of HTML parser options that can be passed down
236      * to the htmlReadDoc() and similar calls.
237
238     d htmlParserOption...
239     d                 s             10i 0 based(######typedef######)           enum
240     d  HTML_PARSE_RECOVER...                                                   Relaxed parsing
241     d                 c                   X'00000001'
242     d  HTML_PARSE_NODEFDTD...                                                  No default doctype
243     d                 c                   X'00000004'
244     d  HTML_PARSE_NOERROR...                                                   No error reports
245     d                 c                   X'00000020'
246     d  HTML_PARSE_NOWARNING...                                                 No warning reports
247     d                 c                   X'00000040'
248     d  HTML_PARSE_PEDANTIC...                                                  Pedantic err reports
249     d                 c                   X'00000080'
250     d  HTML_PARSE_NOBLANKS...                                                  Remove blank nodes
251     d                 c                   X'00000100'
252     d  HTML_PARSE_NONET...                                                     Forbid net access
253     d                 c                   X'00000800'
254     d  HTML_PARSE_NOIMPLIED...                                                 No implied html/body
255     d                 c                   X'00002000'
256     d  HTML_PARSE_COMPACT...                                                   Compact text nodes
257     d                 c                   X'00010000'
258     d  HTML_PARSE_IGNORE_ENC...                                                Ignore encoding hint
259     d                 c                   X'00200000'
260
261     d htmlCtxtReset   pr                  extproc('htmlCtxtReset')             void
262     d  ctxt                               value like(htmlParserCtxtPtr)
263
264     d htmlCtxtUseOptions...
265     d                 pr            10i 0 extproc('htmlCtxtUseOptions')        int
266     d  ctxt                               value like(htmlParserCtxtPtr)
267     d  options                      10i 0 value                                int
268
269     d htmlReadDoc     pr                  extproc('htmlReadDoc')
270     d                                     like(htmlDocPtr)                     Return type
271     d  cur                            *   value options(*string)               const xmlChar *
272     d  URL                            *   value options(*string)               const char *
273     d  encoding                       *   value options(*string)               const char *
274     d  options                      10i 0 value                                int
275
276     d htmlReadFile    pr                  extproc('htmlReadFile')
277     d                                     like(htmlDocPtr)                     Return type
278     d  URL                            *   value options(*string)               const char *
279     d  encoding                       *   value options(*string)               const char *
280     d  options                      10i 0 value                                int
281
282     d htmlReadMemory  pr                  extproc('htmlReadMemory')
283     d                                     like(htmlDocPtr)                     Return type
284     d  buffer                         *   value options(*string)               const char *
285     d  size                         10i 0 value                                int
286     d  URL                            *   value options(*string)               const char *
287     d  encoding                       *   value options(*string)               const char *
288     d  options                      10i 0 value                                int
289
290     d htmlReadFd      pr                  extproc('htmlReadFd')
291     d                                     like(htmlDocPtr)                     Return type
292     d  fd                           10i 0 value                                int
293     d  URL                            *   value options(*string)               const char *
294     d  encoding                       *   value options(*string)               const char *
295     d  options                      10i 0 value                                int
296
297     d htmlReadIO      pr                  extproc('htmlReadIO')
298     d                                     like(htmlDocPtr)                     Return type
299     d  ioread                             value like(xmlInputReadCallback)
300     d  ioclose                            value like(xmlInputCloseCallback)
301     d  ioctx                          *   value                                void *
302     d  URL                            *   value options(*string)               const char *
303     d  encoding                       *   value options(*string)               const char *
304     d  options                      10i 0 value                                int
305
306     d htmlCtxtReadDoc...
307     d                 pr                  extproc('htmlCtxtReadDoc')
308     d                                     like(htmlDocPtr)                     Return type
309     d  ctxt                               value like(xmlParserCtxtPtr)
310     d  cur                            *   value options(*string)               const xmlChar *
311     d  URL                            *   value options(*string)               const char *
312     d  encoding                       *   value options(*string)               const char *
313     d  options                      10i 0 value                                int
314
315     d htmlCtxtReadFile...
316     d                 pr                  extproc('htmlCtxtReadFile')
317     d                                     like(htmlDocPtr)                     Return type
318     d  ctxt                               value like(xmlParserCtxtPtr)
319     d  filename                       *   value options(*string)               const char *
320     d  encoding                       *   value options(*string)               const char *
321     d  options                      10i 0 value                                int
322
323     d htmlCtxtReadMemory...
324     d                 pr                  extproc('htmlCtxtReadMemory')
325     d                                     like(htmlDocPtr)                     Return type
326     d  ctxt                               value like(xmlParserCtxtPtr)
327     d  buffer                         *   value options(*string)               const char *
328     d  size                         10i 0 value                                int
329     d  URL                            *   value options(*string)               const char *
330     d  encoding                       *   value options(*string)               const char *
331     d  options                      10i 0 value                                int
332
333     d htmlCtxtReadFd  pr                  extproc('htmlCtxtReadFd')
334     d                                     like(htmlDocPtr)                     Return type
335     d  ctxt                               value like(xmlParserCtxtPtr)
336     d  fd                           10i 0 value                                int
337     d  URL                            *   value options(*string)               const char *
338     d  encoding                       *   value options(*string)               const char *
339     d  options                      10i 0 value                                int
340
341     d htmlCtxtReadIO  pr                  extproc('htmlCtxtReadIO')
342     d                                     like(htmlDocPtr)                     Return type
343     d  ctxt                               value like(xmlParserCtxtPtr)
344     d  ioread                             value like(xmlInputReadCallback)
345     d  ioclose                            value like(xmlInputCloseCallback)
346     d  ioctx                          *   value                                void *
347     d  URL                            *   value options(*string)               const char *
348     d  encoding                       *   value options(*string)               const char *
349     d  options                      10i 0 value                                int
350
351      * Further knowledge of HTML structure
352
353     d htmlStatus      s             10i 0 based(######typedef######)           enum
354     d  HTML_NA        c                   X'0000'                              No check at all
355     d  HTML_INVALID   c                   X'0001'
356     d  HTML_DEPRECATED...
357     d                 c                   X'0002'
358     d  HTML_VALID     c                   X'0004'
359     d  HTML_REQUIRED  c                   X'000C'                              HTML_VALID | X'0008'
360
361      * Using htmlElemDesc rather than name here, to emphasise the fact
362      *  that otherwise there's a lookup overhead
363
364     d htmlAttrAllowed...
365     d                 pr                  extproc('htmlAttrAllowed')
366     d                                     like(htmlStatus)                     Return type
367     d  #param1                            value like(htmlElemDescPtr)          const
368     d  #param2                        *   value options(*string)               const xmlChar *
369     d  #param3                      10i 0 value                                int
370
371     d htmlElementAllowedHere...
372     d                 pr            10i 0 extproc('htmlElementAllowedHere')    int
373     d  #param1                            value like(htmlElemDescPtr)          const
374     d  #param2                        *   value options(*string)               const xmlChar *
375
376     d htmlElementStatusHere...
377     d                 pr                  extproc('htmlElementStatusHere')
378     d                                     like(htmlStatus)                     Return type
379     d  #param1                            value like(htmlElemDescPtr)          const
380     d  #param2                            value like(htmlElemDescPtr)          const
381
382     d htmlNodeStatus  pr                  extproc('htmlNodeStatus')
383     d                                     like(htmlStatus)                     Return type
384     d  #param1                            value like(htmlNodePtr)
385     d  #param2                      10i 0 value                                int
386
387      * C macros implemented as procedures for ILE/RPG support.
388
389     d htmlDefaultSubelement...                                                 Wraps C macro
390     d                 pr              *   extproc('__htmlDefaultSubelement')   const char *
391     d  elt                            *   value                                const htmlElemDesc *
392
393     d htmlElementAllowedHereDesc...                                            Wraps C macro
394     d                 pr            10i 0 extproc(
395     d                                     '__htmlElementAllowedHereDesc')
396     d  parent                         *   value                                const htmlElemDesc *
397     d  elt                            *   value                                const htmlElemDesc *
398
399     d htmlRequiredAttrs...                                                     Wraps C macro
400     d                 pr              *   extproc('__htmlRequiredAttrs')        const char * *
401     d  elt                            *   value                                const htmlElemDesc *
402
403      /endif                                                                    LIBXML_HTML_ENABLED
404      /endif                                                                    HTML_PARSER_H__
405