1      * Summary: interface for an HTML 4.0 non-verifying parser
2      * Description: this module implements an HTML 4.0 non-verifying parser
3      *              with API compatible with the XML parser ones. It should
4      *              be able to parse "real world" HTML, even if severely
5      *              broken from a specification point of view.
6      *
7      * Copy: See Copyright for the status of this software.
8      *
9      * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
10
11      /if not defined(HTML_PARSER_H__)
12      /define HTML_PARSER_H__
13
14      /include "libxmlrpg/xmlversion"
15
16      /if defined(LIBXML_HTML_ENABLED)
17
18      /include "libxmlrpg/xmlTypesC"
19      /include "libxmlrpg/parser"
20
21      * Most of the back-end structures from XML and HTML are shared.
22
      * htmlParserCtxtPtr: type template declared like xmlParserCtxtPtr.
      * htmlParserCtxt: data structure based on htmlParserCtxtPtr whose
      * layout is taken from xmlParserCtxt via likeds().
      * NOTE(review): based(######typedef######) appears to be the
      * convention for type-only declarations - confirm in xmlTypesC.
23     d htmlParserCtxtPtr...
24     d                 s                   based(######typedef######)
25     d                                     like(xmlParserCtxtPtr)
26
27     d htmlParserCtxt  ds                  based(htmlParserCtxtPtr)
28     d                                     likeds(xmlParserCtxt)
29
      * htmlParserNodeInfoPtr: type template declared like
      * xmlParserNodeInfoPtr.
      * htmlParserNodeInfo: data structure based on that pointer, with the
      * layout of xmlParserNodeInfo (likeds).
30     d htmlParserNodeInfoPtr...
31     d                 s                   based(######typedef######)
32     d                                     like(xmlParserNodeInfoPtr)
33
34     d htmlParserNodeInfo...
35     d                 ds                  based(htmlParserNodeInfoPtr)
36     d                                     likeds(xmlParserNodeInfo)
37
      * htmlSAXHandlerPtr: type template declared like xmlSAXHandlerPtr.
      * htmlSAXHandler: data structure based on htmlSAXHandlerPtr, with the
      * layout of xmlSAXHandler (likeds).
38     d htmlSAXHandlerPtr...
39     d                 s                   based(######typedef######)
40     d                                     like(xmlSAXHandlerPtr)
41
42     d htmlSAXHandler  ds                  based(htmlSAXHandlerPtr)
43     d                                     likeds(xmlSAXHandler)
44
      * htmlParserInputPtr: type template declared like xmlParserInputPtr.
      * htmlParserInput: data structure based on htmlParserInputPtr, with
      * the layout of xmlParserInput (likeds).
45     d htmlParserInputPtr...
46     d                 s                   based(######typedef######)
47     d                                     like(xmlParserInputPtr)
48
49     d htmlParserInput...
50     d                 ds                  based(htmlParserInputPtr)
51     d                                     likeds(xmlParserInput)
52
      * htmlDocPtr: type template declared like xmlDocPtr; it is used in
      * this file with like() for document parameters and return values.
53     d htmlDocPtr      s                   based(######typedef######)
54     d                                     like(xmlDocPtr)
55
      * htmlNodePtr: type template declared like xmlNodePtr; it is used in
      * this file with like() for node parameters.
56     d htmlNodePtr     s                   based(######typedef######)
57     d                                     like(xmlNodePtr)
58
59      * Internal description of an HTML element, representing HTML 4.01
60      * and XHTML 1.0 (which share the same structure).
61
      * htmlElemDescPtr: pointer type template for element descriptors.
      * htmlElemDesc: qualified, aligned data structure based on
      * htmlElemDescPtr; subfields are referenced as htmlElemDesc.name etc.
      * It holds two string pointers (name, desc), seven flag subfields
      * typed like xmlCchar, and five further pointer subfields. The
      * trailing comment on each subfield gives the corresponding C type
      * or meaning.
62     d htmlElemDescPtr...
63     d                 s               *   based(######typedef######)
64
65     d htmlElemDesc    ds                  based(htmlElemDescPtr)
66     d                                     align qualified
67     d  name                           *                                        const char *
68     d  startTag                           like(xmlCchar)                       Start tag implied ?
69     d  endTag                             like(xmlCchar)                       End tag implied ?
70     d  saveEndTag                         like(xmlCchar)                       Save end tag ?
71     d  empty                              like(xmlCchar)                       Empty element ?
72     d  depr                               like(xmlCchar)                       Deprecated element ?
73     d  dtd                                like(xmlCchar)                       Loose DTD/Frameset
74     d  isinline                           like(xmlCchar)                       Block 0/inline elem?
75     d  desc                           *                                        const char *
76      *
77      * New fields encapsulating HTML structure
78      *
79      * Bugs:
80      *      This is a very limited representation.  It fails to tell us when
81      *      an element *requires* subelements (we only have whether they're
82      *      allowed or not), and it doesn't tell us where CDATA and PCDATA
83      *      are allowed.  Some element relationships are not fully represented:
84      *      these are flagged with the word MODIFIER
85      *
86     d  subelts                        *                                        const char * *
87     d  defaultsubelt                  *                                        const char *
88     d  attrs_opt                      *                                        const char * *
89     d  attrs_depr                     *                                        const char * *
90     d  attrs_req                      *                                        const char * *
91
92      * Internal description of an HTML entity.
93
      * htmlEntityDescPtr: pointer type template for entity descriptors.
      * htmlEntityDesc: qualified, aligned data structure based on
      * htmlEntityDescPtr, holding a value typed like xmlCuint and two
      * string pointers (name, desc).
94     d htmlEntityDescPtr...
95     d                 s               *   based(######typedef######)
96
97     d htmlEntityDesc...
98     d                 ds                  based(htmlEntityDescPtr)
99     d                                     align qualified
100     d  value                              like(xmlCuint)
101     d  name                           *                                        const char *
102     d  desc                           *                                        const char *
103
104      * There are only a few public functions.
105
      * Prototype for the C procedure 'htmlTagLookup'.
      * Returns a pointer typed like htmlElemDescPtr (const in C).
      * tag: passed by value; options(*string) also accepts a character
      *      expression, which is passed as a null-terminated string.
106     d htmlTagLookup   pr                  extproc('htmlTagLookup')
107     d                                     like(htmlElemDescPtr)                const
108     d  tag                            *   value options(*string)               const xmlChar *
109
      * Prototype for the C procedure 'htmlEntityLookup'.
      * Returns a pointer typed like htmlEntityDescPtr (const in C).
      * name: by-value string pointer; options(*string) allows a character
      *       expression to be supplied instead of a pointer.
110     d htmlEntityLookup...
111     d                 pr                  extproc('htmlEntityLookup')
112     d                                     like(htmlEntityDescPtr)              const
113     d  name                           *   value options(*string)               const xmlChar *
114
      * Prototype for the C procedure 'htmlEntityValueLookup'.
      * Returns a pointer typed like htmlEntityDescPtr (const in C).
      * value: passed by value, typed like xmlCuint (the same type as the
      *        value subfield of htmlEntityDesc).
115     d htmlEntityValueLookup...
116     d                 pr                  extproc('htmlEntityValueLookup')
117     d                                     like(htmlEntityDescPtr)              const
118     d  value                              value like(xmlCuint)
119
      * Prototype for the C procedure 'htmlIsAutoClosed'.
      * Returns a value typed like xmlCint.
      * doc, elem: document and node pointers, both passed by value.
120     d htmlIsAutoClosed...
121     d                 pr                  extproc('htmlIsAutoClosed')
122     d                                     like(xmlCint)
123     d  doc                                value like(htmlDocPtr)
124     d  elem                               value like(htmlNodePtr)
125
      * Prototype for the C procedure 'htmlAutoCloseTag'.
      * Returns a value typed like xmlCint.
      * doc, elem: document and node pointers, passed by value.
      * name: by-value string pointer; options(*string) allows a character
      *       expression, passed as a null-terminated string.
126     d htmlAutoCloseTag...
127     d                 pr                  extproc('htmlAutoCloseTag')
128     d                                     like(xmlCint)
129     d  doc                                value like(htmlDocPtr)
130     d  name                           *   value options(*string)               const xmlChar *
131     d  elem                               value like(htmlNodePtr)
132
      * Prototype for the C procedure 'htmlParseEntityRef'.
      * Returns a pointer typed like htmlEntityDescPtr (const in C).
      * ctxt: parser context pointer, passed by value.
      * str: pointer passed by reference (no VALUE keyword), matching the
      *      C pointer-to-pointer noted in the trailing comment, so the
      *      callee can store a pointer into the caller's variable.
133     d htmlParseEntityRef...
134     d                 pr                  extproc('htmlParseEntityRef')
135     d                                     like(htmlEntityDescPtr)              const
136     d  ctxt                               value like(htmlParserCtxtPtr)
137     d  str                            *                                        const xmlChar *(*)
138
      * Prototype for the C procedure 'htmlParseCharRef'.
      * Returns a value typed like xmlCint.
      * ctxt: parser context pointer, passed by value.
139     d htmlParseCharRef...
140     d                 pr                  extproc('htmlParseCharRef')
141     d                                     like(xmlCint)
142     d  ctxt                               value like(htmlParserCtxtPtr)
143
      * Prototype for the C procedure 'htmlParseElement'.
      * No return value is declared.
      * ctxt: parser context pointer, passed by value.
144     d htmlParseElement...
145     d                 pr                  extproc('htmlParseElement')
146     d  ctxt                               value like(htmlParserCtxtPtr)
147
      * Prototype for the C procedure 'htmlNewParserCtxt'.
      * Takes no parameters and returns a pointer typed like
      * htmlParserCtxtPtr.
148     d htmlNewParserCtxt...
149     d                 pr                  extproc('htmlNewParserCtxt')
150     d                                     like(htmlParserCtxtPtr)
151
      * Prototype for the C procedure 'htmlCreateMemoryParserCtxt'.
      * Returns a pointer typed like htmlParserCtxtPtr.
      * buffer: by-value pointer; options(*string) allows a character
      *         expression to be supplied instead of a pointer.
      * size: passed by value, typed like xmlCint.
152     d htmlCreateMemoryParserCtxt...
153     d                 pr                  extproc('htmlCreateMemoryParserCtxt')
154     d                                     like(htmlParserCtxtPtr)
155     d  buffer                         *   value options(*string)               const char *
156     d  size                               value like(xmlCint)
157
      * Prototype for the C procedure 'htmlParseDocument'.
      * Returns a value typed like xmlCint.
      * ctxt: parser context pointer, passed by value.
158     d htmlParseDocument...
159     d                 pr                  extproc('htmlParseDocument')
160     d                                     like(xmlCint)
161     d  ctxt                               value like(htmlParserCtxtPtr)
162
      * Prototype for the C procedure 'htmlSAXParseDoc'.
      * Returns a document pointer typed like htmlDocPtr.
      * cur, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
      * sax: SAX handler pointer, passed by value.
      * userData: untyped pointer (void * in C), passed by value.
163     d htmlSAXParseDoc...
164     d                 pr                  extproc('htmlSAXParseDoc')
165     d                                     like(htmlDocPtr)
166     d  cur                            *   value options(*string)               xmlChar *
167     d  encoding                       *   value options(*string)               const char *
168     d  sax                                value like(htmlSAXHandlerPtr)
169     d  userData                       *   value                                void *
170
      * Prototype for the C procedure 'htmlParseDoc'.
      * Returns a document pointer typed like htmlDocPtr.
      * cur, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
171     d htmlParseDoc    pr                  extproc('htmlParseDoc')
172     d                                     like(htmlDocPtr)
173     d  cur                            *   value options(*string)               xmlChar *
174     d  encoding                       *   value options(*string)               const char *
175
      * Prototype for the C procedure 'htmlSAXParseFile'.
      * Returns a document pointer typed like htmlDocPtr.
      * filename, encoding: by-value string pointers; options(*string)
      *                     allows character expressions to be supplied.
      * sax: SAX handler pointer, passed by value.
      * userData: untyped pointer (void * in C), passed by value.
176     d htmlSAXParseFile...
177     d                 pr                  extproc('htmlSAXParseFile')
178     d                                     like(htmlDocPtr)
179     d  filename                       *   value options(*string)               const char *
180     d  encoding                       *   value options(*string)               const char *
181     d  sax                                value like(htmlSAXHandlerPtr)
182     d  userData                       *   value                                void *
183
      * Prototype for the C procedure 'htmlParseFile'.
      * Returns a document pointer typed like htmlDocPtr.
      * filename, encoding: by-value string pointers; options(*string)
      *                     allows character expressions to be supplied.
184     d htmlParseFile   pr                  extproc('htmlParseFile')
185     d                                     like(htmlDocPtr)
186     d  filename                       *   value options(*string)               const char *
187     d  encoding                       *   value options(*string)               const char *
188
      * Prototype for the C procedure 'UTF8ToHtml'.
      * Returns a value typed like xmlCint.
      * out: character buffer passed by reference; declared with length
      *      65535, but options(*varsize) lets callers pass a field of a
      *      different length.
      * outlen, inlen: passed by reference (no VALUE keyword), so the
      *      callee can update them.
      *      NOTE(review): presumably the out capacity and in length on
      *      entry - confirm against the C API documentation.
      * in: by-value string pointer; options(*string) allows a character
      *     expression to be supplied.
189     d UTF8ToHtml      pr                  extproc('UTF8ToHtml')
190     d                                     like(xmlCint)
191     d  out                       65535    options(*varsize)                    unsigned char []
192     d  outlen                             like(xmlCint)
193     d  in                             *   value options(*string)               const unsigned char*
194     d  inlen                              like(xmlCint)
195
      * Prototype for the C procedure 'htmlEncodeEntities'.
      * Returns a value typed like xmlCint.
      * out: character buffer passed by reference; declared with length
      *      65535, with options(*varsize) permitting other lengths.
      * outlen, inlen: passed by reference (no VALUE keyword), so the
      *      callee can update them.
      * in: by-value string pointer; options(*string) allows a character
      *     expression to be supplied.
      * quoteChar: passed by value, typed like xmlCint.
196     d htmlEncodeEntities...
197     d                 pr                  extproc('htmlEncodeEntities')
198     d                                     like(xmlCint)
199     d  out                       65535    options(*varsize)                    unsigned char []
200     d  outlen                             like(xmlCint)
201     d  in                             *   value options(*string)               const unsigned char*
202     d  inlen                              like(xmlCint)
203     d  quoteChar                          value like(xmlCint)
204
      * Prototype for the C procedure 'htmlIsScriptAttribute'.
      * Returns a value typed like xmlCint.
      * name: by-value string pointer; options(*string) allows a character
      *       expression to be supplied.
205     d htmlIsScriptAttribute...
206     d                 pr                  extproc('htmlIsScriptAttribute')
207     d                                     like(xmlCint)
208     d  name                           *   value options(*string)               const xmlChar *
209
      * Prototype for the C procedure 'htmlHandleOmittedElem'.
      * Takes val by value and returns a value; both are typed like
      * xmlCint.
210     d htmlHandleOmittedElem...
211     d                 pr                  extproc('htmlHandleOmittedElem')
212     d                                     like(xmlCint)
213     d  val                                value like(xmlCint)
214
215      /if defined(LIBXML_PUSH_ENABLED)
216
217      * Interfaces for the Push mode.
218
      * Prototype for the C procedure 'htmlCreatePushParserCtxt'; only
      * declared when LIBXML_PUSH_ENABLED is defined.
      * Returns a pointer typed like htmlParserCtxtPtr.
      * sax: SAX handler pointer, passed by value.
      * user_data: untyped pointer (void * in C), passed by value.
      * chunk, filename: by-value string pointers; options(*string)
      *                  allows character expressions to be supplied.
      * size: passed by value, typed like xmlCint.
      * enc: passed by value, typed like xmlCharEncoding.
219     d htmlCreatePushParserCtxt...
220     d                 pr                  extproc('htmlCreatePushParserCtxt')
221     d                                     like(htmlParserCtxtPtr)
222     d  sax                                value like(htmlSAXHandlerPtr)
223     d  user_data                      *   value                                void *
224     d  chunk                          *   value options(*string)               const char *
225     d  size                               value like(xmlCint)
226     d  filename                       *   value options(*string)               const char *
227     d  enc                                value like(xmlCharEncoding)
228
      * Prototype for the C procedure 'htmlParseChunk'; only declared
      * when LIBXML_PUSH_ENABLED is defined.
      * Returns a value typed like xmlCint.
      * ctxt: parser context pointer, passed by value.
      * chunk: by-value string pointer; options(*string) allows a
      *        character expression to be supplied.
      * size, terminate: passed by value, typed like xmlCint.
229     d htmlParseChunk  pr                  extproc('htmlParseChunk')
230     d                                     like(xmlCint)
231     d  ctxt                               value like(htmlParserCtxtPtr)
232     d  chunk                          *   value options(*string)               const char *
233     d  size                               value like(xmlCint)
234     d  terminate                          value like(xmlCint)
235      /endif                                                                    LIBXML_PUSH_ENABLED
236
      * Prototype for the C procedure 'htmlFreeParserCtxt'.
      * No return value is declared.
      * ctxt: parser context pointer, passed by value.
237     d htmlFreeParserCtxt...
238     d                 pr                  extproc('htmlFreeParserCtxt')
239     d  ctxt                               value like(htmlParserCtxtPtr)
240
241      * New set of simpler/more flexible APIs
242
243      * htmlParserOption:
244      *
245      * This is the set of HTML parser options that can be passed down
246      * to the htmlReadDoc() and similar calls.
247
      * htmlParserOption: type template declared like xmlCenum, followed
      * by its named constants. Each constant is a 4-byte hexadecimal
      * literal with exactly one bit set, so the values do not overlap.
      * NOTE(review): presumably meant to be combined into the options
      * parameter of the htmlRead*/htmlCtxt* prototypes - confirm.
248     d htmlParserOption...
249     d                 s                   based(######typedef######)
250     d                                     like(xmlCenum)
251     d  HTML_PARSE_RECOVER...                                                   Relaxed parsing
252     d                 c                   X'00000001'
253     d  HTML_PARSE_NODEFDTD...                                                  No default doctype
254     d                 c                   X'00000004'
255     d  HTML_PARSE_NOERROR...                                                   No error reports
256     d                 c                   X'00000020'
257     d  HTML_PARSE_NOWARNING...                                                 No warning reports
258     d                 c                   X'00000040'
259     d  HTML_PARSE_PEDANTIC...                                                  Pedantic err reports
260     d                 c                   X'00000080'
261     d  HTML_PARSE_NOBLANKS...                                                  Remove blank nodes
262     d                 c                   X'00000100'
263     d  HTML_PARSE_NONET...                                                     Forbid net access
264     d                 c                   X'00000800'
265     d  HTML_PARSE_NOIMPLIED...                                                 No implied html/body
266     d                 c                   X'00002000'
267     d  HTML_PARSE_COMPACT...                                                   compact small txtnod
268     d                 c                   X'00010000'
269     d  HTML_PARSE_IGNORE_ENC...                                                Ignore encoding hint
270     d                 c                   X'00200000'
271
272     d htmlCtxtReset   pr                  extproc('htmlCtxtReset')
273     d ctxt                                value like(htmlParserCtxtPtr)
274
275     d htmlCtxtUseOptions...
276     d                 pr                  extproc('htmlCtxtUseOptions')
277     d                                     like(xmlCint)
278     d ctxt                                value like(htmlParserCtxtPtr)
279     d options                             value like(xmlCint)
280
      * Prototype for the C procedure 'htmlReadDoc'.
      * Returns a document pointer typed like htmlDocPtr.
      * cur, URL, encoding: by-value string pointers; options(*string)
      *                     allows character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
281     d htmlReadDoc     pr                  extproc('htmlReadDoc')
282     d                                     like(htmlDocPtr)
283     d  cur                            *   value options(*string)               const xmlChar *
284     d  URL                            *   value options(*string)               const char *
285     d  encoding                       *   value options(*string)               const char *
286     d  options                            value like(xmlCint)
287
      * Prototype for the C procedure 'htmlReadFile'.
      * Returns a document pointer typed like htmlDocPtr.
      * URL, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
288     d htmlReadFile    pr                  extproc('htmlReadFile')
289     d                                     like(htmlDocPtr)
290     d  URL                            *   value options(*string)               const char *
291     d  encoding                       *   value options(*string)               const char *
292     d  options                            value like(xmlCint)
293
      * Prototype for the C procedure 'htmlReadMemory'.
      * Returns a document pointer typed like htmlDocPtr.
      * buffer, URL, encoding: by-value string pointers; options(*string)
      *                        allows character expressions to be supplied.
      * size, options: passed by value, typed like xmlCint.
294     d htmlReadMemory  pr                  extproc('htmlReadMemory')
295     d                                     like(htmlDocPtr)
296     d  buffer                         *   value options(*string)               const char *
297     d  size                               value like(xmlCint)
298     d  URL                            *   value options(*string)               const char *
299     d  encoding                       *   value options(*string)               const char *
300     d  options                            value like(xmlCint)
301
      * Prototype for the C procedure 'htmlReadFd'.
      * Returns a document pointer typed like htmlDocPtr.
      * fd, options: passed by value, typed like xmlCint.
      * URL, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
302     d htmlReadFd      pr                  extproc('htmlReadFd')
303     d                                     like(htmlDocPtr)
304     d  fd                                 value like(xmlCint)
305     d  URL                            *   value options(*string)               const char *
306     d  encoding                       *   value options(*string)               const char *
307     d  options                            value like(xmlCint)
308
      * Prototype for the C procedure 'htmlReadIO'.
      * Returns a document pointer typed like htmlDocPtr.
      * ioread, ioclose: passed by value, typed like xmlInputReadCallback
      *                  and xmlInputCloseCallback respectively.
      * ioctx: untyped pointer (void * in C), passed by value.
      * URL, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
309     d htmlReadIO      pr                  extproc('htmlReadIO')
310     d                                     like(htmlDocPtr)
311     d  ioread                             value like(xmlInputReadCallback)
312     d  ioclose                            value like(xmlInputCloseCallback)
313     d  ioctx                          *   value                                void *
314     d  URL                            *   value options(*string)               const char *
315     d  encoding                       *   value options(*string)               const char *
316     d  options                            value like(xmlCint)
317
      * Prototype for the C procedure 'htmlCtxtReadDoc'.
      * Returns a document pointer typed like htmlDocPtr.
      * ctxt: passed by value, typed like xmlParserCtxtPtr (the type that
      *       htmlParserCtxtPtr is itself declared like in this file).
      * cur, URL, encoding: by-value string pointers; options(*string)
      *                     allows character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
318     d htmlCtxtReadDoc...
319     d                 pr                  extproc('htmlCtxtReadDoc')
320     d                                     like(htmlDocPtr)
321     d  ctxt                               value like(xmlParserCtxtPtr)
322     d  cur                            *   value options(*string)               const xmlChar *
323     d  URL                            *   value options(*string)               const char *
324     d  encoding                       *   value options(*string)               const char *
325     d  options                            value like(xmlCint)
326
      * Prototype for the C procedure 'htmlCtxtReadFile'.
      * Returns a document pointer typed like htmlDocPtr.
      * ctxt: passed by value, typed like xmlParserCtxtPtr (the type that
      *       htmlParserCtxtPtr is itself declared like in this file).
      * filename, encoding: by-value string pointers; options(*string)
      *                     allows character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
327     d htmlCtxtReadFile...
328     d                 pr                  extproc('htmlCtxtReadFile')
329     d                                     like(htmlDocPtr)
330     d  ctxt                               value like(xmlParserCtxtPtr)
331     d  filename                       *   value options(*string)               const char *
332     d  encoding                       *   value options(*string)               const char *
333     d  options                            value like(xmlCint)
334
      * Prototype for the C procedure 'htmlCtxtReadMemory'.
      * Returns a document pointer typed like htmlDocPtr.
      * ctxt: passed by value, typed like xmlParserCtxtPtr (the type that
      *       htmlParserCtxtPtr is itself declared like in this file).
      * buffer, URL, encoding: by-value string pointers; options(*string)
      *                        allows character expressions to be supplied.
      * size, options: passed by value, typed like xmlCint.
335     d htmlCtxtReadMemory...
336     d                 pr                  extproc('htmlCtxtReadMemory')
337     d                                     like(htmlDocPtr)
338     d  ctxt                               value like(xmlParserCtxtPtr)
339     d  buffer                         *   value options(*string)               const char *
340     d  size                               value like(xmlCint)
341     d  URL                            *   value options(*string)               const char *
342     d  encoding                       *   value options(*string)               const char *
343     d  options                            value like(xmlCint)
344
      * Prototype for the C procedure 'htmlCtxtReadFd'.
      * Returns a document pointer typed like htmlDocPtr.
      * ctxt: passed by value, typed like xmlParserCtxtPtr (the type that
      *       htmlParserCtxtPtr is itself declared like in this file).
      * fd, options: passed by value, typed like xmlCint.
      * URL, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
345     d htmlCtxtReadFd  pr                  extproc('htmlCtxtReadFd')
346     d                                     like(htmlDocPtr)
347     d  ctxt                               value like(xmlParserCtxtPtr)
348     d  fd                                 value like(xmlCint)
349     d  URL                            *   value options(*string)               const char *
350     d  encoding                       *   value options(*string)               const char *
351     d  options                            value like(xmlCint)
352
      * Prototype for the C procedure 'htmlCtxtReadIO'.
      * Returns a document pointer typed like htmlDocPtr.
      * ctxt: passed by value, typed like xmlParserCtxtPtr (the type that
      *       htmlParserCtxtPtr is itself declared like in this file).
      * ioread, ioclose: passed by value, typed like xmlInputReadCallback
      *                  and xmlInputCloseCallback respectively.
      * ioctx: untyped pointer (void * in C), passed by value.
      * URL, encoding: by-value string pointers; options(*string) allows
      *                character expressions to be supplied.
      * options: passed by value, typed like xmlCint.
353     d htmlCtxtReadIO  pr                  extproc('htmlCtxtReadIO')
354     d                                     like(htmlDocPtr)
355     d  ctxt                               value like(xmlParserCtxtPtr)
356     d  ioread                             value like(xmlInputReadCallback)
357     d  ioclose                            value like(xmlInputCloseCallback)
358     d  ioctx                          *   value                                void *
359     d  URL                            *   value options(*string)               const char *
360     d  encoding                       *   value options(*string)               const char *
361     d  options                            value like(xmlCint)
362
363      * Further knowledge of HTML structure
364
      * htmlStatus: type template declared like xmlCenum, followed by its
      * named constants as 2-byte hexadecimal literals. HTML_REQUIRED
      * (X'000C') has the HTML_VALID bit (X'0004') set in addition to
      * X'0008', as its trailing comment notes.
365     d htmlStatus      s                   based(######typedef######)
366     d                                     like(xmlCenum)
367     d  HTML_NA        c                   X'0000'                              No check at all
368     d  HTML_INVALID   c                   X'0001'
369     d  HTML_DEPRECATED...
370     d                 c                   X'0002'
371     d  HTML_VALID     c                   X'0004'
372     d  HTML_REQUIRED  c                   X'000C'                              HTML_VALID ored-in
373
374      * Using htmlElemDesc rather than name here, to emphasise the fact
375      *  that otherwise there's a lookup overhead
376
      * Prototype for the C procedure 'htmlAttrAllowed'.
      * Returns a value typed like htmlStatus.
      * #param1: element descriptor pointer, passed by value.
      * #param2: by-value string pointer; options(*string) allows a
      *          character expression to be supplied.
      * #param3: passed by value, typed like xmlCint.
      * NOTE(review): the #paramN names are placeholders; presumably the
      * C declaration leaves these parameters unnamed - confirm.
377     d htmlAttrAllowed...
378     d                 pr                  extproc('htmlAttrAllowed')
379     d                                     like(htmlStatus)
380     d  #param1                            value like(htmlElemDescPtr)          const
381     d  #param2                        *   value options(*string)               const xmlChar *
382     d  #param3                            value like(xmlCint)
383
      * Prototype for the C procedure 'htmlElementAllowedHere'.
      * Returns a value typed like xmlCint.
      * #param1: element descriptor pointer, passed by value.
      * #param2: by-value string pointer; options(*string) allows a
      *          character expression to be supplied.
384     d htmlElementAllowedHere...
385     d                 pr                  extproc('htmlElementAllowedHere')
386     d                                     like(xmlCint)
387     d  #param1                            value like(htmlElemDescPtr)          const
388     d  #param2                        *   value options(*string)               const xmlChar *
389
      * Prototype for the C procedure 'htmlElementStatusHere'.
      * Returns a value typed like htmlStatus.
      * #param1, #param2: element descriptor pointers, both passed by
      *                   value.
390     d htmlElementStatusHere...
391     d                 pr                  extproc('htmlElementStatusHere')
392     d                                     like(htmlStatus)
393     d  #param1                            value like(htmlElemDescPtr)          const
394     d  #param2                            value like(htmlElemDescPtr)          const
395
      * Prototype for the C procedure 'htmlNodeStatus'.
      * Returns a value typed like htmlStatus.
      * #param1: node pointer, passed by value.
      * #param2: passed by value, typed like xmlCint.
396     d htmlNodeStatus  pr                  extproc('htmlNodeStatus')
397     d                                     like(htmlStatus)
398     d  #param1                            value like(htmlNodePtr)
399     d  #param2                            value like(xmlCint)
400
401      * C macros implemented as procedures for ILE/RPG support.
402
      * RPG name htmlDefaultSubelement bound to the external procedure
      * '__htmlDefaultSubelement' (note the leading double underscore in
      * the bound name).
      * Returns a pointer (const char * in C).
      * elt: plain pointer passed by value; it is not typed with
      *      like(htmlElemDescPtr), although the trailing comment names
      *      htmlElemDesc as the target.
403     d htmlDefaultSubelement...
404     d                 pr              *   extproc('__htmlDefaultSubelement')   const char *
405     d  elt                            *   value                                const htmlElemDesc *
406
      * RPG name htmlElementAllowedHereDesc bound to the external
      * procedure '__htmlElementAllowedHereDesc'; the extproc literal is
      * continued onto a second line.
      * Returns a value typed like xmlCint.
      * parent, elt: plain pointers passed by value (htmlElemDesc targets
      *              per the trailing comments).
407     d htmlElementAllowedHereDesc...
408     d                 pr                  extproc(
409     d                                     '__htmlElementAllowedHereDesc')
410     d                                     like(xmlCint)
411     d  parent                         *   value                                const htmlElemDesc *
412     d  elt                            *   value                                const htmlElemDesc *
413
      * RPG name htmlRequiredAttrs bound to the external procedure
      * '__htmlRequiredAttrs'.
      * Returns a pointer (const char * * in C).
      * elt: plain pointer passed by value (htmlElemDesc target per the
      *      trailing comment).
414     d htmlRequiredAttrs...
415     d                 pr              *   extproc('__htmlRequiredAttrs')        const char * *
416     d  elt                            *   value                                const htmlElemDesc *
417
418      /endif                                                                    LIBXML_HTML_ENABLED
419      /endif                                                                    HTML_PARSER_H__
420