1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 
16 #ifdef HAVE_CTYPE_H
17 #include <ctype.h>
18 #endif
19 #ifdef HAVE_STDLIB_H
20 #include <stdlib.h>
21 #endif
22 
23 #include <libxml/xmlmemory.h>
24 #include <libxml/HTMLparser.h>
25 #include <libxml/HTMLtree.h>
26 #include <libxml/entities.h>
27 #include <libxml/valid.h>
28 #include <libxml/xmlerror.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/globals.h>
31 #include <libxml/uri.h>
32 
33 #include "buf.h"
34 
35 /************************************************************************
36  *									*
37  *		Getting/Setting encoding meta tags			*
38  *									*
39  ************************************************************************/
40 
41 /**
42  * htmlGetMetaEncoding:
43  * @doc:  the document
44  *
45  * Encoding definition lookup in the Meta tags
46  *
47  * Returns the current encoding as flagged in the HTML source
48  */
49 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)50 htmlGetMetaEncoding(htmlDocPtr doc) {
51     htmlNodePtr cur;
52     const xmlChar *content;
53     const xmlChar *encoding;
54 
55     if (doc == NULL)
56 	return(NULL);
57     cur = doc->children;
58 
59     /*
60      * Search the html
61      */
62     while (cur != NULL) {
63 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
64 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
65 		break;
66 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
67 		goto found_head;
68 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
69 		goto found_meta;
70 	}
71 	cur = cur->next;
72     }
73     if (cur == NULL)
74 	return(NULL);
75     cur = cur->children;
76 
77     /*
78      * Search the head
79      */
80     while (cur != NULL) {
81 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
82 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
83 		break;
84 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
85 		goto found_meta;
86 	}
87 	cur = cur->next;
88     }
89     if (cur == NULL)
90 	return(NULL);
91 found_head:
92     cur = cur->children;
93 
94     /*
95      * Search the meta elements
96      */
97 found_meta:
98     while (cur != NULL) {
99 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
100 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
101 		xmlAttrPtr attr = cur->properties;
102 		int http;
103 		const xmlChar *value;
104 
105 		content = NULL;
106 		http = 0;
107 		while (attr != NULL) {
108 		    if ((attr->children != NULL) &&
109 		        (attr->children->type == XML_TEXT_NODE) &&
110 		        (attr->children->next == NULL)) {
111 			value = attr->children->content;
112 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
113 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
114 			    http = 1;
115 			else if ((value != NULL)
116 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
117 			    content = value;
118 			if ((http != 0) && (content != NULL))
119 			    goto found_content;
120 		    }
121 		    attr = attr->next;
122 		}
123 	    }
124 	}
125 	cur = cur->next;
126     }
127     return(NULL);
128 
129 found_content:
130     encoding = xmlStrstr(content, BAD_CAST"charset=");
131     if (encoding == NULL)
132 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
133     if (encoding == NULL)
134 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
135     if (encoding != NULL) {
136 	encoding += 8;
137     } else {
138 	encoding = xmlStrstr(content, BAD_CAST"charset =");
139 	if (encoding == NULL)
140 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
141 	if (encoding == NULL)
142 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
143 	if (encoding != NULL)
144 	    encoding += 9;
145     }
146     if (encoding != NULL) {
147 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
148     }
149     return(encoding);
150 }
151 
152 /**
153  * htmlSetMetaEncoding:
154  * @doc:  the document
155  * @encoding:  the encoding string
156  *
157  * Sets the current encoding in the Meta tags
158  * NOTE: this will not change the document content encoding, just
159  * the META flag associated.
160  *
161  * Returns 0 in case of success and -1 in case of error
162  */
163 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
165     htmlNodePtr cur, meta = NULL, head = NULL;
166     const xmlChar *content = NULL;
167     char newcontent[100];
168 
169     newcontent[0] = 0;
170 
171     if (doc == NULL)
172 	return(-1);
173 
174     /* html isn't a real encoding it's just libxml2 way to get entities */
175     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
176         return(-1);
177 
178     if (encoding != NULL) {
179 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180                 (char *)encoding);
181 	newcontent[sizeof(newcontent) - 1] = 0;
182     }
183 
184     cur = doc->children;
185 
186     /*
187      * Search the html
188      */
189     while (cur != NULL) {
190 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
192 		break;
193 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
194 		goto found_head;
195 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
196 		goto found_meta;
197 	}
198 	cur = cur->next;
199     }
200     if (cur == NULL)
201 	return(-1);
202     cur = cur->children;
203 
204     /*
205      * Search the head
206      */
207     while (cur != NULL) {
208 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
209 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
210 		break;
211 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
212                 head = cur->parent;
213 		goto found_meta;
214             }
215 	}
216 	cur = cur->next;
217     }
218     if (cur == NULL)
219 	return(-1);
220 found_head:
221     head = cur;
222     if (cur->children == NULL)
223         goto create;
224     cur = cur->children;
225 
226 found_meta:
227     /*
228      * Search and update all the remaining the meta elements carrying
229      * encoding informations
230      */
231     while (cur != NULL) {
232 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
233 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
234 		xmlAttrPtr attr = cur->properties;
235 		int http;
236 		const xmlChar *value;
237 
238 		content = NULL;
239 		http = 0;
240 		while (attr != NULL) {
241 		    if ((attr->children != NULL) &&
242 		        (attr->children->type == XML_TEXT_NODE) &&
243 		        (attr->children->next == NULL)) {
244 			value = attr->children->content;
245 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
246 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
247 			    http = 1;
248 			else
249                         {
250                            if ((value != NULL) &&
251                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
252 			       content = value;
253                         }
254 		        if ((http != 0) && (content != NULL))
255 			    break;
256 		    }
257 		    attr = attr->next;
258 		}
259 		if ((http != 0) && (content != NULL)) {
260 		    meta = cur;
261 		    break;
262 		}
263 
264 	    }
265 	}
266 	cur = cur->next;
267     }
268 create:
269     if (meta == NULL) {
270         if ((encoding != NULL) && (head != NULL)) {
271             /*
272              * Create a new Meta element with the right attributes
273              */
274 
275             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
276             if (head->children == NULL)
277                 xmlAddChild(head, meta);
278             else
279                 xmlAddPrevSibling(head->children, meta);
280             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
281             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
282         }
283     } else {
284         /* remove the meta tag if NULL is passed */
285         if (encoding == NULL) {
286             xmlUnlinkNode(meta);
287             xmlFreeNode(meta);
288         }
289         /* change the document only if there is a real encoding change */
290         else if (xmlStrcasestr(content, encoding) == NULL) {
291             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
292         }
293     }
294 
295 
296     return(0);
297 }
298 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
312 
313 
314 /**
315  * htmlIsBooleanAttr:
316  * @name:  the name of the attribute to check
317  *
318  * Determine if a given attribute is a boolean attribute.
319  *
320  * returns: false if the attribute is not boolean, true otherwise.
321  */
322 int
htmlIsBooleanAttr(const xmlChar * name)323 htmlIsBooleanAttr(const xmlChar *name)
324 {
325     int i = 0;
326 
327     while (htmlBooleanAttrs[i] != NULL) {
328         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
329             return 1;
330         i++;
331     }
332     return 0;
333 }
334 
335 #ifdef LIBXML_OUTPUT_ENABLED
336 /*
337  * private routine exported from xmlIO.c
338  */
339 xmlOutputBufferPtr
340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
341 /************************************************************************
342  *									*
343  *			Output error handlers				*
344  *									*
345  ************************************************************************/
346 /**
347  * htmlSaveErrMemory:
348  * @extra:  extra informations
349  *
350  * Handle an out of memory condition
351  */
352 static void
htmlSaveErrMemory(const char * extra)353 htmlSaveErrMemory(const char *extra)
354 {
355     __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
356 }
357 
358 /**
359  * htmlSaveErr:
360  * @code:  the error number
361  * @node:  the location of the error.
362  * @extra:  extra informations
363  *
364  * Handle an out of memory condition
365  */
366 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)367 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
368 {
369     const char *msg = NULL;
370 
371     switch(code) {
372         case XML_SAVE_NOT_UTF8:
373 	    msg = "string is not in UTF-8\n";
374 	    break;
375 	case XML_SAVE_CHAR_INVALID:
376 	    msg = "invalid character value\n";
377 	    break;
378 	case XML_SAVE_UNKNOWN_ENCODING:
379 	    msg = "unknown encoding %s\n";
380 	    break;
381 	case XML_SAVE_NO_DOCTYPE:
382 	    msg = "HTML has no DOCTYPE\n";
383 	    break;
384 	default:
385 	    msg = "unexpected error number\n";
386     }
387     __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
388 }
389 
390 /************************************************************************
391  *									*
392  *		Dumping HTML tree content to a simple buffer		*
393  *									*
394  ************************************************************************/
395 
396 /**
397  * htmlBufNodeDumpFormat:
398  * @buf:  the xmlBufPtr output
399  * @doc:  the document
400  * @cur:  the current node
401  * @format:  should formatting spaces been added
402  *
403  * Dump an HTML node, recursive behaviour,children are printed too.
404  *
405  * Returns the number of byte written or -1 in case of error
406  */
407 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)408 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
409 	           int format) {
410     size_t use;
411     int ret;
412     xmlOutputBufferPtr outbuf;
413 
414     if (cur == NULL) {
415 	return (-1);
416     }
417     if (buf == NULL) {
418 	return (-1);
419     }
420     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
421     if (outbuf == NULL) {
422         htmlSaveErrMemory("allocating HTML output buffer");
423 	return (-1);
424     }
425     memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
426     outbuf->buffer = buf;
427     outbuf->encoder = NULL;
428     outbuf->writecallback = NULL;
429     outbuf->closecallback = NULL;
430     outbuf->context = NULL;
431     outbuf->written = 0;
432 
433     use = xmlBufUse(buf);
434     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
435     xmlFree(outbuf);
436     ret = xmlBufUse(buf) - use;
437     return (ret);
438 }
439 
440 /**
441  * htmlNodeDump:
442  * @buf:  the HTML buffer output
443  * @doc:  the document
444  * @cur:  the current node
445  *
446  * Dump an HTML node, recursive behaviour,children are printed too,
447  * and formatting returns are added.
448  *
449  * Returns the number of byte written or -1 in case of error
450  */
451 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)452 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
453     xmlBufPtr buffer;
454     size_t ret;
455 
456     if ((buf == NULL) || (cur == NULL))
457         return(-1);
458 
459     xmlInitParser();
460     buffer = xmlBufFromBuffer(buf);
461     if (buffer == NULL)
462         return(-1);
463 
464     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
465 
466     xmlBufBackToBuffer(buffer);
467 
468     if (ret > INT_MAX)
469         return(-1);
470     return((int) ret);
471 }
472 
473 /**
474  * htmlNodeDumpFileFormat:
475  * @out:  the FILE pointer
476  * @doc:  the document
477  * @cur:  the current node
478  * @encoding: the document encoding
479  * @format:  should formatting spaces been added
480  *
481  * Dump an HTML node, recursive behaviour,children are printed too.
482  *
483  * TODO: if encoding == NULL try to save in the doc encoding
484  *
485  * returns: the number of byte written or -1 in case of failure.
486  */
487 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)488 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
489 	               xmlNodePtr cur, const char *encoding, int format) {
490     xmlOutputBufferPtr buf;
491     xmlCharEncodingHandlerPtr handler = NULL;
492     int ret;
493 
494     xmlInitParser();
495 
496     if (encoding != NULL) {
497 	xmlCharEncoding enc;
498 
499 	enc = xmlParseCharEncoding(encoding);
500 	if (enc != XML_CHAR_ENCODING_UTF8) {
501 	    handler = xmlFindCharEncodingHandler(encoding);
502 	    if (handler == NULL)
503 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
504 	}
505     } else {
506         /*
507          * Fallback to HTML or ASCII when the encoding is unspecified
508          */
509         if (handler == NULL)
510             handler = xmlFindCharEncodingHandler("HTML");
511         if (handler == NULL)
512             handler = xmlFindCharEncodingHandler("ascii");
513     }
514 
515     /*
516      * save the content to a temp buffer.
517      */
518     buf = xmlOutputBufferCreateFile(out, handler);
519     if (buf == NULL) return(0);
520 
521     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
522 
523     ret = xmlOutputBufferClose(buf);
524     return(ret);
525 }
526 
527 /**
528  * htmlNodeDumpFile:
529  * @out:  the FILE pointer
530  * @doc:  the document
531  * @cur:  the current node
532  *
533  * Dump an HTML node, recursive behaviour,children are printed too,
534  * and formatting returns are added.
535  */
536 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)537 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539 }
540 
541 /**
542  * htmlDocDumpMemoryFormat:
543  * @cur:  the document
544  * @mem:  OUT: the memory pointer
545  * @size:  OUT: the memory length
546  * @format:  should formatting spaces been added
547  *
548  * Dump an HTML document in memory and return the xmlChar * and it's size.
549  * It's up to the caller to free the memory.
550  */
551 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)552 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
553     xmlOutputBufferPtr buf;
554     xmlCharEncodingHandlerPtr handler = NULL;
555     const char *encoding;
556 
557     xmlInitParser();
558 
559     if ((mem == NULL) || (size == NULL))
560         return;
561     if (cur == NULL) {
562 	*mem = NULL;
563 	*size = 0;
564 	return;
565     }
566 
567     encoding = (const char *) htmlGetMetaEncoding(cur);
568 
569     if (encoding != NULL) {
570 	xmlCharEncoding enc;
571 
572 	enc = xmlParseCharEncoding(encoding);
573 	if (enc != XML_CHAR_ENCODING_UTF8) {
574 	    handler = xmlFindCharEncodingHandler(encoding);
575 	    if (handler == NULL)
576                 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
577 
578 	}
579     } else {
580         /*
581          * Fallback to HTML or ASCII when the encoding is unspecified
582          */
583         if (handler == NULL)
584             handler = xmlFindCharEncodingHandler("HTML");
585         if (handler == NULL)
586             handler = xmlFindCharEncodingHandler("ascii");
587     }
588 
589     buf = xmlAllocOutputBufferInternal(handler);
590     if (buf == NULL) {
591 	*mem = NULL;
592 	*size = 0;
593 	return;
594     }
595 
596     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
597 
598     xmlOutputBufferFlush(buf);
599     if (buf->conv != NULL) {
600 	*size = xmlBufUse(buf->conv);
601 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
602     } else {
603 	*size = xmlBufUse(buf->buffer);
604 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
605     }
606     (void)xmlOutputBufferClose(buf);
607 }
608 
609 /**
610  * htmlDocDumpMemory:
611  * @cur:  the document
612  * @mem:  OUT: the memory pointer
613  * @size:  OUT: the memory length
614  *
615  * Dump an HTML document in memory and return the xmlChar * and it's size.
616  * It's up to the caller to free the memory.
617  */
618 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)619 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
620 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
621 }
622 
623 
624 /************************************************************************
625  *									*
626  *		Dumping HTML tree content to an I/O output buffer	*
627  *									*
628  ************************************************************************/
629 
630 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
631 
632 /**
633  * htmlDtdDumpOutput:
634  * @buf:  the HTML buffer output
635  * @doc:  the document
636  * @encoding:  the encoding string
637  *
638  * TODO: check whether encoding is needed
639  *
640  * Dump the HTML document DTD, if any.
641  */
642 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)643 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
644 	          const char *encoding ATTRIBUTE_UNUSED) {
645     xmlDtdPtr cur = doc->intSubset;
646 
647     if (cur == NULL) {
648 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
649 	return;
650     }
651     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
652     xmlOutputBufferWriteString(buf, (const char *)cur->name);
653     if (cur->ExternalID != NULL) {
654 	xmlOutputBufferWriteString(buf, " PUBLIC ");
655 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
656 	if (cur->SystemID != NULL) {
657 	    xmlOutputBufferWriteString(buf, " ");
658 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659 	}
660     } else if (cur->SystemID != NULL &&
661 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
662 	xmlOutputBufferWriteString(buf, " SYSTEM ");
663 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
664     }
665     xmlOutputBufferWriteString(buf, ">\n");
666 }
667 
668 /**
669  * htmlAttrDumpOutput:
670  * @buf:  the HTML buffer output
671  * @doc:  the document
672  * @cur:  the attribute pointer
673  * @encoding:  the encoding string
674  *
675  * Dump an HTML attribute
676  */
677 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding ATTRIBUTE_UNUSED)678 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
679 	           const char *encoding ATTRIBUTE_UNUSED) {
680     xmlChar *value;
681 
682     /*
683      * The html output method should not escape a & character
684      * occurring in an attribute value immediately followed by
685      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
686      * This is implemented in xmlEncodeEntitiesReentrant
687      */
688 
689     if (cur == NULL) {
690 	return;
691     }
692     xmlOutputBufferWriteString(buf, " ");
693     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
694         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
695 	xmlOutputBufferWriteString(buf, ":");
696     }
697     xmlOutputBufferWriteString(buf, (const char *)cur->name);
698     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
699 	value = xmlNodeListGetString(doc, cur->children, 0);
700 	if (value) {
701 	    xmlOutputBufferWriteString(buf, "=");
702 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
703 		(cur->parent->ns == NULL) &&
704 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
705 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
706 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
707 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
708 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
709 		xmlChar *tmp = value;
710 		/* xmlURIEscapeStr() escapes '"' so it can be safely used. */
711 		xmlBufCCat(buf->buffer, "\"");
712 
713 		while (IS_BLANK_CH(*tmp)) tmp++;
714 
715 		/* URI Escape everything, except server side includes. */
716 		for ( ; ; ) {
717 		    xmlChar *escaped;
718 		    xmlChar endChar;
719 		    xmlChar *end = NULL;
720 		    xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--");
721 		    if (start != NULL) {
722 			end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->");
723 			if (end != NULL) {
724 			    *start = '\0';
725 			}
726 		    }
727 
728 		    /* Escape the whole string, or until start (set to '\0'). */
729 		    escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
730 		    if (escaped != NULL) {
731 		        xmlBufCat(buf->buffer, escaped);
732 		        xmlFree(escaped);
733 		    } else {
734 		        xmlBufCat(buf->buffer, tmp);
735 		    }
736 
737 		    if (end == NULL) { /* Everything has been written. */
738 			break;
739 		    }
740 
741 		    /* Do not escape anything within server side includes. */
742 		    *start = '<'; /* Restore the first character of "<!--". */
743 		    end += 3; /* strlen("-->") */
744 		    endChar = *end;
745 		    *end = '\0';
746 		    xmlBufCat(buf->buffer, start);
747 		    *end = endChar;
748 		    tmp = end;
749 		}
750 
751 		xmlBufCCat(buf->buffer, "\"");
752 	    } else {
753 		xmlBufWriteQuotedString(buf->buffer, value);
754 	    }
755 	    xmlFree(value);
756 	} else  {
757 	    xmlOutputBufferWriteString(buf, "=\"\"");
758 	}
759     }
760 }
761 
762 /**
763  * htmlAttrListDumpOutput:
764  * @buf:  the HTML buffer output
765  * @doc:  the document
766  * @cur:  the first attribute pointer
767  * @encoding:  the encoding string
768  *
769  * Dump a list of HTML attributes
770  */
771 static void
htmlAttrListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur,const char * encoding)772 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
773     if (cur == NULL) {
774 	return;
775     }
776     while (cur != NULL) {
777         htmlAttrDumpOutput(buf, doc, cur, encoding);
778 	cur = cur->next;
779     }
780 }
781 
782 
783 
784 /**
785  * htmlNodeListDumpOutput:
786  * @buf:  the HTML buffer output
787  * @doc:  the document
788  * @cur:  the first node
789  * @encoding:  the encoding string
790  * @format:  should formatting spaces been added
791  *
792  * Dump an HTML node list, recursive behaviour,children are printed too.
793  */
794 static void
htmlNodeListDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)795 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
796 	               xmlNodePtr cur, const char *encoding, int format) {
797     if (cur == NULL) {
798 	return;
799     }
800     while (cur != NULL) {
801         htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
802 	cur = cur->next;
803     }
804 }
805 
806 /**
807  * htmlNodeDumpFormatOutput:
808  * @buf:  the HTML buffer output
809  * @doc:  the document
810  * @cur:  the current node
811  * @encoding:  the encoding string
812  * @format:  should formatting spaces been added
813  *
814  * Dump an HTML node, recursive behaviour,children are printed too.
815  */
816 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)817 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
818 	                 xmlNodePtr cur, const char *encoding, int format) {
819     const htmlElemDesc * info;
820 
821     xmlInitParser();
822 
823     if ((cur == NULL) || (buf == NULL)) {
824 	return;
825     }
826     /*
827      * Special cases.
828      */
829     if (cur->type == XML_DTD_NODE)
830 	return;
831     if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
832         (cur->type == XML_DOCUMENT_NODE)){
833 	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
834 	return;
835     }
836     if (cur->type == XML_ATTRIBUTE_NODE) {
837         htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
838 	return;
839     }
840     if (cur->type == HTML_TEXT_NODE) {
841 	if (cur->content != NULL) {
842 	    if (((cur->name == (const xmlChar *)xmlStringText) ||
843 		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
844 		((cur->parent == NULL) ||
845 		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
846 		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
847 		xmlChar *buffer;
848 
849 		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
850 		if (buffer != NULL) {
851 		    xmlOutputBufferWriteString(buf, (const char *)buffer);
852 		    xmlFree(buffer);
853 		}
854 	    } else {
855 		xmlOutputBufferWriteString(buf, (const char *)cur->content);
856 	    }
857 	}
858 	return;
859     }
860     if (cur->type == HTML_COMMENT_NODE) {
861 	if (cur->content != NULL) {
862 	    xmlOutputBufferWriteString(buf, "<!--");
863 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
864 	    xmlOutputBufferWriteString(buf, "-->");
865 	}
866 	return;
867     }
868     if (cur->type == HTML_PI_NODE) {
869 	if (cur->name == NULL)
870 	    return;
871 	xmlOutputBufferWriteString(buf, "<?");
872 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
873 	if (cur->content != NULL) {
874 	    xmlOutputBufferWriteString(buf, " ");
875 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
876 	}
877 	xmlOutputBufferWriteString(buf, ">");
878 	return;
879     }
880     if (cur->type == HTML_ENTITY_REF_NODE) {
881         xmlOutputBufferWriteString(buf, "&");
882 	xmlOutputBufferWriteString(buf, (const char *)cur->name);
883         xmlOutputBufferWriteString(buf, ";");
884 	return;
885     }
886     if (cur->type == HTML_PRESERVE_NODE) {
887 	if (cur->content != NULL) {
888 	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
889 	}
890 	return;
891     }
892 
893     /*
894      * Get specific HTML info for that node.
895      */
896     if (cur->ns == NULL)
897 	info = htmlTagLookup(cur->name);
898     else
899 	info = NULL;
900 
901     xmlOutputBufferWriteString(buf, "<");
902     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
903         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
904 	xmlOutputBufferWriteString(buf, ":");
905     }
906     xmlOutputBufferWriteString(buf, (const char *)cur->name);
907     if (cur->nsDef)
908 	xmlNsListDumpOutput(buf, cur->nsDef);
909     if (cur->properties != NULL)
910         htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
911 
912     if ((info != NULL) && (info->empty)) {
913         xmlOutputBufferWriteString(buf, ">");
914 	if ((format) && (!info->isinline) && (cur->next != NULL)) {
915 	    if ((cur->next->type != HTML_TEXT_NODE) &&
916 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
917 		(cur->parent != NULL) &&
918 		(cur->parent->name != NULL) &&
919 		(cur->parent->name[0] != 'p')) /* p, pre, param */
920 		xmlOutputBufferWriteString(buf, "\n");
921 	}
922 	return;
923     }
924     if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
925 	(cur->children == NULL)) {
926         if ((info != NULL) && (info->saveEndTag != 0) &&
927 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
928 	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
929 	    xmlOutputBufferWriteString(buf, ">");
930 	} else {
931 	    xmlOutputBufferWriteString(buf, "></");
932             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
933                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
934                 xmlOutputBufferWriteString(buf, ":");
935             }
936 	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
937 	    xmlOutputBufferWriteString(buf, ">");
938 	}
939 	if ((format) && (cur->next != NULL) &&
940             (info != NULL) && (!info->isinline)) {
941 	    if ((cur->next->type != HTML_TEXT_NODE) &&
942 		(cur->next->type != HTML_ENTITY_REF_NODE) &&
943 		(cur->parent != NULL) &&
944 		(cur->parent->name != NULL) &&
945 		(cur->parent->name[0] != 'p')) /* p, pre, param */
946 		xmlOutputBufferWriteString(buf, "\n");
947 	}
948 	return;
949     }
950     xmlOutputBufferWriteString(buf, ">");
951     if ((cur->type != XML_ELEMENT_NODE) &&
952 	(cur->content != NULL)) {
953 	    /*
954 	     * Uses the OutputBuffer property to automatically convert
955 	     * invalids to charrefs
956 	     */
957 
958             xmlOutputBufferWriteString(buf, (const char *) cur->content);
959     }
960     if (cur->children != NULL) {
961         if ((format) && (info != NULL) && (!info->isinline) &&
962 	    (cur->children->type != HTML_TEXT_NODE) &&
963 	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
964 	    (cur->children != cur->last) &&
965 	    (cur->name != NULL) &&
966 	    (cur->name[0] != 'p')) /* p, pre, param */
967 	    xmlOutputBufferWriteString(buf, "\n");
968 	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
969         if ((format) && (info != NULL) && (!info->isinline) &&
970 	    (cur->last->type != HTML_TEXT_NODE) &&
971 	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
972 	    (cur->children != cur->last) &&
973 	    (cur->name != NULL) &&
974 	    (cur->name[0] != 'p')) /* p, pre, param */
975 	    xmlOutputBufferWriteString(buf, "\n");
976     }
977     xmlOutputBufferWriteString(buf, "</");
978     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
979         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
980 	xmlOutputBufferWriteString(buf, ":");
981     }
982     xmlOutputBufferWriteString(buf, (const char *)cur->name);
983     xmlOutputBufferWriteString(buf, ">");
984     if ((format) && (info != NULL) && (!info->isinline) &&
985 	(cur->next != NULL)) {
986         if ((cur->next->type != HTML_TEXT_NODE) &&
987 	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
988 	    (cur->parent != NULL) &&
989 	    (cur->parent->name != NULL) &&
990 	    (cur->parent->name[0] != 'p')) /* p, pre, param */
991 	    xmlOutputBufferWriteString(buf, "\n");
992     }
993 }
994 
995 /**
996  * htmlNodeDumpOutput:
997  * @buf:  the HTML buffer output
998  * @doc:  the document
999  * @cur:  the current node
1000  * @encoding:  the encoding string
1001  *
1002  * Dump an HTML node, recursive behaviour,children are printed too,
1003  * and formatting returns/spaces are added.
1004  */
1005 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding)1006 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
1007 	           xmlNodePtr cur, const char *encoding) {
1008     htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
1009 }
1010 
1011 /**
1012  * htmlDocContentDumpFormatOutput:
1013  * @buf:  the HTML buffer output
1014  * @cur:  the document
1015  * @encoding:  the encoding string
1016  * @format:  should formatting spaces been added
1017  *
1018  * Dump an HTML document.
1019  */
1020 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding,int format)1021 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1022 	                       const char *encoding, int format) {
1023     int type;
1024 
1025     xmlInitParser();
1026 
1027     if ((buf == NULL) || (cur == NULL))
1028         return;
1029 
1030     /*
1031      * force to output the stuff as HTML, especially for entities
1032      */
1033     type = cur->type;
1034     cur->type = XML_HTML_DOCUMENT_NODE;
1035     if (cur->intSubset != NULL) {
1036         htmlDtdDumpOutput(buf, cur, NULL);
1037     }
1038     if (cur->children != NULL) {
1039         htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1040     }
1041     xmlOutputBufferWriteString(buf, "\n");
1042     cur->type = (xmlElementType) type;
1043 }
1044 
1045 /**
1046  * htmlDocContentDumpOutput:
1047  * @buf:  the HTML buffer output
1048  * @cur:  the document
1049  * @encoding:  the encoding string
1050  *
1051  * Dump an HTML document. Formating return/spaces are added.
1052  */
1053 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding)1054 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1055 	                 const char *encoding) {
1056     htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1057 }
1058 
1059 /************************************************************************
1060  *									*
1061  *		Saving functions front-ends				*
1062  *									*
1063  ************************************************************************/
1064 
1065 /**
1066  * htmlDocDump:
1067  * @f:  the FILE*
1068  * @cur:  the document
1069  *
1070  * Dump an HTML document to an open FILE.
1071  *
1072  * returns: the number of byte written or -1 in case of failure.
1073  */
1074 int
htmlDocDump(FILE * f,xmlDocPtr cur)1075 htmlDocDump(FILE *f, xmlDocPtr cur) {
1076     xmlOutputBufferPtr buf;
1077     xmlCharEncodingHandlerPtr handler = NULL;
1078     const char *encoding;
1079     int ret;
1080 
1081     xmlInitParser();
1082 
1083     if ((cur == NULL) || (f == NULL)) {
1084 	return(-1);
1085     }
1086 
1087     encoding = (const char *) htmlGetMetaEncoding(cur);
1088 
1089     if (encoding != NULL) {
1090 	xmlCharEncoding enc;
1091 
1092 	enc = xmlParseCharEncoding(encoding);
1093 	if (enc != XML_CHAR_ENCODING_UTF8) {
1094 	    handler = xmlFindCharEncodingHandler(encoding);
1095 	    if (handler == NULL)
1096 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1097 	}
1098     } else {
1099         /*
1100          * Fallback to HTML or ASCII when the encoding is unspecified
1101          */
1102         if (handler == NULL)
1103             handler = xmlFindCharEncodingHandler("HTML");
1104         if (handler == NULL)
1105             handler = xmlFindCharEncodingHandler("ascii");
1106     }
1107 
1108     buf = xmlOutputBufferCreateFile(f, handler);
1109     if (buf == NULL) return(-1);
1110     htmlDocContentDumpOutput(buf, cur, NULL);
1111 
1112     ret = xmlOutputBufferClose(buf);
1113     return(ret);
1114 }
1115 
1116 /**
1117  * htmlSaveFile:
1118  * @filename:  the filename (or URL)
1119  * @cur:  the document
1120  *
1121  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1122  * used.
1123  * returns: the number of byte written or -1 in case of failure.
1124  */
1125 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1126 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1127     xmlOutputBufferPtr buf;
1128     xmlCharEncodingHandlerPtr handler = NULL;
1129     const char *encoding;
1130     int ret;
1131 
1132     if ((cur == NULL) || (filename == NULL))
1133         return(-1);
1134 
1135     xmlInitParser();
1136 
1137     encoding = (const char *) htmlGetMetaEncoding(cur);
1138 
1139     if (encoding != NULL) {
1140 	xmlCharEncoding enc;
1141 
1142 	enc = xmlParseCharEncoding(encoding);
1143 	if (enc != XML_CHAR_ENCODING_UTF8) {
1144 	    handler = xmlFindCharEncodingHandler(encoding);
1145 	    if (handler == NULL)
1146 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1147 	}
1148     } else {
1149         /*
1150          * Fallback to HTML or ASCII when the encoding is unspecified
1151          */
1152         if (handler == NULL)
1153             handler = xmlFindCharEncodingHandler("HTML");
1154         if (handler == NULL)
1155             handler = xmlFindCharEncodingHandler("ascii");
1156     }
1157 
1158     /*
1159      * save the content to a temp buffer.
1160      */
1161     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1162     if (buf == NULL) return(0);
1163 
1164     htmlDocContentDumpOutput(buf, cur, NULL);
1165 
1166     ret = xmlOutputBufferClose(buf);
1167     return(ret);
1168 }
1169 
1170 /**
1171  * htmlSaveFileFormat:
1172  * @filename:  the filename
1173  * @cur:  the document
1174  * @format:  should formatting spaces been added
1175  * @encoding: the document encoding
1176  *
1177  * Dump an HTML document to a file using a given encoding.
1178  *
1179  * returns: the number of byte written or -1 in case of failure.
1180  */
1181 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1182 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1183 	           const char *encoding, int format) {
1184     xmlOutputBufferPtr buf;
1185     xmlCharEncodingHandlerPtr handler = NULL;
1186     int ret;
1187 
1188     if ((cur == NULL) || (filename == NULL))
1189         return(-1);
1190 
1191     xmlInitParser();
1192 
1193     if (encoding != NULL) {
1194 	xmlCharEncoding enc;
1195 
1196 	enc = xmlParseCharEncoding(encoding);
1197 	if (enc != XML_CHAR_ENCODING_UTF8) {
1198 	    handler = xmlFindCharEncodingHandler(encoding);
1199 	    if (handler == NULL)
1200 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1201 	}
1202         htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1203     } else {
1204 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1205 
1206         /*
1207          * Fallback to HTML or ASCII when the encoding is unspecified
1208          */
1209         if (handler == NULL)
1210             handler = xmlFindCharEncodingHandler("HTML");
1211         if (handler == NULL)
1212             handler = xmlFindCharEncodingHandler("ascii");
1213     }
1214 
1215     /*
1216      * save the content to a temp buffer.
1217      */
1218     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1219     if (buf == NULL) return(0);
1220 
1221     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1222 
1223     ret = xmlOutputBufferClose(buf);
1224     return(ret);
1225 }
1226 
1227 /**
1228  * htmlSaveFileEnc:
1229  * @filename:  the filename
1230  * @cur:  the document
1231  * @encoding: the document encoding
1232  *
1233  * Dump an HTML document to a file using a given encoding
1234  * and formatting returns/spaces are added.
1235  *
1236  * returns: the number of byte written or -1 in case of failure.
1237  */
1238 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1239 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1240     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1241 }
1242 
1243 #endif /* LIBXML_OUTPUT_ENABLED */
1244 
1245 #define bottom_HTMLtree
1246 #include "elfgcchack.h"
1247 #endif /* LIBXML_HTML_ENABLED */
1248