1 /*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.haxx.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 ***************************************************************************/ 22 /* <DESC> 23 * Download a document and use libtidy to parse the HTML. 24 * </DESC> 25 */ 26 /* 27 * LibTidy => http://tidy.sourceforge.net 28 */ 29 30 #include <stdio.h> 31 #include <tidy/tidy.h> 32 #include <tidy/buffio.h> 33 #include <curl/curl.h> 34 35 /* curl write callback, to fill tidy's input buffer... */ 36 uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) 37 { 38 uint r; 39 r = size * nmemb; 40 tidyBufAppend(out, in, r); 41 return r; 42 } 43 44 /* Traverse the document tree */ 45 void dumpNode(TidyDoc doc, TidyNode tnod, int indent) 46 { 47 TidyNode child; 48 for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { 49 ctmbstr name = tidyNodeGetName(child); 50 if(name) { 51 /* if it has a name, then it's an HTML tag ... */ 52 TidyAttr attr; 53 printf("%*.*s%s ", indent, indent, "<", name); 54 /* walk the attribute list */ 55 for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { 56 printf(tidyAttrName(attr)); 57 tidyAttrValue(attr)?printf("=\"%s\" ", 58 tidyAttrValue(attr)):printf(" "); 59 } 60 printf(">\n"); 61 } 62 else { 63 /* if it doesn't have a name, then it's probably text, cdata, etc... */ 64 TidyBuffer buf; 65 tidyBufInit(&buf); 66 tidyNodeGetText(doc, child, &buf); 67 printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); 68 tidyBufFree(&buf); 69 } 70 dumpNode(doc, child, indent + 4); /* recursive */ 71 } 72 } 73 74 75 int main(int argc, char **argv) 76 { 77 CURL *curl; 78 char curl_errbuf[CURL_ERROR_SIZE]; 79 TidyDoc tdoc; 80 TidyBuffer docbuf = {0}; 81 TidyBuffer tidy_errbuf = {0}; 82 int err; 83 if(argc == 2) { 84 curl = curl_easy_init(); 85 curl_easy_setopt(curl, CURLOPT_URL, argv[1]); 86 curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); 87 curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); 88 curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); 89 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); 90 91 tdoc = tidyCreate(); 92 tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ 93 tidyOptSetInt(tdoc, TidyWrapLen, 4096); 94 tidySetErrorBuffer(tdoc, &tidy_errbuf); 95 tidyBufInit(&docbuf); 96 97 curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); 98 err = curl_easy_perform(curl); 99 if(!err) { 100 err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ 101 if(err >= 0) { 102 err = tidyCleanAndRepair(tdoc); /* fix any problems */ 103 if(err >= 0) { 104 err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ 105 if(err >= 0) { 106 dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */ 107 fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ 108 } 109 } 110 } 111 } 112 else 113 fprintf(stderr, "%s\n", curl_errbuf); 114 115 /* clean-up */ 116 curl_easy_cleanup(curl); 117 tidyBufFree(&docbuf); 118 tidyBufFree(&tidy_errbuf); 119 tidyRelease(tdoc); 120 return err; 121 122 } 123 else 124 printf("usage: %s <url>\n", argv[0]); 125 126 return 0; 127 } 128