1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at http://curl.haxx.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  ***************************************************************************/
22 // Get a web page, parse it with libxml.
23 //
24 // Written by Lars Nilsson
25 //
26 // GNU C++ compile command line suggestion (edit paths accordingly):
27 //
28 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \
29 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
30 
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <string>
35 #include <curl/curl.h>
36 #include <libxml/HTMLparser.h>
37 
38 //
39 //  Case-insensitive string comparison
40 //
41 
// COMPARE(a, b) is true when the platform's case-insensitive comparison
// function returns 0 for the two C strings, i.e. they match ignoring case.
// Builds that define _MSC_VER use stricmp(); all other builds use
// strcasecmp().
#ifdef _MSC_VER
#define COMPARE(a, b) (!stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
47 
48 //
49 //  libxml callback context structure
50 //
51 
// State handed to every SAX callback through the parser's user-data pointer.
struct Context
{
  // True while text delivered by the parser should be appended to `title`;
  // switched on by StartElement and off by EndElement for a TITLE element.
  bool addTitle;

  // Text collected so far for the title.
  std::string title;

  // Start with collection switched off and an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
59 
60 //
61 //  libcurl variables for error strings and returned data
62 
// Registered with the easy handle through CURLOPT_ERRORBUFFER in init();
// its contents are printed in the failure messages of init() and main().
static char errorBuffer[CURL_ERROR_SIZE];
// Registered through CURLOPT_WRITEDATA in init(); writer() appends every
// received chunk to it and main() hands the result to parseHtml().
static std::string buffer;
65 
66 //
67 //  libcurl write callback function
68 //
69 
//  Appends the `size * nmemb` bytes at `data` to `*writerData`.
//
//  Returns the number of bytes consumed. The return type is size_t because
//  that is what libcurl's CURLOPT_WRITEFUNCTION contract requires; returning
//  `int` truncates large chunk sizes and does not match the callback
//  signature libcurl calls through.
//
//  Returns 0 when `writerData` is NULL. For a non-empty chunk this differs
//  from the byte count, which libcurl treats as a write error.
static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
80 
81 //
82 //  libcurl connection initialization
83 //
84 
init(CURL * & conn,char * url)85 static bool init(CURL *&conn, char *url)
86 {
87   CURLcode code;
88 
89   conn = curl_easy_init();
90 
91   if (conn == NULL)
92   {
93     fprintf(stderr, "Failed to create CURL connection\n");
94 
95     exit(EXIT_FAILURE);
96   }
97 
98   code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
99   if (code != CURLE_OK)
100   {
101     fprintf(stderr, "Failed to set error buffer [%d]\n", code);
102 
103     return false;
104   }
105 
106   code = curl_easy_setopt(conn, CURLOPT_URL, url);
107   if (code != CURLE_OK)
108   {
109     fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
110 
111     return false;
112   }
113 
114   code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
115   if (code != CURLE_OK)
116   {
117     fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
118 
119     return false;
120   }
121 
122   code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
123   if (code != CURLE_OK)
124   {
125     fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
126 
127     return false;
128   }
129 
130   code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
131   if (code != CURLE_OK)
132   {
133     fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
134 
135     return false;
136   }
137 
138   return true;
139 }
140 
141 //
142 //  libxml start element callback function
143 //
144 
StartElement(void * voidContext,const xmlChar * name,const xmlChar ** attributes)145 static void StartElement(void *voidContext,
146                          const xmlChar *name,
147                          const xmlChar **attributes)
148 {
149   Context *context = (Context *)voidContext;
150 
151   if (COMPARE((char *)name, "TITLE"))
152   {
153     context->title = "";
154     context->addTitle = true;
155   }
156   (void) attributes;
157 }
158 
159 //
160 //  libxml end element callback function
161 //
162 
EndElement(void * voidContext,const xmlChar * name)163 static void EndElement(void *voidContext,
164                        const xmlChar *name)
165 {
166   Context *context = (Context *)voidContext;
167 
168   if (COMPARE((char *)name, "TITLE"))
169     context->addTitle = false;
170 }
171 
172 //
173 //  Text handling helper function
174 //
175 
handleCharacters(Context * context,const xmlChar * chars,int length)176 static void handleCharacters(Context *context,
177                              const xmlChar *chars,
178                              int length)
179 {
180   if (context->addTitle)
181     context->title.append((char *)chars, length);
182 }
183 
184 //
185 //  libxml PCDATA callback function
186 //
187 
Characters(void * voidContext,const xmlChar * chars,int length)188 static void Characters(void *voidContext,
189                        const xmlChar *chars,
190                        int length)
191 {
192   Context *context = (Context *)voidContext;
193 
194   handleCharacters(context, chars, length);
195 }
196 
197 //
198 //  libxml CDATA callback function
199 //
200 
cdata(void * voidContext,const xmlChar * chars,int length)201 static void cdata(void *voidContext,
202                   const xmlChar *chars,
203                   int length)
204 {
205   Context *context = (Context *)voidContext;
206 
207   handleCharacters(context, chars, length);
208 }
209 
210 //
211 //  libxml SAX callback structure
212 //
213 
// Callback table passed to htmlCreatePushParserCtxt() in parseHtml().
// This is a positional aggregate initializer: each entry fills the next
// field of htmlSAXHandler in declaration order, so entries must not be
// added, removed or reordered without checking the struct definition in
// the libxml2 headers (not visible in this file). NULL leaves a callback
// unset; only the four handlers defined above are installed.
static htmlSAXHandler saxHandler =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  StartElement,  // presumably the start-element slot -- verify against libxml2
  EndElement,    // presumably the end-element slot -- verify against libxml2
  NULL,
  Characters,    // presumably the character-data slot -- verify against libxml2
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  cdata,         // presumably the CDATA-block slot -- verify against libxml2
  NULL
};
244 
245 //
246 //  Parse given (assumed to be) HTML text and return the title
247 //
248 
parseHtml(const std::string & html,std::string & title)249 static void parseHtml(const std::string &html,
250                       std::string &title)
251 {
252   htmlParserCtxtPtr ctxt;
253   Context context;
254 
255   ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
256                                   XML_CHAR_ENCODING_NONE);
257 
258   htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
259   htmlParseChunk(ctxt, "", 0, 1);
260 
261   htmlFreeParserCtxt(ctxt);
262 
263   title = context.title;
264 }
265 
main(int argc,char * argv[])266 int main(int argc, char *argv[])
267 {
268   CURL *conn = NULL;
269   CURLcode code;
270   std::string title;
271 
272   // Ensure one argument is given
273 
274   if (argc != 2)
275   {
276     fprintf(stderr, "Usage: %s <url>\n", argv[0]);
277 
278     exit(EXIT_FAILURE);
279   }
280 
281   curl_global_init(CURL_GLOBAL_DEFAULT);
282 
283   // Initialize CURL connection
284 
285   if (!init(conn, argv[1]))
286   {
287     fprintf(stderr, "Connection initializion failed\n");
288 
289     exit(EXIT_FAILURE);
290   }
291 
292   // Retrieve content for the URL
293 
294   code = curl_easy_perform(conn);
295   curl_easy_cleanup(conn);
296 
297   if (code != CURLE_OK)
298   {
299     fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
300 
301     exit(EXIT_FAILURE);
302   }
303 
304   // Parse the (assumed) HTML code
305 
306   parseHtml(buffer, title);
307 
308   // Display the extracted title
309 
310   printf("Title: %s\n", title.c_str());
311 
312   return EXIT_SUCCESS;
313 }
314