1"""HTML 2.0 parser.
2
3See the HTML 2.0 specification:
4http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5"""
6
# Flag this module as removed in Python 3.0.  stacklevel=2 is passed so that
# the warning is attributed to the code importing this module rather than to
# this file (see the warnings module documentation for stacklevel).
from warnings import warnpy3k
warnpy3k("the htmllib module has been removed in Python 3.0",
         stacklevel=2)
# Remove the helper again so it does not linger in the module namespace.
del warnpy3k
11
12import sgmllib
13
14from formatter import AS_IS
15
16__all__ = ["HTMLParser", "HTMLParseError"]
17
18
class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed.

    HTMLParser.error() raises this exception with the failure message.
    It derives from sgmllib.SGMLParseError, so an except clause written
    for the base class catches it as well.
    """
21
22
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Character data and layout events are forwarded to the formatter
    object passed to the constructor.  Handlers are named
    start_<tag> / end_<tag> for container elements and do_<tag> for
    empty elements.

    Instance state (initialised by reset()):

    savedata    -- buffer filled between save_bgn() and save_end(), or
                   None while data goes straight to the formatter
    isindex     -- set to 1 once an <ISINDEX> element has been seen
    title       -- text collected from the <TITLE> element, or None
    base        -- HREF of the last <BASE> element seen, or None
    anchor      -- HREF handed to the most recent anchor_bgn(); reset
                   to None by anchor_end() once the footnote is written
    anchorlist  -- list of every non-empty anchor HREF seen so far
    nofill      -- nesting depth of preformatted regions; while it is
                   non-zero data is passed on as literal data
    list_stack  -- stack of [kind, label, counter] lists, one entry per
                   open list construct

    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  verbose is handed unchanged to the SGMLParser
        constructor.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse failure by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific state."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While a save_bgn() / save_end() pair is active the data is
        appended to the buffer.  Otherwise it is sent to the formatter,
        as literal data when nofill is set and as flowing data when it
        is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
            return
        if self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly.  Without this
            # check the failure surfaced as an AttributeError from
            # None.split(), or was silently swallowed (returning None)
            # when nofill was set.
            raise TypeError("save_end() called without a preceding "
                            "call to save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            # The marker is the 1-based position of the link that was
            # appended to anchorlist by anchor_bgn().
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() supplies ismap, align, width and
        height as the additional positional arguments.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the document title in the save buffer."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of <BASE> in the base attribute."""
        for attrname, value in attrs:
            if attrname == 'href':
                self.base = value

    def do_isindex(self, attrs):
        """Note that the document contains an <ISINDEX> element."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, size):
        """Close the current paragraph and push a bold font of *size*.

        Shared by the start_h1() ... start_h6() handlers; the pushed
        font tuple is (size, 0, 1, 0).

        """
        self.formatter.end_paragraph(1)
        self.formatter.push_font((size, 0, 1, 0))

    def _heading_end(self):
        """Close the heading paragraph and pop the heading font.

        Shared by the end_h1() ... end_h6() handlers.

        """
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph (formatter.end_paragraph(1))."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter a preformatted region.

        Ends the current paragraph, pushes a font whose last component
        is set (the same component start_tt() sets) and increments the
        nofill depth so handle_data() emits literal data.

        """
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill += 1

    def end_pre(self):
        """Leave a preformatted region, undoing start_pre()."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so an unbalanced end tag cannot drive the
        # depth negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """End the paragraph and push the font start_i() also uses."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """End the paragraph and push a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list labelled with '*'."""
        # end_paragraph() receives a true value only for an outermost
        # list, i.e. when no other list construct is open yet.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list entry and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        The counter of the innermost list_stack entry is incremented
        and passed to the formatter together with that entry's label.
        Outside of any list the label '*' with counter 0 is used.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            counter = top[2] + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'; a TYPE attribute overrides
        it, with a '.' appended when the value is a single character.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for attrname, value in attrs:
            if attrname == 'type':
                if len(value) == 1:
                    value = value + '.'
                label = value
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list entry and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close any open <DD> and then the definition list itself."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term, closing a preceding open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition, pushing a 'dd' margin for it."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close an open <DD>, if there is one.

        bl is handed to formatter.end_paragraph().  The 'dd' entry and
        its margin are popped only when 'dd' is the innermost entry of
        list_stack.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    # Each handler sets one component of the font tuple and leaves the
    # others AS_IS: position 1 for <I>, 2 for <B>, 3 for <TT>.

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and TYPE from <A> and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; TYPE
        is additionally lower-cased.  Missing attributes are passed as
        empty strings.

        """
        href = ''
        name = ''
        type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type = value.lower()
        self.anchor_bgn(href, name, type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and call handle_image().

        Defaults are '' for align, ismap and src, '(image)' for alt and
        0 for width and height.  WIDTH / HEIGHT values that are not
        valid integers are ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        """Ignore start tags that have no handler."""
        pass

    def unknown_endtag(self, tag):
        """Ignore end tags that have no handler."""
        pass
449
450
def test(args=None):
    """Parse an HTML file and render it through the formatter module.

    args is a sequence of command-line style arguments; when it is
    empty or omitted, sys.argv[1:] is used instead.  The sequence is
    copied, so the caller's object is never modified.

    An optional leading '-s' selects a formatter.NullFormatter instead
    of an AbstractFormatter writing through a DumbWriter.  The next
    argument names the file to parse, '-' meaning standard input; it
    defaults to 'test.html'.

    If the file cannot be opened, the file name and the error are
    written to standard output and the process exits with status 1.

    """
    import sys, formatter

    # Work on a private copy: the '-s' flag is deleted below, which
    # must not alter (or fail on) the sequence supplied by the caller.
    if args:
        args = list(args)
    else:
        args = sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            sys.stdout.write("%s : %s\n" % (filename, msg))
            sys.exit(1)
        # Close the file even if reading it fails.
        try:
            data = f.read()
        finally:
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
488
489
# Run the command-line test driver when this file is executed as a script;
# test() falls back to sys.argv[1:] when called without arguments.
if __name__ == '__main__':
    test()
492