1# Authors: John Dennis <jdennis@redhat.com>
2#
3# Copyright (C) 2007 Red Hat, Inc.
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, write to the Free Software
17# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18#
19
20
21__all__ = [
22    'escape_html',
23    'unescape_html',
24    'html_to_text',
25
26    'html_document',
27]
28
29import htmllib
30import formatter as Formatter
31import string
32from types import *
33import StringIO
34
35#------------------------------------------------------------------------------
36
37
class TextWriter(Formatter.DumbWriter):
    '''Subclass of formatter.DumbWriter that indents every output line.

    The indent consists of indent_level * indent_width spaces. The level
    starts at zero and is replaced each time new_margin() is called.
    '''

    def __init__(self, file=None, maxcol=80, indent_width=4):
        '''Initialise the base writer and start with no indentation.

        file and maxcol are handed to DumbWriter unchanged; indent_width is
        the number of spaces added per indent level.
        '''
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_width = indent_width
        self.indent_level = 0
        self._set_indent()

    def _set_indent(self):
        '''Recompute indent_col and the indent string from the current level.
        '''
        width = self.indent_level * self.indent_width
        self.indent_col = width
        self.indent = ' ' * width

    def new_margin(self, margin, level):
        '''Adopt level as the new indent level; margin itself is not used.
        '''
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        '''Write a label followed by one space, right-aligned in the indent.

        A label that is wider than the indent is written as is.
        '''
        label = data + ' '
        # rjust() pads on the left up to indent_col and returns the label
        # unchanged when it is already at least that wide.
        self.send_literal_data(label.rjust(self.indent_col))

    def send_flowing_data(self, data):
        '''Write whitespace-separated words, wrapping before maxcol.

        Every line started here (including the first one when the writer is
        at column 0) begins with the current indent string.
        '''
        if not data:
            return
        emit = self.file.write
        limit = self.maxcol
        need_gap = self.atbreak or data[0] in string.whitespace
        column = self.col
        if column == 0:
            # Fresh line: lay down the indent before any word.
            emit(self.indent)
            column = self.indent_col
        for token in data.split():
            if need_gap:
                if column + len(token) < limit:
                    emit(' ')
                    column += 1
                else:
                    # Word does not fit: continue on a new indented line.
                    emit('\n' + self.indent)
                    column = self.indent_col
            emit(token)
            column += len(token)
            need_gap = True
        self.col = column
        # Remember whether the data ended on whitespace so the next chunk
        # knows if a separator is due.
        self.atbreak = data[-1] in string.whitespace
86
87
88class HTMLParserAnchor(htmllib.HTMLParser):
89
90    def __init__(self, formatter, verbose=0):
91        htmllib.HTMLParser.__init__(self, formatter, verbose)
92
93    def anchor_bgn(self, href, name, type):
94        self.anchor = href
95
96    def anchor_end(self):
97        if self.anchor:
98            self.handle_data(' (%s) ' % self.anchor)
99            self.anchor = None
100
101#------------------------------------------------------------------------------
102
103
def escape_html(s):
    '''Return s with the characters & < > ' " replaced by their entities.

    None is passed through as None.
    '''
    if s is None:
        return None
    # The ampersand must come first: the other replacements introduce
    # ampersands of their own, which must not be escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
    escaped = s
    for char, entity in substitutions:
        escaped = escaped.replace(char, entity)
    return escaped
113
114
def unescape_html(s):
    '''Return s with the entities &lt; &gt; &apos; &quot; &amp; replaced by
    the characters they stand for.

    None is passed through as None.
    '''
    if s is None:
        return None
    if '&' not in s:
        # No entity can be present, so there is nothing to replace.
        return s
    # &amp; must come last: decoding it earlier could create new entities
    # (e.g. '&amp;lt;' -> '&lt;') that would then be decoded a second time.
    substitutions = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&apos;", "'"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    )
    unescaped = s
    for entity, char in substitutions:
        unescaped = unescaped.replace(entity, char)
    return unescaped
126
127
def html_to_text(html, maxcol=80):
    '''Convert an HTML string to plain text wrapped at maxcol columns.

    The HTML is parsed with HTMLParserAnchor and rendered through a
    TextWriter writing into an in-memory buffer.

    Returns the rendered text, or None when the conversion raises an
    exception; in that case the error is reported through the logging
    module.
    '''
    # Imported here so the error path relies only on the standard library;
    # the module does not bring any logger of its own into scope.
    import logging
    import sys

    text_buffer = StringIO.StringIO()
    try:
        formatter = Formatter.AbstractFormatter(TextWriter(text_buffer, maxcol))
        parser = HTMLParserAnchor(formatter)
        parser.feed(html)
        parser.close()
        return text_buffer.getvalue()
    except Exception:
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the block parses on every Python 2.x and on 3.x.
        error = sys.exc_info()[1]
        logging.getLogger(__name__).error(
            'cannot convert html to text: %s', error)
        return None
    finally:
        # Release the buffer on both the success and the failure path.
        text_buffer.close()
141
142
def html_document(*body_components):
    '''Wrap the body components in an HTML document structure with a valid header.

    Accepts a variable number of arguments, each of which can be:
    * a string
    * a sequence of strings (tuple or list, including subclasses)
    * a callable object taking no parameters and returning a string or a
      sequence of strings.

    The components are concatenated in the order given, without any
    separator, between the document head and tail.

    Returns the complete document as a single string. A TypeError is
    raised when a component (or an item of a sequence) is not a string.
    '''
    head = '<html>\n  <head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n  </head>\n  <body>\n'
    tail = '\n  </body>\n</html>'

    parts = [head]

    for body_component in body_components:
        is_sequence = isinstance(body_component, (tuple, list))
        if not is_sequence and callable(body_component):
            # A callable stands in for its result, which is then treated
            # exactly like a directly supplied component.
            body_component = body_component()
            is_sequence = isinstance(body_component, (tuple, list))

        if is_sequence:
            parts.extend(body_component)
        else:
            # Plain strings end up here.
            parts.append(body_component)

    parts.append(tail)
    return ''.join(parts)
173