1"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
Suspicious lines are reported in a comma-separated file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the parent directory of the one containing this module).
The file has the same format as ``suspicious.csv`` with a few differences:
19
20  - each line defines a rule; if the rule matches, the issue
21    is ignored.
22  - line number may be empty (that is, nothing between the
23    commas: ",,"). In this case, line numbers are ignored (the
24    rule matches anywhere in the file).
25  - the last field does not have to be a complete line; some
26    surrounding text (never more than a line) is enough for
27    context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line
35
The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field).
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
44import os
45import re
46import csv
47import sys
48
49from docutils import nodes
50from sphinx.builders import Builder
51
# Pattern describing reST-looking fragments that should never survive
# into rendered text.  Verbose mode keeps the per-alternative notes
# next to the expression they describe.
_suspicious_markup = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE | re.UNICODE)

# detect_all(text) -> iterator over every suspicious match in *text*
detect_all = _suspicious_markup.finditer

# True when running under Python 3 or later
py3 = sys.version_info[0] >= 3
60
61
class Rule:
    """A rule for ignoring issues.

    Attributes:
        docname: document to which this rule applies
        lineno:  line number in the original source; the rule only
                 matches near that line (None means "don't care")
        issue:   the markup fragment that triggered this rule
        line:    text of the container element (single line only)
        used:    starts out False; the builder flips it once the rule
                 has matched an issue
    """

    def __init__(self, docname, lineno, issue, line):
        self.used = False
        self.docname = docname
        self.lineno = lineno
        self.issue = issue
        self.line = line

    def __repr__(self):
        # Rendered with an empty line-number field.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
75
76
77
class dialect(csv.excel):
    """The Excel CSV dialect, except that rows end with a bare linefeed."""
    lineterminator = "\n"
81
82
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Each document tree is walked with a ``SuspiciousVisitor``; every
    suspicious fragment that no ignore rule covers is appended to
    ``suspicious.csv`` in the output directory and reported as a warning.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every found document, so all of them get checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder produces no linkable output."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Scan one document tree for suspicious markup."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Report the ignore rules that never matched anything."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first matching rule is marked as
        used and True is returned.  Returns False when no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Requiring both lines to match exactly proved too strict,
            # so it is enough that the rule fragment is contained in
            # the document line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue, emit a warning and set a failing status code."""
        if not self.any_issue:
            # first issue in this document: emit an empty info message
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            # strip the text here too, as the Python 2 branch and the
            # log entry do, so the 120-char excerpt shows real content
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text.strip()))
        else:
            encoding = sys.getdefaultencoding()
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(encoding, 'replace'),
                lineno,
                issue.encode(encoding, 'replace'),
                text.strip().encode(encoding, 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the log file.

        The file is written as UTF-8 with linefeed-only line endings on
        both Python 2 and Python 3.
        """
        if py3:
            # newline='' stops the text layer from translating the
            # dialect's '\n' terminator; the encoding is pinned to UTF-8
            # instead of depending on the locale.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened leaves the rule list empty.
        Raises ValueError when a row does not have exactly four fields
        or its line number is not an integer.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                # read as UTF-8, matching the decoding done on Python 2
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        # the with-block closes the file even if a malformed row raises
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field: the rule applies anywhere in the document
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
210
211
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the ancestors of *node* (the node's own ``line`` is not
    consulted) and returns the first ``line`` value that is not None.
    Returns None when no ancestor carries line information.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # ran past the root without finding any line information
            break
        lineno = node.line
    return lineno
219
220
def extract_line(text, index):
    r"""Return the single line of *text* that contains position *index*.

    *text* may span several lines; the returned line never includes
    its newline.  An index that points at a newline character selects
    the line that newline terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 9, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    # start just past the previous newline (rfind gives -1 -> start at 0)
    start = text.rfind('\n', 0, index) + 1
    end = text.find('\n', index)
    if end >= 0:
        return text[start:end]
    # no newline after index: the line runs to the end of the text
    return text[start:]
241
242
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Tree visitor that hands suspicious text fragments to the builder."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes; every other node is left alone."""
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        # direct text containers
        content = node.astext()
        # lineno seems to go backwards sometimes (?), so never let it
        # drop below the largest value already seen
        lineno = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = lineno
        # (issue, line) pairs already handled for this node, so the same
        # issue is reported at most once per line
        reported = set()
        for found in detect_all(content):
            issue = found.group()
            line = extract_line(content, found.start())
            key = (issue, line)
            if key in reported:
                continue
            self.builder.check_issue(line, lineno, issue)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        """Restart line tracking at the beginning of a document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        """Skip comments entirely."""
        # Comments give too many false positives.  Skipping them could
        # miss some errors, though: two sections "commented-out" by
        # mistake in the Python docs would not be caught.
        raise nodes.SkipNode
275