1#!/usr/bin/env python
2
3"""
4CmpRuns - A simple tool for comparing two static analyzer runs to determine
5which reports have been added, removed, or changed.
6
7This is designed to support automated testing using the static analyzer, from
8two perspectives:
9  1. To monitor changes in the static analyzer's reports on real code bases, for
10     regression testing.
11
  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.
14
15Usage:
16
17    # Load the results of both runs, to obtain lists of the corresponding
18    # AnalysisDiagnostic objects.
19    #
20    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
21    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
22
23    # Generate a relation from diagnostics in run A to diagnostics in run B
24    # to obtain a list of triples (a, b, confidence).
25    diff = compareResults(resultsA, resultsB)
26
27"""
28
29import os
30import plistlib
31import CmpRuns
32
33# Information about analysis run:
34# path - the analysis output directory
35# root - the name of the root directory, which will be disregarded when
36# determining the source file name
class SingleRunInfo:
    """Describes a single analysis run that is about to be loaded.

    path       -- the analysis output directory
    root       -- name of the root directory; it is disregarded when the
                  source file name of a diagnostic is determined
    verboseLog -- stored on the instance unchanged
    """
    def __init__(self, path, root="", verboseLog=None):
        # Trailing separators of either flavour are trimmed from the root.
        trimmedRoot = root.rstrip("/\\")
        self.verboseLog = verboseLog
        self.root = trimmedRoot
        self.path = path
42
class AnalysisDiagnostic:
    """A single diagnostic taken from an analysis report.

    data       -- the raw diagnostic dictionary; it must contain a
                  'location' entry
    report     -- the owning report object; its 'files' sequence and its
                  'run' (with 'root' and 'path') are consulted
    htmlReport -- name of the matching HTML report, or None
    """
    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport

    def getFileName(self):
        """Return the source file name with the run's root prefix removed.

        The root is only stripped when it matches whole leading path
        components. A plain string-prefix test would also match sibling
        directories (root '/src/foo' against '/src/foobar/a.c') and return
        a mangled name.
        """
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        if root and fileName.startswith(root):
            remainder = fileName[len(root):]
            # Either nothing is left, or the root ended at a separator.
            if not remainder or remainder[0] in "/\\":
                return remainder[1:]
        return fileName

    def getLine(self):
        """Return the 'line' entry of the diagnostic's location."""
        return self._loc['line']

    def getColumn(self):
        """Return the 'col' entry of the diagnostic's location."""
        return self._loc['col']

    def getCategory(self):
        """Return the 'category' entry of the diagnostic."""
        return self._data['category']

    def getDescription(self):
        """Return the 'description' entry of the diagnostic."""
        return self._data['description']

    def getIssueIdentifier(self):
        """Return the string used to match this diagnostic across runs.

        It is the file name followed by '+', then the issue context and
        '+' when present, then the line-content hash when present.
        """
        identifier = self.getFileName() + "+"
        if 'issue_context' in self._data:
            identifier += self._data['issue_context'] + "+"
        if 'issue_hash_content_of_line_in_context' in self._data:
            identifier += str(
                self._data['issue_hash_content_of_line_in_context'])
        return identifier

    def getReport(self):
        """Return the HTML report path inside the run's output directory.

        A single space is returned when the diagnostic has no HTML report.
        """
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """Return 'file:line:col, category: description'."""
        return '%s:%d:%d, %s: %s' % (self.getFileName(), self.getLine(),
                                     self.getColumn(), self.getCategory(),
                                     self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        """Return the raw diagnostic dictionary."""
        return self._data
91
class multidict:
    """Dictionary wrapper in which each key maps to a list of values.

    Assigning to a key appends to that key's list rather than replacing
    it; reading a key returns the whole list.
    """
    def __init__(self, elts=()):
        self.data = {}
        # Route every pair through __setitem__ so repeated keys accumulate.
        for k, v in elts:
            self[k] = v

    def __getitem__(self, item):
        return self.data[item]

    def __setitem__(self, key, value):
        # Create the list on first use, then append to it.
        self.data.setdefault(key, []).append(value)

    def __len__(self):
        return len(self.data)

    def get(self, key, default=None):
        return self.data.get(key, default)

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()
115
class CmpOptions:
    """Plain holder for the options of a comparison.

    verboseLog -- value stored as 'verboseLog'
    rootA      -- value stored as 'rootA'
    rootB      -- value stored as 'rootB'
    """
    def __init__(self, verboseLog=None, rootA="", rootB=""):
        self.verboseLog = verboseLog
        self.rootA, self.rootB = rootA, rootB
121
class AnalysisReport:
    """One report belonging to a run.

    run   -- the run object this report is attached to
    files -- the files object stored on the report

    The 'diagnostics' list starts out empty.
    """
    def __init__(self, run, files):
        self.diagnostics = []
        self.files = files
        self.run = run
127
class AnalysisRun:
    """Accumulates the reports and diagnostics read for one analysis run.

    info -- object providing 'path' and 'root'; it is also kept as 'info'
    """
    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None

    def getClangVersion(self):
        """Return the clang version seen so far, or None."""
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """Read the plist file 'p' and add its contents to this run.

        p           -- path of the plist file
        deleteEmpty -- when true, a plist whose 'files' entry is empty is
                       removed from disk

        Empty reports are skipped, but their clang version is still
        recorded.
        """
        # plistlib.readPlist does not exist on newer Python versions, while
        # plistlib.load does not exist on older ones; use whichever is there.
        if hasattr(plistlib, 'load'):
            with open(p, 'rb') as plistFile:
                data = plistlib.load(plistFile)
        else:
            data = plistlib.readPlist(p)

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            version = data.pop('clang_version')
            if self.clang_version is None:
                self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        rawDiagnostics = data.pop('diagnostics')

        # Extract the HTML reports, if they exist. The presence check looks
        # at the first diagnostic only, so guard against an empty list.
        if rawDiagnostics and 'HTMLDiagnostics_files' in rawDiagnostics[0]:
            htmlFiles = []
            for d in rawDiagnostics:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(rawDiagnostics)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(rawDiagnostics, htmlFiles)]

        # Every known top-level key has been consumed by now.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)
180
181
182# Backward compatibility API.
def loadResults(path, opts, root = "", deleteEmpty=True):
    """Load a run from 'path', taking the verbose log setting from 'opts'.

    The arguments are wrapped in a SingleRunInfo and passed on to
    loadResultsFromSingleRun, whose result is returned.
    """
    runInfo = SingleRunInfo(path, root, opts.verboseLog)
    return loadResultsFromSingleRun(runInfo, deleteEmpty)
186
# Load the results of the analyses from a given output folder.
188# - info is the SingleRunInfo object
189# - deleteEmpty specifies if the empty plist files should be deleted
def loadResultsFromSingleRun(info, deleteEmpty=True):
    """Build an AnalysisRun from the location named by 'info.path'.

    info        -- the SingleRunInfo object
    deleteEmpty -- forwarded to AnalysisRun.readSingleFile for every file

    When the path is a regular file it is read directly. Otherwise the
    path is walked and every file whose name ends in 'plist' is read.
    """
    run = AnalysisRun(info)
    target = info.path

    # A single file needs no directory traversal.
    if os.path.isfile(target):
        run.readSingleFile(target, deleteEmpty)
        return run

    for walkDir, _subdirs, names in os.walk(target):
        for name in names:
            if name.endswith('plist'):
                run.readSingleFile(os.path.join(walkDir, name), deleteEmpty)

    return run
205
def cmpAnalysisDiagnostic(d):
    """Sort key for a diagnostic: its issue identifier."""
    identifier = d.getIssueIdentifier()
    return identifier
208
def compareResults(A, B):
    """
    compareResults - Relate the diagnostics of run A to those of run B.

    Returns a list of triples (a, b, confidence). Each of a and b is either
    a diagnostic of the respective run or None. Confidence is 0 for a pair
    with equal issue identifiers and None when either side is None.

    Matched pairs come first, then the diagnostics found only in A, then
    those found only in B.
    """
    sortedA = sorted(A.diagnostics, key=cmpAnalysisDiagnostic)
    sortedB = sorted(B.diagnostics, key=cmpAnalysisDiagnostic)

    matched = []
    onlyA = []
    onlyB = []

    # Walk both sorted lists from the largest identifier downwards.
    idxA = len(sortedA) - 1
    idxB = len(sortedB) - 1
    while idxA >= 0 and idxB >= 0:
        a = sortedA[idxA]
        b = sortedB[idxB]
        keyA = a.getIssueIdentifier()
        keyB = b.getIssueIdentifier()
        if keyA == keyB:
            matched.append((a, b, 0))
            idxA -= 1
            idxB -= 1
        elif keyA > keyB:
            # Nothing left in B can be as large as a, so a is unmatched.
            onlyA.append(a)
            idxA -= 1
        else:
            onlyB.append(b)
            idxB -= 1

    # Whatever was not visited on either side has no counterpart.
    onlyA.extend(sortedA[:idxA + 1])
    onlyB.extend(sortedB[:idxB + 1])

    # FIXME: Add fuzzy matching. One simple and possible effective idea would be
    # to bin the diagnostics, print them in a normalized form (based solely on
    # the structure of the diagnostic), compute the diff, then use that as the
    # basis for matching. This has the nice property that we don't depend in any
    # way on the diagnostic format.

    result = matched
    result.extend((a, None, None) for a in onlyA)
    result.extend((None, b, None) for b in onlyB)
    return result
255
def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True):
    """Print the differences between the runs stored in dirA and dirB.

    dirA, dirB  -- locations of the two runs, passed to loadResults
    opts        -- object providing 'rootA', 'rootB' and 'verboseLog'
    deleteEmpty -- passed to loadResults for both runs

    Every added, removed or changed diagnostic is printed, followed by the
    totals. When opts.verboseLog is set, matching records are also written
    to that file.

    Returns the tuple (number of differences, number of diagnostics in A,
    number of diagnostics in B).
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)

    # Open the verbose log, if given. The records are text, so text mode is
    # used; together with single-argument print() calls this keeps the
    # function working on both Python 2 and Python 3.
    auxLog = open(opts.verboseLog, "w") if opts.verboseLog else None

    def logLine(text):
        # Write one record to the verbose log when there is one.
        if auxLog:
            auxLog.write(text + "\n")

    try:
        foundDiffs = 0
        for a, b, confidence in compareResults(resultsA, resultsB):
            if a is None:
                print("ADDED: %r" % b.getReadableName())
                foundDiffs += 1
                logLine("('ADDED', %r, %r)" % (b.getReadableName(),
                                               b.getReport()))
            elif b is None:
                print("REMOVED: %r" % a.getReadableName())
                foundDiffs += 1
                logLine("('REMOVED', %r, %r)" % (a.getReadableName(),
                                                 a.getReport()))
            elif confidence:
                print("CHANGED: %r to %r" % (a.getReadableName(),
                                             b.getReadableName()))
                foundDiffs += 1
                logLine("('CHANGED', %r, %r, %r, %r)"
                        % (a.getReadableName(),
                           b.getReadableName(),
                           a.getReport(),
                           b.getReport()))
            # A pair with confidence 0 is an exact match: nothing to report.

        TotalReports = len(resultsB.diagnostics)
        print("TOTAL REPORTS: %r" % TotalReports)
        print("TOTAL DIFFERENCES: %r" % foundDiffs)
        logLine("('TOTAL NEW REPORTS', %r)" % TotalReports)
        logLine("('TOTAL DIFFERENCES', %r)" % foundDiffs)
    finally:
        # Close the log even when comparing or printing fails.
        if auxLog:
            auxLog.close()

    return foundDiffs, len(resultsA.diagnostics), len(resultsB.diagnostics)
304
def main():
    """Parse the command line and print the diff of the two given runs.

    Exactly two positional arguments are required; otherwise the option
    parser reports an error.
    """
    import optparse

    parser = optparse.OptionParser("usage: %prog [options] [dir A] [dir B]")

    # The two root options differ only in name and help text.
    rootOptions = (
        ("--rootA", "rootA",
         "Prefix to ignore on source files for directory A"),
        ("--rootB", "rootB",
         "Prefix to ignore on source files for directory B"),
    )
    for flag, destination, helpText in rootOptions:
        parser.add_option("", flag, dest=destination, help=helpText,
                          action="store", type=str, default="")

    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG [default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")

    opts, positional = parser.parse_args()
    if len(positional) != 2:
        parser.error("invalid number of arguments")

    dumpScanBuildResultsDiff(positional[0], positional[1], opts)
326
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
329