#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases, for
     regression testing.

  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.

Usage:

    # Load the results of both runs, to obtain AnalysisRun objects that hold
    # the corresponding AnalysisDiagnostic objects.
    #
    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)

    # Generate a relation from diagnostics in run A to diagnostics in run B
    # to obtain a list of triples (a, b, confidence).
    diff = compareResults(resultsA, resultsB)

"""

29868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)import os
30868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)import plistlib
31868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)import CmpRuns
32868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)
class SingleRunInfo(object):
    """Information about a single analysis run.

    path       - the analysis output directory
    root       - the name of the root directory, which will be disregarded
                 when determining the source file name
    verboseLog - verbose-log setting; stored as given and not otherwise used
                 by this class
    """

    def __init__(self, path, root="", verboseLog=None):
        self.path = path
        self.root = root
        self.verboseLog = verboseLog

class AnalysisDiagnostic(object):
    """A single diagnostic taken from an analyzer plist report.

    data       - the raw diagnostic dictionary; must contain 'location'
                 (with 'file', 'line' and 'col'), and is expected to contain
                 'category' and 'description'; 'issue_context' and
                 'issue_hash' are optional
    report     - the owning report; its 'files' sequence and its 'run'
                 ('root' and 'path' attributes) are consulted
    htmlReport - name of the HTML report relative to the run's output
                 directory, or None if there is none
    """

    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport

    def getFileName(self):
        """Return the source file name with the run's root prefix removed."""
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        # An empty root is a prefix of every name, so nothing is stripped.
        if fileName.startswith(root):
            return fileName[len(root):]
        return fileName

    def getLine(self):
        return self._loc['line']

    def getColumn(self):
        return self._loc['col']

    def getCategory(self):
        return self._data['category']

    def getDescription(self):
        return self._data['description']

    def getIssueIdentifier(self):
        """Return the string used to match this diagnostic across runs.

        The identifier has the form "<file>+[<issue_context>+][<issue_hash>]",
        where the bracketed parts appear only if present in the raw data.
        """
        identifier = self.getFileName() + "+"
        if 'issue_context' in self._data:
            identifier += self._data['issue_context'] + "+"
        if 'issue_hash' in self._data:
            identifier += str(self._data['issue_hash'])
        return identifier

    def getReport(self):
        """Return the path of the HTML report, or " " if there is none."""
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        """Return "<file>:<line>:<col>, <category>: <description>"."""
        return '%s:%d:%d, %s: %s' % (self.getFileName(), self.getLine(),
                                     self.getColumn(), self.getCategory(),
                                     self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        return self._data

class multidict(object):
    """A mapping from each key to the list of all values assigned to it.

    Assigning to a key appends to that key's list instead of replacing it;
    reading a key returns the whole list.
    """

    def __init__(self, elts=()):
        # elts is an iterable of (key, value) pairs.
        self.data = {}
        for key, value in elts:
            self[key] = value

    def __getitem__(self, item):
        return self.data[item]

    def __setitem__(self, key, value):
        # Append to the existing list, creating it on first use of the key.
        self.data.setdefault(key, []).append(value)

    def items(self):
        return self.data.items()

    def values(self):
        return self.data.values()

    def keys(self):
        return self.data.keys()

    def __len__(self):
        # Number of distinct keys, not the number of stored values.
        return len(self.data)

    def get(self, key, default=None):
        return self.data.get(key, default)

class CmpOptions(object):
    """Options for comparing two runs.

    verboseLog - path of the verbose log to write, or None for no log
    rootA      - prefix to ignore on source file names of the first run
    rootB      - prefix to ignore on source file names of the second run
    """

    def __init__(self, verboseLog=None, rootA="", rootB=""):
        self.rootA = rootA
        self.rootB = rootB
        self.verboseLog = verboseLog

class AnalysisReport(object):
    """The contents of one report file belonging to an analysis run.

    run         - the run this report belongs to
    files       - the report's 'files' entry; diagnostics index into it
    diagnostics - the diagnostics of this report, initially empty
    """

    def __init__(self, run, files):
        self.run = run
        self.files = files
        self.diagnostics = []

class AnalysisRun(object):
    """The accumulated results of one analyzer run.

    info is the SingleRunInfo-like object describing the run; its 'path'
    and 'root' attributes are copied onto the run.
    """

    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None

    def getClangVersion(self):
        """Return the clang version seen in the reports, or None."""
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        """Load the plist report at path p and add its contents to the run.

        A report whose 'files' entry is empty contributes nothing; the file
        is additionally removed from disk when deleteEmpty is true.
        """
        # plistlib.readPlist was removed in Python 3.9; fall back to
        # plistlib.load where it is not available.
        if hasattr(plistlib, 'readPlist'):
            data = plistlib.readPlist(p)
        else:
            with open(p, 'rb') as plistFile:
                data = plistlib.load(plistFile)

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        version = data.pop('clang_version', None)
        if version is not None and self.clang_version is None:
            self.clang_version = version

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        rawDiagnostics = data.pop('diagnostics')

        # Extract the HTML reports, if they exist. The first diagnostic
        # decides for the whole file; a file that lists source files but no
        # diagnostics simply has no HTML reports.
        if rawDiagnostics and 'HTMLDiagnostics_files' in rawDiagnostics[0]:
            htmlFiles = []
            for d in rawDiagnostics:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(rawDiagnostics)

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(rawDiagnostics, htmlFiles)]

        # Every top-level key of the report must have been consumed above.
        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)


def loadResults(path, opts, root="", deleteEmpty=True):
    """Backward compatibility API for loading the results of one run.

    Builds a SingleRunInfo from path, root and opts.verboseLog and hands it
    to loadResultsFromSingleRun, returning that function's result.
    """
    info = SingleRunInfo(path, root, opts.verboseLog)
    return loadResultsFromSingleRun(info, deleteEmpty)

def loadResultsFromSingleRun(info, deleteEmpty=True):
    """Load the results of the analysis from a given output location.

    - info is the SingleRunInfo object
    - deleteEmpty specifies if the empty plist files should be deleted

    If info.path names a file, only that file is read; otherwise the
    directory tree below it is walked and every file whose name ends in
    'plist' is read. Returns the populated AnalysisRun.
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.readSingleFile(path, deleteEmpty)
        return run

    for dirpath, _dirnames, filenames in os.walk(path):
        for f in filenames:
            if f.endswith('plist'):
                run.readSingleFile(os.path.join(dirpath, f), deleteEmpty)

    return run

def cmpAnalysisDiagnostic(d):
    """Sort key for a diagnostic: its issue identifier."""
    return d.getIssueIdentifier()

def compareResults(A, B):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of triples (a, b, confidence) where
    each element {a,b} is None or an element from the respective run, and
    confidence is a measure of the match quality (where 0 indicates equality,
    and None is used if either element is None).

    Matched pairs come first, followed by the diagnostics found only in A
    and then those found only in B. Neither run is modified.
    """

    def sortedByIdentifier(run):
        # Compute each issue identifier once and keep it next to its
        # diagnostic, instead of rebuilding it on every comparison.
        pairs = [(d.getIssueIdentifier(), d) for d in run.diagnostics]
        pairs.sort(key=lambda pair: pair[0])
        return pairs

    res = []

    # Quickly eliminate equal elements: walk both sorted lists from their
    # largest identifier downwards, pairing up equal identifiers.
    neqA = []
    neqB = []
    eltsA = sortedByIdentifier(A)
    eltsB = sortedByIdentifier(B)
    while eltsA and eltsB:
        idA, a = eltsA[-1]
        idB, b = eltsB[-1]
        if idA == idB:
            eltsA.pop()
            eltsB.pop()
            res.append((a, b, 0))
        elif idA > idB:
            # Nothing left in B can match a; b stays for the next round.
            eltsA.pop()
            neqA.append(a)
        else:
            # Nothing left in A can match b; a stays for the next round.
            eltsB.pop()
            neqB.append(b)
    neqA.extend([d for _, d in eltsA])
    neqB.extend([d for _, d in eltsB])

    # FIXME: Add fuzzy matching. One simple and possible effective idea would be
    # to bin the diagnostics, print them in a normalized form (based solely on
    # the structure of the diagnostic), compute the diff, then use that as the
    # basis for matching. This has the nice property that we don't depend in any
    # way on the diagnostic format.

    for a in neqA:
        res.append((a, None, None))
    for b in neqB:
        res.append((None, b, None))

    return res

def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True):
    """Print the differences between the runs stored in dirA and dirB.

    - opts supplies rootA, rootB and verboseLog (see CmpOptions)
    - deleteEmpty is forwarded to loadResults for both runs

    Each added, removed or changed report is printed to standard output,
    followed by the totals; when opts.verboseLog is set the same events are
    also written to that file. Returns the number of differences found.
    """
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)

    # Open the verbose log, if given.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "wb")
    else:
        auxLog = None

    def writeLog(text):
        # The log is opened in binary mode, so write encoded bytes. Under
        # Python 2 every entry is pure ASCII (all dynamic parts are
        # formatted with %r), which makes this byte-for-byte what
        # "print >>auxLog, text" produced.
        auxLog.write((text + "\n").encode("utf-8"))

    try:
        diff = compareResults(resultsA, resultsB)
        foundDiffs = 0
        for a, b, confidence in diff:
            if a is None:
                print("ADDED: %r" % b.getReadableName())
                foundDiffs += 1
                if auxLog:
                    writeLog("('ADDED', %r, %r)" % (b.getReadableName(),
                                                    b.getReport()))
            elif b is None:
                print("REMOVED: %r" % a.getReadableName())
                foundDiffs += 1
                if auxLog:
                    writeLog("('REMOVED', %r, %r)" % (a.getReadableName(),
                                                      a.getReport()))
            elif confidence:
                print("CHANGED: %r to %r" % (a.getReadableName(),
                                             b.getReadableName()))
                foundDiffs += 1
                if auxLog:
                    writeLog("('CHANGED', %r, %r, %r, %r)"
                             % (a.getReadableName(),
                                b.getReadableName(),
                                a.getReport(),
                                b.getReport()))
            # Otherwise the pair matched exactly and is not a difference.

        TotalReports = len(resultsB.diagnostics)
        print("TOTAL REPORTS: %r" % TotalReports)
        print("TOTAL DIFFERENCES: %r" % foundDiffs)
        if auxLog:
            writeLog("('TOTAL NEW REPORTS', %r)" % TotalReports)
            writeLog("('TOTAL DIFFERENCES', %r)" % foundDiffs)
    finally:
        # Close the log even if comparing or printing fails.
        if auxLog:
            auxLog.close()

    return foundDiffs

def main():
    """Command-line entry point.

    Expects exactly two positional arguments (the two result locations)
    and prints their differences via dumpScanBuildResultsDiff.
    """
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("", "--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("", "--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG [default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    (opts, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("invalid number of arguments")

    dirA, dirB = args

    dumpScanBuildResultsDiff(dirA, dirB, opts)

# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()