1"""Utilities for comparing files and directories.
2
3Classes:
4    dircmp
5
6Functions:
    cmp(f1, f2, shallow=1) -> bool
    cmpfiles(a, b, common, shallow=1) -> ([], [], [])
9
10"""
11
12import os
13import stat
14from itertools import ifilter, ifilterfalse, imap, izip
15
# Public names exported by a star-import of this module.
__all__ = ["cmp","dircmp","cmpfiles"]

# Results of earlier content comparisons made by cmp(): maps the pair
# (f1, f2) to (signature of f1, signature of f2, outcome).
_cache = {}
# Number of bytes _do_cmp() reads from each file per iteration.
BUFSIZE=8*1024
20
def cmp(f1, f2, shallow=1):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- If true (the default, 1), two files whose stat signatures
               (file type, size, modification time) are identical are
               taken to be equal without being read.  When the
               signatures differ but the sizes match, the contents are
               still compared.

    Return value:

    True if the files are the same, False otherwise.  False is also
    returned when either name does not refer to a regular file, or when
    the sizes differ.

    Errors raised by os.stat(), or while opening and reading the files,
    are not caught here and propagate to the caller.

    This function uses a cache for past comparisons and the results,
    with a cache invalidation mechanism relying on stale signatures.
    The cache is emptied once it holds more than 100 entries so that it
    cannot grow without bound in a long-running process.

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    if shallow and s1 == s2:
        return True
    # Files of different sizes can never have equal contents.
    if s1[1] != s2[1]:
        return False

    # A cached outcome is only valid while both signatures are unchanged.
    cached = _cache.get((f1, f2))
    if cached is not None and cached[:2] == (s1, s2):
        return cached[2]
    outcome = _do_cmp(f1, f2)
    if len(_cache) > 100:      # limit the maximum size of the cache
        _cache.clear()
    _cache[f1, f2] = s1, s2, outcome
    return outcome
57
def _sig(st):
    """Return the comparison signature of the stat result ``st``.

    The signature is the tuple (file type bits of st_mode, st_size,
    st_mtime).
    """
    file_type = stat.S_IFMT(st.st_mode)
    return (file_type, st.st_size, st.st_mtime)
62
def _do_cmp(f1, f2):
    """Return True if files ``f1`` and ``f2`` have identical contents.

    Both files are opened in binary mode and read in lockstep, BUFSIZE
    bytes at a time; the comparison stops at the first differing chunk.
    """
    chunk_size = BUFSIZE
    with open(f1, 'rb') as left, open(f2, 'rb') as right:
        left_chunk = left.read(chunk_size)
        right_chunk = right.read(chunk_size)
        while left_chunk == right_chunk:
            # Equal and empty: both files ended at the same point.
            if not left_chunk:
                return True
            left_chunk = left.read(chunk_size)
            right_chunk = right.read(chunk_size)
        return False
73
# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a,b,ignore=None,hide=None)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to ['RCS', 'CVS', 'tags'].
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes (each is computed on first access, see __getattr__):
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
     """

    def __init__(self, a, b, ignore=None, hide=None): # Initialize
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir] # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = ['RCS', 'CVS', 'tags'] # Names ignored in comparison
        else:
            self.ignore = ignore

    def phase0(self): # List both directories, minus hidden/ignored names
        self.left_list = _filter(os.listdir(self.left),
                                 self.hide+self.ignore)
        self.right_list = _filter(os.listdir(self.right),
                                  self.hide+self.ignore)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self): # Compute common names
        # Names are matched on their os.path.normcase() form, but the
        # names reported are the ones actually found in each directory.
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        # Real lists are required: report() sorts them in place.
        self.common = [a[key] for key in a if key in b]
        self.left_only = [a[key] for key in a if key not in b]
        self.right_only = [b[key] for key in b if key not in a]

    def phase2(self): # Distinguish files, directories, funnies
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            # Both sides are always stat()ed; a failure on either one
            # makes the name a "funny" case.
            ok = 1
            try:
                a_stat = os.stat(a_path)
            except os.error:
                ok = 0
            try:
                b_stat = os.stat(b_path)
            except os.error:
                ok = 0

            if ok:
                a_type = stat.S_IFMT(a_stat.st_mode)
                b_type = stat.S_IFMT(b_stat.st_mode)
                if a_type != b_type:
                    self.common_funny.append(x)
                elif stat.S_ISDIR(a_type):
                    self.common_dirs.append(x)
                elif stat.S_ISREG(a_type):
                    self.common_files.append(x)
                else:
                    self.common_funny.append(x)
            else:
                self.common_funny.append(x)

    def phase3(self): # Find out differences between common files
        xx = cmpfiles(self.left, self.right, self.common_files)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self): # Find out differences between common subdirectories
        # A new dircmp object is created for each common subdirectory,
        # these are stored in a dictionary indexed by filename.
        # The hide and ignore properties are inherited from the parent
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            self.subdirs[x]  = dircmp(a_x, b_x, self.ignore, self.hide)

    def phase4_closure(self): # Recursively call phase4() on subdirectories
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self): # Print a report on the differences between a and b
        # Output format is purposely lousy
        # Each print gets exactly one pre-formatted argument, so the line
        # means the same whether print is a statement or a function.
        print('diff %s %s' % (self.left, self.right))
        if self.left_only:
            self.left_only.sort()
            print('Only in %s : %s' % (self.left, self.left_only))
        if self.right_only:
            self.right_only.sort()
            print('Only in %s : %s' % (self.right, self.right_only))
        if self.same_files:
            self.same_files.sort()
            print('Identical files : %s' % (self.same_files,))
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files : %s' % (self.diff_files,))
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files : %s' % (self.funny_files,))
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories : %s' % (self.common_dirs,))
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases : %s' % (self.common_funny,))

    def report_partial_closure(self): # Print reports on self and on subdirs
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report()

    def report_full_closure(self): # Report on self and subdirs recursively
        self.report()
        for sd in self.subdirs.values():
            print('')
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that produces it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs = phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails, i.e. the attribute has
        # not been computed yet: run the phase that sets it, then read
        # the now-present instance attribute.
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)
239
def cmpfiles(a, b, common, shallow=1):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- passed on to the file comparison; if true, files whose
               stat() signatures match are taken to be equal

    Returns a tuple of three lists of names taken from common:
      files that compare equal
      files that are different
      files that could not be compared (for example, could not be
        stat()ed).

    """
    same, different, funny = [], [], []
    # _cmp() yields 0, 1 or 2, which selects the list for the name.
    buckets = (same, different, funny)
    for name in common:
        left_path = os.path.join(a, name)
        right_path = os.path.join(b, name)
        buckets[_cmp(left_path, right_path, shallow)].append(name)
    return buckets
259
260
# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, can't open or read, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    """Classify the comparison of files ``a`` and ``b`` as 0, 1 or 2.

    ``sh`` is passed to cmp() as its shallow flag.  ``abs`` and ``cmp``
    are default-argument bindings of the built-in abs() and this
    module's cmp(), captured when the function is defined.

    The result is False (0) when cmp() reports the files equal and
    True (1) when it reports them different; callers use it as an index.
    """
    try:
        return not abs(cmp(a, b, sh))
    # On Python 2 a failed open()/read() raises IOError, which is not a
    # subclass of os.error, so both must be listed for an unreadable
    # file to be reported as a funny case instead of aborting the caller.
    except (os.error, IOError):
        return 2
272
273
# Return a new list holding the items of flist that do not occur in skip.
#
def _filter(flist, skip):
    """Return the entries of ``flist``, in order, that are not in ``skip``."""
    return [entry for entry in flist if entry not in skip]
278
279
# Command-line demonstration and testing.
#
def demo():
    """Compare the two directories named on the command line.

    Exactly two directory arguments are required; getopt.GetoptError is
    raised otherwise.  With the -r option the report covers common
    subdirectories recursively, otherwise only the two directories
    themselves.
    """
    import getopt
    import sys
    opts, paths = getopt.getopt(sys.argv[1:], 'r')
    if len(paths) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    comparison = dircmp(paths[0], paths[1])
    recursive = ('-r', '') in opts
    if not recursive:
        comparison.report()
    else:
        comparison.report_full_closure()
293
# Run the command-line demonstration when executed as a script.
if __name__ == '__main__':
    demo()
296