1#!/usr/bin/env python
2# Copyright (C) 2010 Google Inc. All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions
6# are met:
7#
8# 1.  Redistributions of source code must retain the above copyright
9#     notice, this list of conditions and the following disclaimer.
10# 2.  Redistributions in binary form must reproduce the above copyright
11#     notice, this list of conditions and the following disclaimer in the
12#     documentation and/or other materials provided with the distribution.
13#
14# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
15# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
18# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
25"""deduplicate_tests -- lists duplicated between platforms.
26
27If platform/mac-leopard is missing an expected test output, we fall back on
28platform/mac.  This means it's possible to grow redundant test outputs,
29where we have the same expected data in both a platform directory and another
30platform it falls back on.
31"""
32
33import collections
34import fnmatch
35import os
36import subprocess
37import sys
38import re
39
40from webkitpy.common.checkout import scm
41from webkitpy.common.system import executive
42from webkitpy.common.system import logutils
43from webkitpy.common.system import ospath
44from webkitpy.layout_tests.port import factory as port_factory
45
# Module-wide logger, obtained from logutils using this file's path.
_log = logutils.get_logger(__file__)

# Pseudo-platform name used for results that are not under a platform/
# directory; port_fallbacks() puts it at the end of every fallback chain.
_BASE_PLATFORM = 'base'
49
50
def port_fallbacks():
    """Build the fallback chain of every known port.

    Each port's baseline search path is reduced to directory basenames, its
    first entry is dropped (presumably the port's own directory -- confirm
    against the port implementations) and 'base' is added at the end.  A port
    that does not implement baseline_search_path() is reported through the
    module logger and falls back on 'base' only.

    Returns:
        A dictionary mapping platform name to a list of other platforms to
        fall back on.  'base' itself maps to an empty list.
    """
    chains = {_BASE_PLATFORM: []}
    for name in port_factory.all_port_names():
        try:
            search_path = port_factory.get(name).baseline_search_path()
        except NotImplementedError:
            _log.error("'%s' lacks baseline_search_path(), please fix."
                       % name)
            inherited = []
        else:
            inherited = [os.path.basename(entry) for entry in search_path][1:]
        # Every chain ends with the base platform.
        chains[name] = inherited + [_BASE_PLATFORM]
    return chains
70
71
def parse_git_output(git_output, glob_pattern):
    """Group the files listed by git ls-tree by test name and content hash.

    Parsing stops at the first empty line.  Each line is expected to look
    like '<mode> <type> <hash>\\t<path>', with <path> starting with
    'LayoutTests/'.

    Args:
        git_output: result of git ls-tree -r HEAD LayoutTests.
        glob_pattern: a pattern to filter the files; it is matched against
            the full path, including the leading 'LayoutTests/'.
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths.
        The paths are relative to LayoutTests; the test name is the path
        with any leading 'platform/<name>/' removed.
    """
    prefix_length = len('LayoutTests/')
    clusters = collections.defaultdict(set)
    for raw_line in git_output.split('\n'):
        if not raw_line:
            # An empty line marks the end of the listing.
            break
        attrs, full_path = raw_line.strip().split('\t')
        if fnmatch.fnmatch(full_path, glob_pattern):
            relative_path = full_path[prefix_length:]
            # Group 2 is whatever follows the optional platform directory.
            test_name = re.match(r'^(platform/.*?/)?(.*)',
                                 relative_path).group(2)
            _mode, _kind, digest = attrs.split(' ')
            clusters[(test_name, digest)].add(relative_path)
    return clusters
93
94
def cluster_file_hashes(glob_pattern):
    """Get the hashes of all the test expectations in the tree.

    We cheat and use git's hashes: 'git ls-tree -r HEAD LayoutTests' is run
    from the checkout root and its output is handed to parse_git_output().

    Args:
        glob_pattern: a pattern to filter the files.
    Returns:
        A dictionary mapping (test name, hash of content) => set of paths,
        as built by parse_git_output().
    Raises:
        SystemExit: if an OSError with errno ENOENT is raised while running
            git, i.e. the git executable could not be found.
        OSError: any other OSError raised while running git is propagated.
    """
    # Imported here so the function does not rely on a module-level import.
    import errno

    cmd = ('git', 'ls-tree', '-r', 'HEAD', 'LayoutTests')
    try:
        git_output = executive.Executive().run_command(cmd,
            cwd=scm.find_checkout_root())
    except OSError as e:
        if e.errno == errno.ENOENT:  # No such file or directory.
            _log.error("Error: 'No such file' when running git.")
            _log.error("This script requires git.")
            sys.exit(1)
        # A bare raise keeps the original traceback intact.
        raise
    return parse_git_output(git_output, glob_pattern)
119
120
def dirname_to_platform(dirname):
    """Translate a platform directory name into a platform name.

    The generic chromium directories ('chromium-linux', 'chromium-win' and
    'chromium-mac') are mapped to one specific platform each; every other
    name is returned unchanged.

    Args:
        dirname: name of a directory below LayoutTests/platform.
    Returns:
        The platform name corresponding to dirname.
    """
    aliases = (
        ('chromium-linux', 'chromium-linux-x86'),
        ('chromium-win', 'chromium-win-win7'),
        ('chromium-mac', 'chromium-mac-snowleopard'),
    )
    for directory, platform in aliases:
        if dirname == directory:
            return platform
    return dirname
129
def extract_platforms(paths):
    """Work out which platform each path belongs to.

    A path starting with 'platform/<dirname>/' belongs to the platform that
    dirname_to_platform() returns for <dirname>; any other path belongs to
    the base platform.

    Args:
        paths: an iterable of paths relative to LayoutTests.
    Returns:
        A dictionary mapping platform name => path.  When several paths
        belong to the same platform, the one seen last is kept.
    """
    def platform_of(path):
        found = re.match(r'^platform/(.*?)/', path)
        if found is None:
            return _BASE_PLATFORM
        return dirname_to_platform(found.group(1))

    # dict() keeps the last pair for a repeated key, like repeated assignment.
    return dict((platform_of(path), path) for path in paths)
146
147
148def has_intermediate_results(test, fallbacks, matching_platform,
149                             path_exists=os.path.exists):
150    """Returns True if there is a test result that causes us to not delete
151    this duplicate.
152
153    For example, chromium-linux may be a duplicate of the checked in result,
154    but chromium-win may have a different result checked in.  In this case,
155    we need to keep the duplicate results.
156
157    Args:
158        test: The test name.
159        fallbacks: A list of platforms we fall back on.
160        matching_platform: The platform that we found the duplicate test
161            result.  We can stop checking here.
162        path_exists: Optional parameter that allows us to stub out
163            os.path.exists for testing.
164    """
165    for dirname in fallbacks:
166        platform = dirname_to_platform(dirname)
167        if platform == matching_platform:
168            return False
169        test_path = os.path.join('LayoutTests', 'platform', dirname, test)
170        if path_exists(test_path):
171            return True
172    return False
173
174
def get_relative_test_path(filename, relative_to, checkout_root=None):
    """Constructs a relative path to |filename| from |relative_to|.

    Args:
        filename: The test file we're trying to get a relative path to,
            given relative to the LayoutTests directory.
        relative_to: The absolute path we're relative to.
        checkout_root: Optional root of the checkout that contains
            LayoutTests.  When omitted (None), scm.find_checkout_root() is
            called each time this function runs, rather than once when the
            module is imported.
    Returns:
        The result of ospath.relpath() for the absolute path of |filename|
        and |relative_to|: a relative path to filename, or (per the original
        contract; confirm against ospath.relpath) None if |filename| is not
        below |relative_to|.
    """
    if checkout_root is None:
        # Resolved lazily: a default argument would be evaluated only once,
        # at import time.
        checkout_root = scm.find_checkout_root()
    layout_test_dir = os.path.join(checkout_root, 'LayoutTests')
    abs_path = os.path.join(layout_test_dir, filename)
    return ospath.relpath(abs_path, relative_to)
188
189
def find_dups(hashes, port_fallbacks, relative_to):
    """Yields info about redundant test expectations.

    A result is reported when a platform that is a known port holds the same
    content as one of the platforms it falls back on, and no other result
    for the test sits between the two in the fallback chain.

    Args:
        hashes: a dictionary mapping (test name, hash of content) => paths,
            as returned by cluster_file_hashes.
        port_fallbacks: a dictionary mapping platform name => list of
            fallback directory names, as returned by the module-level
            port_fallbacks() (which this parameter shadows).
        relative_to: the directory that we want the results relative to
    Yields:
        One dictionary per redundant result, with the keys:
            'test': the test name.
            'platform': the platform holding the redundant result.
            'fallback': the fallback directory name it duplicates.
            'path': the path of the redundant file, relative to
                |relative_to|.
    """
    # The set of port names does not depend on the cluster, so fetch it once
    # instead of once per platform of every cluster.
    known_ports = frozenset(port_factory.all_port_names())

    for (test, _content_hash), cluster in hashes.items():
        if len(cluster) < 2:
            continue  # Common case: only one file with that hash.

        # Compute the list of platforms we have this particular hash for.
        platforms = extract_platforms(cluster)
        if len(platforms) == 1:
            continue

        # See if any of the platforms are redundant with each other.
        for platform in platforms:
            if platform not in known_ports:
                continue
            fallback_dirs = port_fallbacks[platform]
            for dirname in fallback_dirs:
                fallback = dirname_to_platform(dirname)
                if fallback not in platforms:
                    continue
                # We have to verify that there isn't an intermediate result
                # that causes this duplicate hash to exist.
                if has_intermediate_results(test, fallback_dirs, fallback):
                    continue
                # We print the relative path so it's easy to pipe the results
                # to xargs rm.
                path = get_relative_test_path(platforms[platform], relative_to)
                if not path:
                    continue
                yield {
                    'test': test,
                    'platform': platform,
                    'fallback': dirname,
                    'path': path,
                }
233
234
def deduplicate(glob_pattern):
    """Collects the duplicated expectation files found in LayoutTests.

    The port fallback chains are computed first, then the files known to git
    are clustered by content hash, and finally the redundant ones are looked
    up, with their paths expressed relative to the current working directory.

    Args:
        glob_pattern: glob pattern to filter the files in LayoutTests.
    Returns:
        a list of dictionaries, each containing test, path, platform and
        fallback.
    """
    fallback_chains = port_fallbacks()
    clustered = cluster_file_hashes(glob_pattern)
    duplicates = find_dups(clustered, fallback_chains, os.getcwd())
    return [duplicate for duplicate in duplicates]
245