path_canonicalizer.py revision 010d83a9304c5a91596085d917d248abff47903a
1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from collections import defaultdict
6import posixpath
7
8from future import Future
9from path_util import SplitParent
10from special_paths import SITE_VERIFICATION_FILE
11
def _Normalize(file_name, splittext=False):
  '''Returns |file_name| lower-cased with every '.', '-' and '_' removed.

  If |splittext| is True the extension (as determined by posixpath.splitext)
  is dropped before the separators are removed; otherwise the extension's
  characters are kept and only its dot disappears.
  '''
  stem = posixpath.splitext(file_name)[0] if splittext else file_name
  # Drop the separator characters in a single pass rather than one replace()
  # per character.
  return ''.join(ch for ch in stem if ch not in '.-_').lower()
18
def _CommonNormalizedPrefix(first_file, second_file):
  '''Returns the longest leading string shared by the normalized forms of
  |first_file| and |second_file|. Neither name has its extension split off
  before normalizing.
  '''
  normalized_names = [_Normalize(name) for name in (first_file, second_file)]
  return posixpath.commonprefix(normalized_names)
22
23
class PathCanonicalizer(object):
  '''Transforms paths into their canonical forms. Since the docserver has had
  many incarnations - e.g. there didn't use to be apps/ - there may be old
  paths lying around the webs. We try to redirect those to where they are now.
  '''
  def __init__(self,
               file_system,
               object_store_creator,
               strip_extensions):
    '''|file_system| is walked to discover every canonical path.
    |object_store_creator| creates the object store used to cache the
    computed paths; the store's category is |file_system|'s identity.
    |strip_extensions| is a list of file extensions (e.g. .html) that should
    be stripped for a path's canonical form.
    '''
    self._cache = object_store_creator.Create(
        PathCanonicalizer, category=file_system.GetIdentity())
    self._file_system = file_system
    self._strip_extensions = strip_extensions

  def _LoadCache(self):
    '''Returns a Future of the tuple (canonical_paths, simplified_paths_map),
    read from the cache when both entries are present and otherwise rebuilt
    from the file system and written back to the cache.
    '''
    cached_future = self._cache.GetMulti(('canonical_paths',
                                          'simplified_paths_map'))

    def resolve():
      # |canonical_paths| is the pre-calculated set of canonical paths.
      # |simplified_paths_map| maps each simplified file name to the sorted
      # list of canonical paths whose final component simplifies to it. For
      # example, when .html is one of the stripped extensions:
      #  - browseraction: [extensions/browserAction]
      #  - storage: [apps/storage, extensions/storage]
      cached = cached_future.Get()
      canonical_paths = cached.get('canonical_paths')
      simplified_paths_map = cached.get('simplified_paths_map')

      # The two entries are only usable together. If either one is missing
      # (nothing cached yet, or only one of the keys came back) rebuild both
      # rather than relying on an assert, which is stripped under -O and
      # would let a None leak out to Canonicalize.
      if canonical_paths is not None and simplified_paths_map is not None:
        return canonical_paths, simplified_paths_map

      canonical_paths = set()
      simplified_paths_map = defaultdict(list)
      for base, dirs, files in self._file_system.Walk(''):
        for path in dirs + files:
          path_without_ext, ext = posixpath.splitext(path)
          canonical_path = posixpath.join(base, path_without_ext)
          # Only extensions listed in |strip_extensions| are dropped, and
          # never for the site verification file.
          if (ext not in self._strip_extensions or
              path == SITE_VERIFICATION_FILE):
            canonical_path += ext
          canonical_paths.add(canonical_path)
          simplified_paths_map[_Normalize(path, splittext=True)].append(
              canonical_path)
      # Store |simplified_paths_map| sorted: shortest path first, with ties
      # in length broken by taking the lexicographically smallest path.
      # values() rather than itervalues() so this runs on Python 2 and 3.
      for path_list in simplified_paths_map.values():
        path_list.sort(key=lambda p: (len(p), p))
      self._cache.SetMulti({
        'canonical_paths': canonical_paths,
        'simplified_paths_map': simplified_paths_map,
      })

      return canonical_paths, simplified_paths_map

    return Future(callback=resolve)

  def Canonicalize(self, path):
    '''Returns the canonical path for |path|.

    If |path| is already canonical it is returned as is. Otherwise the
    canonical paths whose file name resembles the last component of |path|
    are considered, and the one sharing the longest normalized prefix with
    |path| is returned. If no file name resembles it, |path| is returned
    unchanged.
    '''
    canonical_paths, simplified_paths_map = self._LoadCache().Get()

    # Path may already be the canonical path.
    if path in canonical_paths:
      return path

    # Path not found. Our single heuristic: find |base| in the directory
    # structure with the longest common prefix of |path|.
    _, base = SplitParent(path)

    # Paths with a non-extension dot separator lose information in
    # _Normalize when the extension is split off, so we try paths both with
    # and without the dot to maximize the possibility of finding the right
    # path.
    normalized_base = _Normalize(base)
    normalized_stem = _Normalize(base, splittext=True)
    # Copy so that the cached list is never mutated.
    potential_paths = list(simplified_paths_map.get(normalized_base, []))
    # When |base| has no extension both keys are identical; looking the key
    # up a second time would only duplicate every candidate.
    if normalized_stem != normalized_base:
      potential_paths.extend(simplified_paths_map.get(normalized_stem, []))

    if not potential_paths:
      # There is no file with anything close to that name.
      return path

    # The most likely canonical file is the one with the longest common prefix
    # with |path|. This is slightly weaker than it could be; |path| is
    # compared without symbols, not the simplified form of |path|,
    # which may matter.
    # max() returns the first maximal item, so ties go to the earliest
    # candidate, i.e. the shortest, lexicographically smallest path.
    return max(
        potential_paths,
        key=lambda candidate: len(_CommonNormalizedPrefix(candidate, path)))

  def Cron(self):
    '''Returns the Future from _LoadCache; resolving it rebuilds and stores
    the canonical paths if they are not already cached.
    '''
    return self._LoadCache()
122