# path_canonicalizer.py revision 010d83a9304c5a91596085d917d248abff47903a
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict
import posixpath

from future import Future
from path_util import SplitParent
from special_paths import SITE_VERIFICATION_FILE


def _Normalize(file_name, splittext=False):
  '''Returns |file_name| lower-cased with every '.', '-' and '_' removed.

  If |splittext| is True the extension (as determined by posixpath.splitext)
  is dropped before the symbols are removed; otherwise the extension's
  characters are kept and only its dot is removed.
  '''
  normalized = file_name
  if splittext:
    normalized = posixpath.splitext(file_name)[0]
  normalized = normalized.replace('.', '').replace('-', '').replace('_', '')
  return normalized.lower()


def _CommonNormalizedPrefix(first_file, second_file):
  '''Returns the longest common leading string of the normalized forms (see
  _Normalize) of |first_file| and |second_file|. The comparison is
  character-by-character, not per path component.
  '''
  return posixpath.commonprefix((_Normalize(first_file),
                                 _Normalize(second_file)))


class PathCanonicalizer(object):
  '''Transforms paths into their canonical forms. Since the docserver has had
  many incarnations - e.g. there didn't use to be apps/ - there may be old
  paths lying around the webs. We try to redirect those to where they are now.
  '''
  def __init__(self,
               file_system,
               object_store_creator,
               strip_extensions):
    '''|file_system| is walked to discover the canonical paths and its
    identity is used as the cache category. |object_store_creator| creates
    the object store that the computed paths are cached in.
    |strip_extensions| is a list of file extensions (e.g. .html) that should
    be stripped for a path's canonical form.
    '''
    self._cache = object_store_creator.Create(
        PathCanonicalizer, category=file_system.GetIdentity())
    self._file_system = file_system
    self._strip_extensions = strip_extensions

  def _LoadCache(self):
    '''Returns a Future resolving to (canonical_paths, simplified_paths_map).

    Both values are read from the cache if present; otherwise they are
    computed by walking the file system and then written back to the cache.
    '''
    cached_future = self._cache.GetMulti(('canonical_paths',
                                          'simplified_paths_map'))

    def resolve():
      # |canonical_paths| is the pre-calculated set of canonical paths.
      # |simplified_paths_map| is a mapping of simplified (normalized,
      # extension-less) file names to a list of full canonical paths whose
      # last component has that simplified name. For example,
      #  - browseraction: [extensions/browserAction.html]
      #  - storage:       [apps/storage.html, extensions/storage.html]
      cached = cached_future.Get()
      canonical_paths, simplified_paths_map = (
          cached.get('canonical_paths'), cached.get('simplified_paths_map'))

      if canonical_paths is None:
        # Both entries are always written together, so neither should exist
        # without the other.
        assert simplified_paths_map is None
        canonical_paths = set()
        simplified_paths_map = defaultdict(list)
        for base, dirs, files in self._file_system.Walk(''):
          for path in dirs + files:
            path_without_ext, ext = posixpath.splitext(path)
            canonical_path = posixpath.join(base, path_without_ext)
            # Keep the extension unless it is one we strip; the site
            # verification file always keeps its extension.
            if (ext not in self._strip_extensions or
                path == SITE_VERIFICATION_FILE):
              canonical_path += ext
            canonical_paths.add(canonical_path)
            simplified_paths_map[_Normalize(path, splittext=True)].append(
                canonical_path)
        # Store |simplified_paths_map| sorted: shortest path first, with ties
        # in length broken by taking the lexicographically smallest path.
        # values() (rather than itervalues()) works on both Python 2 and 3.
        for path_list in simplified_paths_map.values():
          path_list.sort(key=lambda p: (len(p), p))
        self._cache.SetMulti({
          'canonical_paths': canonical_paths,
          'simplified_paths_map': simplified_paths_map,
        })
      else:
        assert simplified_paths_map is not None

      return canonical_paths, simplified_paths_map

    return Future(callback=resolve)

  def Canonicalize(self, path):
    '''Returns the canonical path for |path|.

    If |path| is already canonical, or if no known file has a name resembling
    its last component, |path| is returned unchanged.
    '''
    canonical_paths, simplified_paths_map = self._LoadCache().Get()

    # Path may already be the canonical path.
    if path in canonical_paths:
      return path

    # Path not found. Our single heuristic: find |base| in the directory
    # structure with the longest common prefix of |path|.
    _, base = SplitParent(path)

    # Paths with a non-extension dot separator lose information in
    # _Normalize, so we try paths both with and without the dot to
    # maximize the possibility of finding the right path.
    potential_paths = (
        simplified_paths_map.get(_Normalize(base), []) +
        simplified_paths_map.get(_Normalize(base, splittext=True), []))

    if not potential_paths:
      # There is no file with anything close to that name.
      return path

    # The most likely canonical file is the one with the longest common prefix
    # with |path|. This is slightly weaker than it could be; |path| is
    # compared without symbols, not the simplified form of |path|,
    # which may matter.
    def common_prefix_length(candidate):
      return len(_CommonNormalizedPrefix(candidate, path))

    # max() returns the first maximal item, so ties keep the earliest
    # candidate, i.e. the pre-sorted order of |simplified_paths_map|'s lists.
    return max(potential_paths, key=common_prefix_length)

  def Cron(self):
    '''Returns the Future from _LoadCache(); resolving it populates the cache
    if it is empty.
    '''
    return self._LoadCache()