1c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Copyright 2013 The Chromium Authors. All rights reserved. 2c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Use of this source code is governed by a BSD-style license that can be 3c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# found in the LICENSE file. 4c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from collections import defaultdict 6ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochimport posixpath 7ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch 8effb81e5f8246d0db0270817048dc992db66e9fbBen Murdochfrom future import Future 95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from path_util import SplitParent 105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from special_paths import SITE_VERIFICATION_FILE 11f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 12c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdochdef _Normalize(file_name, splittext=False): 13c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch normalized = file_name 14c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch if splittext: 15c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch normalized = posixpath.splitext(file_name)[0] 16c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch normalized = normalized.replace('.', '').replace('-', '').replace('_', '') 17c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch return normalized.lower() 18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch 19010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)def _CommonNormalizedPrefix(first_file, second_file): 20010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) return posixpath.commonprefix((_Normalize(first_file), 21010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) _Normalize(second_file))) 22010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) 23f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles) 24c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class PathCanonicalizer(object): 255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) '''Transforms paths into their canonical forms. Since the docserver has had 26c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) many incarnations - e.g. there didn't use to be apps/ - there may be old 27c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) paths lying around the webs. We try to redirect those to where they are now. 28c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) ''' 295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) def __init__(self, 305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) file_system, 315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) object_store_creator, 325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) strip_extensions): 335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # |strip_extensions| is a list of file extensions (e.g. .html) that should 345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # be stripped for a path's canonical form. 355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) self._cache = object_store_creator.Create( 365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) PathCanonicalizer, category=file_system.GetIdentity()) 375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) self._file_system = file_system 385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) self._strip_extensions = strip_extensions 395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) def _LoadCache(self): 411320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci def load(cached): 425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # |canonical_paths| is the pre-calculated set of canonical paths. 435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # |simplified_paths_map| is a lazily populated mapping of simplified file 445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # names to a list of full paths that contain them. For example, 455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # - browseraction: [extensions/browserAction.html] 465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # - storage: [apps/storage.html, extensions/storage.html] 475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_paths, simplified_paths_map = ( 485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) cached.get('canonical_paths'), cached.get('simplified_paths_map')) 495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if canonical_paths is None: 515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) assert simplified_paths_map is None 525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_paths = set() 535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) simplified_paths_map = defaultdict(list) 545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for base, dirs, files in self._file_system.Walk(''): 555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for path in dirs + files: 565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) path_without_ext, ext = posixpath.splitext(path) 575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_path = posixpath.join(base, path_without_ext) 585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if (ext not in self._strip_extensions or 595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) path == SITE_VERIFICATION_FILE): 605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_path += ext 615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_paths.add(canonical_path) 62c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch simplified_paths_map[_Normalize(path, splittext=True)].append( 63c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch canonical_path) 645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # Store |simplified_paths_map| sorted. Ties in length are broken by 655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # taking the shortest, lexicographically smallest path. 665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for path_list in simplified_paths_map.itervalues(): 675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) path_list.sort(key=lambda p: (len(p), p)) 685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) self._cache.SetMulti({ 695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 'canonical_paths': canonical_paths, 705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 'simplified_paths_map': simplified_paths_map, 715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) }) 725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) else: 735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) assert simplified_paths_map is not None 745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return canonical_paths, simplified_paths_map 761320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci return self._cache.GetMulti(('canonical_paths', 771320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 'simplified_paths_map')).Then(load) 785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 79c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 80c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) def Canonicalize(self, path): 815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) '''Returns the canonical path for |path|. 82ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch ''' 835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) canonical_paths, simplified_paths_map = self._LoadCache().Get() 845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # Path may already be the canonical path. 865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if path in canonical_paths: 875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return path 885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # Path not found. Our single heuristic: find |base| in the directory 905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # structure with the longest common prefix of |path|. 915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) _, base = SplitParent(path) 92c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 93c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch # Paths with a non-extension dot separator lose information in 94c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch # _SimplifyFileName, so we try paths both with and without the dot to 95c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch # maximize the possibility of finding the right path. 96c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch potential_paths = ( 97c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch simplified_paths_map.get(_Normalize(base), []) + 98c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch simplified_paths_map.get(_Normalize(base, splittext=True), [])) 99c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch 100c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch if potential_paths == []: 1015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # There is no file with anything close to that name. 1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return path 1035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # The most likely canonical file is the one with the longest common prefix 1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) # with |path|. This is slightly weaker than it could be; |path| is 106c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch # compared without symbols, not the simplified form of |path|, 107c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch # which may matter. 1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) max_prefix = potential_paths[0] 109010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) max_prefix_length = len(_CommonNormalizedPrefix(max_prefix, path)) 1105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for path_for_file in potential_paths[1:]: 111010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles) prefix_length = len(_CommonNormalizedPrefix(path_for_file, path)) 1125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) if prefix_length > max_prefix_length: 1135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) max_prefix, max_prefix_length = path_for_file, prefix_length 1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return max_prefix 1165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 1171320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci def Refresh(self): 1185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) return self._LoadCache() 119