1c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Copyright 2013 The Chromium Authors. All rights reserved.
2c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Use of this source code is governed by a BSD-style license that can be
3c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# found in the LICENSE file.
4c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from collections import defaultdict
6ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochimport posixpath
7ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
8effb81e5f8246d0db0270817048dc992db66e9fbBen Murdochfrom future import Future
95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from path_util import SplitParent
105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from special_paths import SITE_VERIFICATION_FILE
11f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
12c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdochdef _Normalize(file_name, splittext=False):
13c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch  normalized = file_name
14c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch  if splittext:
15c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    normalized = posixpath.splitext(file_name)[0]
16c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch  normalized = normalized.replace('.', '').replace('-', '').replace('_', '')
17c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch  return normalized.lower()
18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
19010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)def _CommonNormalizedPrefix(first_file, second_file):
20010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  return posixpath.commonprefix((_Normalize(first_file),
21010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)                                 _Normalize(second_file)))
22010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)
23f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
24c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class PathCanonicalizer(object):
255d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  '''Transforms paths into their canonical forms. Since the docserver has had
26c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  many incarnations - e.g. there didn't use to be apps/ - there may be old
27c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  paths lying around the webs. We try to redirect those to where they are now.
28c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  '''
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  def __init__(self,
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               file_system,
315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               object_store_creator,
325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               strip_extensions):
335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # |strip_extensions| is a list of file extensions (e.g. .html) that should
345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # be stripped for a path's canonical form.
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._cache = object_store_creator.Create(
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        PathCanonicalizer, category=file_system.GetIdentity())
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._file_system = file_system
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._strip_extensions = strip_extensions
395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  def _LoadCache(self):
411320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    def load(cached):
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # |canonical_paths| is the pre-calculated set of canonical paths.
435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # |simplified_paths_map| is a lazily populated mapping of simplified file
445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # names to a list of full paths that contain them. For example,
455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      #  - browseraction: [extensions/browserAction.html]
465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      #  - storage: [apps/storage.html, extensions/storage.html]
475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      canonical_paths, simplified_paths_map = (
485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          cached.get('canonical_paths'), cached.get('simplified_paths_map'))
495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if canonical_paths is None:
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        assert simplified_paths_map is None
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        canonical_paths = set()
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        simplified_paths_map = defaultdict(list)
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        for base, dirs, files in self._file_system.Walk(''):
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          for path in dirs + files:
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            path_without_ext, ext = posixpath.splitext(path)
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            canonical_path = posixpath.join(base, path_without_ext)
585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            if (ext not in self._strip_extensions or
595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                path == SITE_VERIFICATION_FILE):
605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)              canonical_path += ext
615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            canonical_paths.add(canonical_path)
62c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch            simplified_paths_map[_Normalize(path, splittext=True)].append(
63c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch                canonical_path)
645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        # Store |simplified_paths_map| sorted. Ties in length are broken by
655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        # taking the shortest, lexicographically smallest path.
665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        for path_list in simplified_paths_map.itervalues():
675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          path_list.sort(key=lambda p: (len(p), p))
685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        self._cache.SetMulti({
695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          'canonical_paths': canonical_paths,
705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          'simplified_paths_map': simplified_paths_map,
715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        })
725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      else:
735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        assert simplified_paths_map is not None
745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return canonical_paths, simplified_paths_map
761320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    return self._cache.GetMulti(('canonical_paths',
771320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci                                 'simplified_paths_map')).Then(load)
785d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
79c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
80c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  def Canonicalize(self, path):
815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    '''Returns the canonical path for |path|.
82ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch    '''
835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    canonical_paths, simplified_paths_map = self._LoadCache().Get()
845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # Path may already be the canonical path.
865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if path in canonical_paths:
875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return path
885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # Path not found. Our single heuristic: find |base| in the directory
905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # structure with the longest common prefix of |path|.
915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    _, base = SplitParent(path)
92c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch
93c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    # Paths with a non-extension dot separator lose information in
94c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    # _SimplifyFileName, so we try paths both with and without the dot to
95c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    # maximize the possibility of finding the right path.
96c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    potential_paths = (
97c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch        simplified_paths_map.get(_Normalize(base), []) +
98c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch        simplified_paths_map.get(_Normalize(base, splittext=True), []))
99c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch
100c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    if potential_paths == []:
1015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # There is no file with anything close to that name.
1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return path
1035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # The most likely canonical file is the one with the longest common prefix
1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # with |path|. This is slightly weaker than it could be; |path| is
106c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    # compared without symbols, not the simplified form of |path|,
107c5cede9ae108bb15f6b7a8aea21c7e1fefa2834cBen Murdoch    # which may matter.
1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    max_prefix = potential_paths[0]
109010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)    max_prefix_length = len(_CommonNormalizedPrefix(max_prefix, path))
1105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for path_for_file in potential_paths[1:]:
111010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)      prefix_length = len(_CommonNormalizedPrefix(path_for_file, path))
1125d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if prefix_length > max_prefix_length:
1135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        max_prefix, max_prefix_length = path_for_file, prefix_length
1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return max_prefix
1165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1171320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  def Refresh(self):
1185d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return self._LoadCache()
119