path_canonicalizer.py revision effb81e5f8246d0db0270817048dc992db66e9fb
1c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Copyright 2013 The Chromium Authors. All rights reserved.
2c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# Use of this source code is governed by a BSD-style license that can be
3c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)# found in the LICENSE file.
4c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from collections import defaultdict
6ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochimport posixpath
7ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
8effb81e5f8246d0db0270817048dc992db66e9fbBen Murdochfrom future import Future
95d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from path_util import SplitParent
105d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from special_paths import SITE_VERIFICATION_FILE
11f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
12c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
13ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochdef _SimplifyFileName(file_name):
14ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch  return (posixpath.splitext(file_name)[0]
15ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch      .lower()
16ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch      .replace('.', '')
17ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch      .replace('-', '')
18ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch      .replace('_', ''))
19ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
20f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
21c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)class PathCanonicalizer(object):
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  '''Transforms paths into their canonical forms. Since the docserver has had
23c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  many incarnations - e.g. there didn't use to be apps/ - there may be old
24c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  paths lying around the webs. We try to redirect those to where they are now.
25c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  '''
265d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  def __init__(self,
275d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               file_system,
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               object_store_creator,
295d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)               strip_extensions):
305d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # |strip_extensions| is a list of file extensions (e.g. .html) that should
315d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # be stripped for a path's canonical form.
325d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._cache = object_store_creator.Create(
335d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        PathCanonicalizer, category=file_system.GetIdentity())
345d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._file_system = file_system
355d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    self._strip_extensions = strip_extensions
365d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
375d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  def _LoadCache(self):
385d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    cached_future = self._cache.GetMulti(('canonical_paths',
395d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                          'simplified_paths_map'))
405d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
415d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    def resolve():
425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # |canonical_paths| is the pre-calculated set of canonical paths.
435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # |simplified_paths_map| is a lazily populated mapping of simplified file
445d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # names to a list of full paths that contain them. For example,
455d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      #  - browseraction: [extensions/browserAction.html]
465d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      #  - storage: [apps/storage.html, extensions/storage.html]
475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      cached = cached_future.Get()
485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      canonical_paths, simplified_paths_map = (
495d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          cached.get('canonical_paths'), cached.get('simplified_paths_map'))
505d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
515d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if canonical_paths is None:
525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        assert simplified_paths_map is None
535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        canonical_paths = set()
545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        simplified_paths_map = defaultdict(list)
555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        for base, dirs, files in self._file_system.Walk(''):
565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          for path in dirs + files:
575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            path_without_ext, ext = posixpath.splitext(path)
585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            canonical_path = posixpath.join(base, path_without_ext)
595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            if (ext not in self._strip_extensions or
605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                path == SITE_VERIFICATION_FILE):
615d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)              canonical_path += ext
625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            canonical_paths.add(canonical_path)
635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)            simplified_paths_map[_SimplifyFileName(path)].append(canonical_path)
645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        # Store |simplified_paths_map| sorted. Ties in length are broken by
655d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        # taking the shortest, lexicographically smallest path.
665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        for path_list in simplified_paths_map.itervalues():
675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          path_list.sort(key=lambda p: (len(p), p))
685d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        self._cache.SetMulti({
695d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          'canonical_paths': canonical_paths,
705d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          'simplified_paths_map': simplified_paths_map,
715d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        })
725d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      else:
735d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        assert simplified_paths_map is not None
745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
755d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return canonical_paths, simplified_paths_map
765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
77effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch    return Future(callback=resolve)
78c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
79c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  def Canonicalize(self, path):
805d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    '''Returns the canonical path for |path|.
81ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch    '''
825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    canonical_paths, simplified_paths_map = self._LoadCache().Get()
835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
845d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # Path may already be the canonical path.
855d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if path in canonical_paths:
865d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return path
875d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
885d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # Path not found. Our single heuristic: find |base| in the directory
895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # structure with the longest common prefix of |path|.
905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    _, base = SplitParent(path)
915d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    potential_paths = simplified_paths_map.get(_SimplifyFileName(base))
925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    if not potential_paths:
935d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      # There is no file with anything close to that name.
945d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      return path
955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # The most likely canonical file is the one with the longest common prefix
975d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # with |path|. This is slightly weaker than it could be; |path| is
985d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    # compared, not the simplified form of |path|, which may matter.
995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    max_prefix = potential_paths[0]
1005d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    max_prefix_length = len(posixpath.commonprefix((max_prefix, path)))
1015d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    for path_for_file in potential_paths[1:]:
1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      prefix_length = len(posixpath.commonprefix((path_for_file, path)))
1035d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      if prefix_length > max_prefix_length:
1045d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)        max_prefix, max_prefix_length = path_for_file, prefix_length
1055d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1065d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return max_prefix
1075d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)
1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  def Cron(self):
1095d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)    return self._LoadCache()
110